Example 1
def find_most_equal(data, class_list):
    """Return the key in data whose per-class standard deviations vary the least."""
    m_std = {}
    for matter in data.keys():
        m_stds = []
        for _class in class_list:
            c_tab = class_tab(data[matter], class_list[_class])
            m_stds.append(std(c_tab))
        m_std[matter] = std(m_stds)
    return min(m_std, key=m_std.get)
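All of these examples lean on a small std helper (sometimes utils.std, sometimes a bare std). As a point of reference, a minimal plain-Python sketch of such a helper might look like the following; the actual definition in each project may differ, for instance by using a sample (n-1) rather than a population denominator.

import math

def mean(values):
    # Arithmetic mean of a non-empty sequence of numbers.
    values = list(values)
    return sum(values) / float(len(values))

def std(values):
    # Population standard deviation of a non-empty sequence of numbers.
    values = list(values)
    m = mean(values)
    return math.sqrt(sum((x - m) ** 2 for x in values) / float(len(values)))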
Example 2
    def CrossValidation(self, cv_method=0, **args):
        """Select ncomp by the requested CV method"""
        validation = self.model["validation"].AsDataFrame()

        # method 0: select the fewest components with PRESS within 1 stdev of the least PRESS (by the bootstrap)
        if cv_method == 0:  # Use the bootstrap to find the standard deviation of the MSEP
            # Get the leave-one-out CV error from R:
            columns = min(self.num_predictors, self.ncomp_max)
            cv = array.array("d", validation["pred"].AsVector())
            rows = len(cv) // columns  # integer division so the slice bounds below are ints
            cc = []
            for k in range(int(columns)):
                b = k * rows
                e = b + rows
                cc.append(array.array("d", cv[b:e]))
            cv = cc

            # PRESS = map(lambda x: sum((cv[:,x]-self.array_actual)**2), range(cv.shape[1]))
            PRESS = [sum([(cv[i][j] - self.actual[j]) ** 2 for j in range(rows)]) for i in range(int(columns))]
            # ncomp = np.argmin(PRESS)
            ncomp = [i for i in range(len(PRESS)) if PRESS[i] == min(PRESS)][0]

            # cv_squared_error = (cv[:,ncomp]-self.array_actual)**2
            cv_squared_error = [(cv[ncomp][j] - self.actual[j]) ** 2 for j in range(int(rows))]
            sample_space = range(rows)

            PRESS_stdev = list()

            # Cache random number generator and int's constructor for a speed boost
            _random, _int = random.random, int

            for i in range(100):
                PRESS_bootstrap = list()

                for j in range(100):
                    PRESS_bootstrap.append(sum([cv_squared_error[_int(_random() * rows)] for _ in sample_space]))

                PRESS_stdev.append(utils.std(PRESS_bootstrap))

            med_stdev = utils.median(PRESS_stdev)

            # Maximum allowable PRESS is the minimum plus one standard deviation
            good_ncomp = [i for i in range(len(PRESS)) if PRESS[i] < min(PRESS) + med_stdev]
            self.ncomp = int(min(good_ncomp) + 1)

        # method 1: select the fewest components with PRESS less than the minimum plus 4% of the range
        if cv_method == 1:
            # PRESS stands for predicted error sum of squares
            PRESS0 = validation["PRESS0"][0]
            PRESS = list(validation["PRESS"])

            # the range is the difference between the greatest and least PRESS values
            PRESS_range = abs(PRESS0 - min(PRESS))

            # Maximum allowable PRESS is the minimum plus a fraction of the range.
            max_CV_error = min(PRESS) + PRESS_range / 25
            good_ncomp = [i for i in range(len(PRESS)) if PRESS[i] < max_CV_error]

            # choose the most parsimonious model that satisfies that criterion
            self.ncomp = int(min(good_ncomp) + 1)
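To make the method 1 rule concrete, here is a toy walk-through with hypothetical PRESS values (not from the source):

# Toy walk-through of the cv_method == 1 rule with hypothetical PRESS values.
PRESS0 = 100.0
PRESS = [60.0, 40.0, 30.0, 29.0, 28.5]            # PRESS for 1..5 components
PRESS_range = abs(PRESS0 - min(PRESS))            # 71.5
max_CV_error = min(PRESS) + PRESS_range / 25      # 28.5 + 2.86 = 31.36
good_ncomp = [i for i in range(len(PRESS)) if PRESS[i] < max_CV_error]  # [2, 3, 4]
ncomp = min(good_ncomp) + 1                       # 3 components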
Example 3
    def AssignWeights(self, method=0):
        #Weight the observations in the training set based on their distance from the threshold.
        obs = self.data_dictionary[self.target]
        deviation = [(obs[i]-self.regulatory_threshold) / utils.std(obs) for i in range(len(obs))]
        
        #Integer weighting: weight is the observation's rounded-up whole number of standard deviations from the threshold.
        if method == 1: 
            weights = [1 for k in range(len(deviation))]
            breaks = range(int(math.floor(min(deviation))), int(math.ceil(max(deviation))))

            for i in breaks:
                #find all observations that meet the upper and lower criteria, separately
                first_slice = [k for k in range(len(deviation)) if deviation[k] > i]
                second_slice = [k for k in range(len(deviation)) if deviation[k] < i+1]
                
                #now find all the observations that meet both criteria simultaneously
                rows = filter( lambda x: x in first_slice, second_slice )
                rows = [int(r) for r in rows]
                
                #Decide how many times to replicate each slice of data
                if i<0:
                    replicates = (abs(i) - 1)
                else:
                    replicates = i
                    
                for r in rows:
                    weights[r] = replicates + 1
                
        #Continuous weighting: weight is the observation's distance (in standard deviations) from the threshold.      
        elif method == 2:
            weights = [abs(x) for x in deviation]
            
        #No weights: all weights are one.
        else: weights = [1.0 for k in range(len(deviation))]
            
        return weights
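A tiny worked example of the integer weighting may help; the threshold and the value of utils.std(obs) below are hypothetical, chosen only to keep the arithmetic readable.

# Toy walk-through of method == 1 with hypothetical observations, a hypothetical
# regulatory threshold of 10.0, and utils.std(obs) taken to be 1.5 for clarity.
obs = [8.0, 9.5, 10.5, 12.7]
threshold = 10.0
s = 1.5
deviation = [(x - threshold) / s for x in obs]    # [-1.33, -0.33, 0.33, 1.8]
# The first and last observations lie more than one standard deviation from the
# threshold, so method 1 gives them weight 2; the middle two keep weight 1.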
Example 4
File: stats.py Project: stablum/soa
def snapshot():
    """ statistics and stuff (??) """
    global collector
    collector.total_weight = total_weight()
    collector.path_length = path_length()
    edges_importances = gi.get_all_edges_importances()
    collector.mean_edges_importance = utils.mean(edges_importances)
    collector.std_edges_importance = utils.std(edges_importances)
    print(collector.__dict__)
Example 5
    def fit(self, X):
        self._is_trained = True

        self.std = []
        self.mean = []
        self.count_feature = len(X[0])
        for feature_idx in range(self.count_feature):
            data = [x[feature_idx] for x in X]
            curr_mean = mean(data)
            curr_std = std(data)
            self.mean.append(curr_mean)
            self.std.append(curr_std)

        return self
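The class stores a per-feature mean and standard deviation, presumably for standardization. A transform counterpart is not shown in the snippet; a hypothetical sketch of one, under that assumption, could look like this:

    def transform(self, X):
        # Hypothetical counterpart to fit() above (not part of the original
        # snippet): standardize each feature with the stored mean and std,
        # mapping a feature to 0.0 when its standard deviation is zero.
        assert self._is_trained, "call fit() before transform()"
        return [
            [(x[j] - self.mean[j]) / self.std[j] if self.std[j] else 0.0
             for j in range(self.count_feature)]
            for x in X
        ]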
Example 6
    def GetInfluence(self):        
        #Get the covariate names
        self.names = list(self.data_dictionary.keys())
        self.names.remove(self.target)

        #Now get the model coefficients from R.
        coefficients = self.Extract('coef').AsVector()
        
        #Get the standard deviations (from the data_dictionary) and package the influence in a dictionary.
        raw_influence = list()
        
        for i in range( len(self.names) ):
            standard_deviation = utils.std( self.data_dictionary[self.names[i]] )
            raw_influence.append( float(abs(standard_deviation * coefficients[i+1])) )
 
        self.influence = dict( zip([float(x/sum(raw_influence)) for x in raw_influence], self.names) )
        return self.influence
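The influence of each covariate is its coefficient scaled by the covariate's standard deviation, normalized to sum to one. A toy walk-through with hypothetical numbers:

# Two covariates with standard deviations 2.0 and 0.5 and coefficients 1.5 and -4.0.
raw_influence = [abs(2.0 * 1.5), abs(0.5 * -4.0)]          # [3.0, 2.0]
shares = [x / sum(raw_influence) for x in raw_influence]   # [0.6, 0.4]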
Example 7
    def AssignWeights(self, method=0):
        #Weight the observations in the training set based on their distance from the threshold.
        obs = self.data_dictionary[self.target]
        deviation = [(obs[i] - self.regulatory_threshold) / utils.std(obs)
                     for i in range(len(obs))]

        #Integer weighting: weight is the observation's rounded-up whole number of standard deviations from the threshold.
        if method == 1:
            weights = [1 for k in range(len(deviation))]
            breaks = range(int(math.floor(min(deviation))),
                           int(math.ceil(max(deviation))))

            for i in breaks:
                #find all observations that meet the upper and lower criteria, separately
                first_slice = [
                    k for k in range(len(deviation)) if deviation[k] > i
                ]
                second_slice = [
                    k for k in range(len(deviation)) if deviation[k] < i + 1
                ]

                #now find all the observations that meet both criteria simultaneously
                rows = filter(lambda x: x in first_slice, second_slice)
                rows = [int(r) for r in rows]

                #Decide how many times to replicate each slice of data
                if i < 0:
                    replicates = (abs(i) - 1)
                else:
                    replicates = i

                for r in rows:
                    weights[r] = replicates + 1

        #Continuous weighting: weight is the observation's distance (in standard deviations) from the threshold.
        elif method == 2:
            weights = [abs(x) for x in deviation]

        #No weights: all weights are one.
        else:
            weights = [1.0 for k in range(len(deviation))]

        return weights
Example 8
    def GetInfluence(self):
        #Get the covariate names
        self.names = list(self.data_dictionary.keys())
        self.names.remove(self.target)

        #Now get the model coefficients from R.
        coefficients = self.Extract('coef').AsVector()

        #Get the standard deviations (from the data_dictionary) and package the influence in a dictionary.
        raw_influence = list()

        for i in range(len(self.names)):
            standard_deviation = utils.std(self.data_dictionary[self.names[i]])
            raw_influence.append(
                float(abs(standard_deviation * coefficients[i + 1])))

        self.influence = dict(
            zip([float(x / sum(raw_influence)) for x in raw_influence],
                self.names))
        return self.influence
Example 9
    def GetInfluence(self):
        #Get the model terms from R's model object
        terms = self.Extract('terms')
        terms = str(terms)
        
        #Get the covariate names
        self.names = list(self.data_dictionary.keys())
        self.names.remove(self.target)

        #Now get the model coefficients from R.
        coefficients = array.array('d', self.Extract('coef'))
        
        #Get the standard deviations (from the data_dictionary) and package the influence in a dictionary.
        raw_influence = list()
        
        for i in range( len(self.names) ):
            standard_deviation = utils.std(self.data_dictionary[self.names[i]])
            raw_influence.append(abs(standard_deviation * coefficients[i+1]))
            
        self.influence = dict(zip([raw_influence[k] / sum(raw_influence) for k in range(len(raw_influence))], self.names))
Example 10
    def GetInfluence(self):
        #Get the model terms from R's model object
        terms = self.Extract('terms')
        terms = str(terms)

        #Get the covariate names
        self.names = list(self.data_dictionary.keys())
        self.names.remove(self.target)

        #Now get the model coefficients from R.
        coefficients = array.array('d', self.Extract('coef'))

        #Get the standard deviations (from the data_dictionary) and package the influence in a dictionary.
        raw_influence = list()

        for i in range(len(self.names)):
            standard_deviation = utils.std(self.data_dictionary[self.names[i]])
            raw_influence.append(abs(standard_deviation * coefficients[i + 1]))

        self.influence = dict(
            zip([
                raw_influence[k] / sum(raw_influence)
                for k in range(len(raw_influence))
            ], self.names))
Example 11
def main():
    args = get_args()

    if not os.path.exists(args.out_dir):
        os.mkdir(args.out_dir)
    logfile = os.path.join(args.out_dir, 'output.log')
    if os.path.exists(logfile):
        os.remove(logfile)

    logging.basicConfig(format='[%(asctime)s] - %(message)s',
                        datefmt='%Y/%m/%d %H:%M:%S',
                        level=logging.INFO,
                        filename=logfile)
    logger.info(args)

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    epsilon = (args.epsilon / 255.) / std(args.dataset)
    alpha = (args.alpha / 255.) / std(args.dataset)
    pgd_alpha = (2 / 255.) / std(args.dataset)
    if args.cfg is not None:
        pruned_cfg = [
            int(x) for x in args.cfg.replace('[', '').replace(']', '').replace(
                ' ', '').split(',') if x.isnumeric()
        ]
        print(pruned_cfg)
    else:
        pruned_cfg = torch.load(args.cfg_dir)['cfg']
    if args.model == 'vgg':
        if args.dataset == 'cifar10':
            train_loader, test_loader = get_loaders(args.data_dir,
                                                    args.batch_size, 'cifar10')
            model = vgg(16, cfg=pruned_cfg, seed=0)
        elif args.dataset == 'cifar100':
            train_loader, test_loader = get_loaders(args.data_dir,
                                                    args.batch_size,
                                                    'cifar100')
            model = vgg(16, cfg=pruned_cfg, dataset='cifar100', seed=0)
    if args.model == 'resnet':
        if args.dataset == 'cifar10':
            train_loader, test_loader = get_loaders(args.data_dir,
                                                    args.batch_size, 'cifar10')
            model = resnet18(seed=0, cfg=pruned_cfg, num_classes=10)
        elif args.dataset == 'cifar100':
            train_loader, test_loader = get_loaders(args.data_dir,
                                                    args.batch_size,
                                                    'cifar100')
            model = resnet18(seed=0, cfg=pruned_cfg, num_classes=100)
    model.load_state_dict(torch.load(args.model_dir))
    model.cuda()
    model.train()

    opt = torch.optim.SGD(model.parameters(),
                          lr=args.lr_max,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    amp_args = dict(opt_level=args.opt_level,
                    loss_scale=args.loss_scale,
                    verbosity=False)
    if args.opt_level == 'O2':
        amp_args['master_weights'] = args.master_weights
    model, opt = amp.initialize(model, opt, **amp_args)
    criterion = nn.CrossEntropyLoss()

    if args.delta_init == 'previous':
        delta = torch.zeros(args.batch_size, 3, 32, 32).cuda()

    lr_steps = args.epochs * len(train_loader)
    if args.lr_schedule == 'cyclic':
        scheduler = torch.optim.lr_scheduler.CyclicLR(
            opt,
            base_lr=args.lr_min,
            max_lr=args.lr_max,
            step_size_up=lr_steps / 2,
            step_size_down=lr_steps / 2)
    elif args.lr_schedule == 'multistep':
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            opt, milestones=[lr_steps / 2, lr_steps * 3 / 4], gamma=0.1)

    # Training
    prev_robust_acc = 0.
    start_train_time = time.time()
    logger.info('Epoch \t Seconds \t LR \t \t Train Loss \t Train Acc')
    for epoch in range(args.epochs):
        print(epoch)
        start_epoch_time = time.time()
        train_loss = 0
        train_acc = 0
        train_n = 0
        for i, (X, y) in enumerate(train_loader):
            X, y = X.cuda(), y.cuda()
            if i == 0:
                first_batch = (X, y)
            if args.delta_init != 'previous':
                delta = torch.zeros_like(X).cuda()
            if args.delta_init == 'random':
                for j in range(len(epsilon)):
                    delta[:, j, :, :].uniform_(-epsilon[j][0][0].item(),
                                               epsilon[j][0][0].item())
                delta.data = clamp(delta,
                                   lower_limit(args.dataset) - X,
                                   upper_limit(args.dataset) - X)
            delta.requires_grad = True
            output = model(X + delta[:X.size(0)])
            loss = F.cross_entropy(output, y)
            with amp.scale_loss(loss, opt) as scaled_loss:
                scaled_loss.backward()
            grad = delta.grad.detach()
            delta.data = clamp(delta + alpha * torch.sign(grad), -epsilon,
                               epsilon)
            delta.data[:X.size(0)] = clamp(delta[:X.size(0)],
                                           lower_limit(args.dataset) - X,
                                           upper_limit(args.dataset) - X)
            delta = delta.detach()
            output = model(X + delta[:X.size(0)])
            loss = criterion(output, y)
            opt.zero_grad()
            with amp.scale_loss(loss, opt) as scaled_loss:
                scaled_loss.backward()
            opt.step()
            train_loss += loss.item() * y.size(0)
            train_acc += (output.max(1)[1] == y).sum().item()
            train_n += y.size(0)
            scheduler.step()
        if args.early_stop:
            # Check current PGD robustness of model using random minibatch
            X, y = first_batch
            pgd_delta = attack_pgd(model, X, y, epsilon, pgd_alpha, 5, 1, opt,
                                   args.dataset)
            with torch.no_grad():
                output = model(
                    clamp(X + pgd_delta[:X.size(0)], lower_limit(args.dataset),
                          upper_limit(args.dataset)))
            robust_acc = (output.max(1)[1] == y).sum().item() / y.size(0)
            if robust_acc - prev_robust_acc < -0.2:
                break
            prev_robust_acc = robust_acc
            best_state_dict = copy.deepcopy(model.state_dict())
        epoch_time = time.time()
        lr = scheduler.get_lr()[0]
        pgd_loss, pgd_acc = evaluate_pgd(test_loader, model, 7, 5,
                                         args.dataset)
        test_loss, test_acc = evaluate_standard(test_loader, model)
        logger.info(
            f'epoch {epoch}: {pgd_acc} {test_acc} {pgd_loss} {test_loss}')
        logger.info('%d \t %.1f \t \t %.4f \t %.4f \t %.4f', epoch,
                    epoch_time - start_epoch_time, lr, train_loss / train_n,
                    train_acc / train_n)
    train_time = time.time()
    if not args.early_stop:
        best_state_dict = model.state_dict()
    torch.save(best_state_dict, os.path.join(args.out_dir, 'model.pth'))
    logger.info('Total train time: %.4f minutes',
                (train_time - start_train_time) / 60)
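The training loop clips the perturbation with a clamp() helper that is not shown. A minimal sketch of what such a helper might look like, assuming elementwise clipping of a tensor to [lower_limit, upper_limit]; the project's own definition may differ:

import torch

def clamp(X, lower_limit, upper_limit):
    # Elementwise: max(min(X, upper_limit), lower_limit).
    return torch.max(torch.min(X, upper_limit), lower_limit)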
Example 12
def one_sample_t(A, mu):
    n = len(A)
    df = n - 1
    z = (np.mean(A) - mu) / std(A)
    t = z * np.sqrt(n)
    return t, stdtr(df, t)
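A toy check of this function with hypothetical data, assuming the std it calls is in scope and uses the population (n) denominator; with a sample (n-1) denominator the numbers shift slightly:

import numpy as np
from scipy.special import stdtr

A = [4.8, 5.1, 5.3, 4.9, 5.4]        # mean 5.1, population std ~= 0.228
t, p = one_sample_t(A, mu=5.0)       # t ~= 0.98, p = stdtr(4, t) ~= 0.81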
Example 13
    def AssignWeights(self, method=0):
        '''Weight the observations in the training set based on their distance from the threshold.'''
        std = utils.std(self.actual)
        print(std)
        print(self.regulatory_threshold)
        print(self.actual)
        deviation = [(x-self.regulatory_threshold)/std for x in self.actual]
        print('deviation: ' + str(deviation))
        
        #Integer weighting: weight is the observation's rounded-up whole number of standard deviations from the threshold.
        if method == 1: 
            weights = [1 for i in deviation]
            breaks = range( int(math.floor(min(deviation))), int(math.ceil(max(deviation))) )

            for i in breaks:                
                #Find all the observations that meet both criteria simultaneously
                rows = [j for j in range(len(deviation)) if deviation[j] >= i and deviation[j] < i+1]
                
                #Decide how many times to replicate each slice of data
                if i<=0:
                    replicates = 0
                else:
                    replicates = 2*i
                    
                weights = [replicates+1 if k in rows else weights[k] for k in range(len(weights))]
                
        #Continuous weighting: weight is the observation's distance (in standard deviations) from the threshold.      
        elif method == 2:
            weights = [abs(x) for x in deviation]

        #put more weight on exceedances
        elif method == 3:
            #initialize all weights to one.
            weights = [1 for i in deviation]

            #apply weight to the exceedances
            rows = [i for i in range(len(deviation)) if deviation[i] > 0]
            weights = [self.cost[1] if i in rows else weights[i] for i in range(len(weights))]

            #apply weight to the non-exceedances
            rows = [i for i in range(len(deviation)) if deviation[i] <= 0]
            weights = [self.cost[0] if i in rows else weights[i] for i in range(len(weights))]

        #put more weight on exceedances AND downweight near the threshold
        elif method == 4:
            #initialize all weights to one.
            weights = [1 for i in deviation]

            #apply weight to the exceedances
            rows = [i for i in range(len(deviation)) if deviation[i] > 0]
            weights = [self.cost[1] if i in rows else weights[i] for i in range(len(weights))]

            #apply weight to the non-exceedances
            rows = [i for i in range(len(deviation)) if deviation[i] <= 0]
            weights = [self.cost[0] if i in rows else weights[i] for i in range(len(weights))]

            #downweight near the threshold
            rows = [i for i in range(len(deviation)) if abs(deviation[i]) <= 0.25]
            weights = [weights[i]/4. if i in rows else weights[i] for i in range(len(weights))]

        #No weights: all weights are one.
        else: weights = [1 for i in deviation]
            
        return array.array('d', weights)
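Methods 3 and 4 weight exceedances by self.cost; a tiny walk-through of method 4's near-threshold downweighting, with hypothetical costs:

cost = [1, 3]            # hypothetical [non-exceedance, exceedance] costs
deviation = [0.1]        # one observation just above the threshold
weight = cost[1] / 4.    # |0.1| <= 0.25, so the exceedance weight 3 drops to 0.75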
Example 14
def extract_dataframes():

	for pid in pids:
		print ()
		print ('pid: ', pid)
		tac_reading = pd.read_csv('clean_tac/' + pid + '_clean_TAC.csv')
		acc_data = pd.read_csv('accelerometer/accelerometer_' + pid + '.csv')

		tac_labels = []

		for feat_no, feature in enumerate(features):
			print ('   feature:', feature)
			array_long = []

			for ind, row in tac_reading.iterrows():
				
				if ind!=0:
				
					t1, t2 = prev_row['timestamp'], row['timestamp']
					long_data = acc_data[ (acc_data['time']/1000 >= t1) & (acc_data['time']/1000 < t2) ]

					if not long_data.empty:
						
						if feat_no==0:
							if prev_row['TAC_Reading'] >= 0.08:
								tac_labels.append(1)
							else:
								tac_labels.append(0) 

						if feature=='rms':
							lt = []
							for axis in ['x', 'y', 'z']:
								lt.append(utils.rms(long_data[axis]))

							lt = np.array(lt)
							array_long.append(lt)

						else:
							short_datas = np.array_split(long_data, 300)
							
							# stores the features for every 1 second in 10 second segment
							array_short = []

							for short_seg, short_data in enumerate(short_datas):

								# data_short = data_long[data_long['short_segment']==short_seg]

								lt = []
								for axis in ['x', 'y', 'z']:
									data_axis =	np.array(short_data[axis])

									if feature=='mean':
										lt.append(utils.mean_feature(data_axis))
									elif feature=='std':
										lt.append(utils.std(data_axis))
									elif feature=='median':
										lt.append(utils.median(data_axis))
									elif feature=='crossing_rate':
										lt.append(utils.crossing_rate(data_axis))
									elif feature=='max_abs':
										lt.append(utils.max_abs(data_axis))
									elif feature=='min_abs':
										lt.append(utils.min_abs(data_axis))
									elif feature=='max_raw':
										lt.append(utils.max_raw(data_axis))
									elif feature=='min_raw':
										lt.append(utils.min_raw(data_axis))
									elif feature=='spec_entrp_freq':
										lt.append(utils.spectral_entropy_freq(data_axis))
									elif feature=='spec_entrp_time':
										lt.append(utils.spectral_entropy_time(data_axis))
									elif feature=='spec_centroid':
										lt.append(utils.spectral_centroid(data_axis))
									elif feature=='spec_spread':
										lt.append(utils.spectral_spread(data_axis))
									elif feature=='spec_rolloff':
										lt.append(utils.spectral_rolloff(data_axis))
									elif feature=='max_freq':
										lt.append(utils.max_freq(data_axis))
									elif feature=='spec_flux':
										if short_seg==0:
											lt.append(utils.spectral_flux(data_axis, np.zeros(len(data_axis))))
											if axis=='x':
												x = data_axis
											elif axis=='y':
												y = data_axis
											elif axis=='z':
												z = data_axis
										else:
											if axis=='x':
												if len(data_axis) > len(x):
													zeros = np.zeros(len(data_axis) - len(x))
													x = np.append(x, zeros)
												elif len(data_axis) < len(x):
													zeros = np.zeros(len(x) - len(data_axis))
													data_axis = np.append(data_axis, zeros)

												lt.append(utils.spectral_flux(data_axis, x))
											elif axis=='y':
												if len(data_axis) > len(y):
													zeros = np.zeros(len(data_axis) - len(y))
													y = np.append(y, zeros)
												elif len(data_axis) < len(y):
													zeros = np.zeros(len(y) - len(data_axis))
													data_axis = np.append(data_axis, zeros)

												lt.append(utils.spectral_flux(data_axis, y))
											elif axis=='z':
												if len(data_axis) > len(z):
													zeros = np.zeros(len(data_axis) - len(z))
													z = np.append(z, zeros)
												elif len(data_axis) < len(z):
													zeros = np.zeros(len(z) - len(data_axis))
													data_axis = np.append(data_axis, zeros)

												lt.append(utils.spectral_flux(data_axis, z))


								array_short.append(np.array(lt))
							
							short_metric = np.array(array_short)
							array_long.append(short_metric)

				prev_row = row
		
			if feature=='rms':
				df = pd.DataFrame(columns=['Rms_x', 'Rms_y', 'Rms_z'])
				long_metric = np.array(array_long)

				df['Rms_x'] = long_metric[:,0:1].flatten()
				df['Rms_y'] = long_metric[:,1:2].flatten()
				df['Rms_z'] = long_metric[:,2:].flatten()

				df.to_csv('features/' + feature + '_feature.csv', index=False)
			else:
				long_metric = np.array(array_long)

				summary_stats(long_metric, feature, pid)
		
		print ('   tac_labels: ', len(tac_labels))
		rename_column_and_concat(pid, tac_labels)
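The loop above relies on a family of utils helpers (rms, mean_feature, median, the spectral_* functions, and so on) that are not shown. As one example, a plausible rms might look like this; it is an assumption, not the project's actual code:

import numpy as np

def rms(values):
    # Root-mean-square of a 1-D sequence of samples.
    values = np.asarray(values, dtype=float)
    return np.sqrt(np.mean(values ** 2))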
Example 15
def one_sample_t(A, mu):
    n = len(A)
    df = n - 1
    z = (np.mean(A) - mu) / std(A)
    t = z * np.sqrt(n)
    return t, stdtr(df, t)
Example 16
    def CrossValidation(self, cv_method=0, **args):
        '''Select ncomp by the requested CV method'''
        validation = self.model['validation'].AsDataFrame()

        #method 0: select the fewest components with PRESS within 1 stdev of the least PRESS (by the bootstrap)
        if cv_method == 0:  #Use the bootstrap to find the standard deviation of the MSEP
            #Get the leave-one-out CV error from R:
            columns = min(self.num_predictors, self.ncomp_max)
            cv = array.array('d', validation['pred'].AsVector())
            rows = len(cv) // columns  # integer division so the slice bounds below are ints
            cc = []
            for k in range(int(columns)):
                b = k * rows
                e = b + rows
                cc.append(array.array('d', cv[b:e]))
            cv = cc

            #PRESS = map(lambda x: sum((cv[:,x]-self.array_actual)**2), range(cv.shape[1]))
            PRESS = [
                sum([(cv[i][j] - self.actual[j])**2 for j in range(rows)])
                for i in range(int(columns))
            ]
            #ncomp = np.argmin(PRESS)
            ncomp = [i for i in range(len(PRESS)) if PRESS[i] == min(PRESS)][0]

            #cv_squared_error = (cv[:,ncomp]-self.array_actual)**2
            cv_squared_error = [(cv[ncomp][j] - self.actual[j])**2
                                for j in range(int(rows))]
            sample_space = range(rows)

            PRESS_stdev = list()

            #Cache random number generator and int's constructor for a speed boost
            _random, _int = random.random, int

            for i in range(100):
                PRESS_bootstrap = list()

                for j in range(100):
                    PRESS_bootstrap.append(
                        sum([
                            cv_squared_error[_int(_random() * rows)]
                            for _ in sample_space
                        ]))

                PRESS_stdev.append(utils.std(PRESS_bootstrap))

            med_stdev = utils.median(PRESS_stdev)

            #Maximum allowable PRESS is the minimum plus one standard deviation
            good_ncomp = [
                i for i in range(len(PRESS))
                if PRESS[i] < min(PRESS) + med_stdev
            ]
            self.ncomp = int(min(good_ncomp) + 1)

        #method 1: select the fewest components with PRESS less than the minimum plus 4% of the range
        if cv_method == 1:
            #PRESS stands for predicted error sum of squares
            PRESS0 = validation['PRESS0'][0]
            PRESS = list(validation['PRESS'])

            #the range is the difference between the greatest and least PRESS values
            PRESS_range = abs(PRESS0 - min(PRESS))

            #Maximum allowable PRESS is the minimum plus a fraction of the range.
            max_CV_error = min(PRESS) + PRESS_range / 25
            good_ncomp = [
                i for i in range(len(PRESS)) if PRESS[i] < max_CV_error
            ]

            #choose the most parsimonious model that satisfies that criterion
            self.ncomp = int(min(good_ncomp) + 1)
Example 17
    entries = utils.list_bundle(bundle)
    random.shuffle(entries)
    sample = entries[:int(args.ratio * len(entries))]

    graphs, _ = utils.load_bundle(bundle, chunk=sample)
    testset.extend(graphs)

if args.mode == -1:
    modes = siggi.modes.items()
else:
    modes = [(args.mode, siggi.modes[args.mode])]

print "= Benchmarking modes for %g seconds" % args.time
for mode, fname in modes:
    times = []
    while sum(times) < args.time:
        start = time.time()
        graph = random.choice(testset)

        # Compute feature hashing
        func = getattr(siggi, fname)
        bag = func(graph)
        fvec = siggi.bag_to_fvec(bag)
        fvec = siggi.fvec_norm(fvec)

        times.append(time.time() - start)

    speed = float(len(times)) / sum(times)
    print "  Mode: %d | %5.0f graphs/s | %7.2f ms/graph | +/- %5.2f" % (
        mode, speed, 1000 * utils.mean(times), 1000 * utils.std(times))
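For reference, the reported figures are simple aggregates of the per-graph timings; with hypothetical timings they work out as follows:

# Hypothetical per-graph timings, in seconds:
times = [0.004, 0.005, 0.006]
speed = float(len(times)) / sum(times)      # 200 graphs/s
# 1000 * utils.mean(times) = 5.00 ms/graph
# 1000 * utils.std(times) ~= 0.82 ms (population standard deviation)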