def find_most_equal(data, class_list):
    """Return the key in `data` whose per-class standard deviations vary the least."""
    m_std = {}
    for matter in data.keys():
        m_stds = []
        for _class in class_list:
            c_tab = class_tab(data[matter], class_list[_class])
            m_stds.append(std(c_tab))
        # spread of the per-class spreads for this matter
        m_std[matter] = std(m_stds)
    return min(m_std, key=m_std.get)
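# A minimal usage sketch with hypothetical data. `class_tab` and `std` are assumed to be
# helpers from the surrounding module (tabulate the values belonging to one class, and
# compute a standard deviation); the names `grades` and `groups` are illustrative only.
grades = {"math": [3, 4, 5, 3], "physics": [2, 5, 5, 1]}
groups = {"A": [0, 1], "B": [2, 3]}
most_equal = find_most_equal(grades, groups)  # the subject whose class spreads differ least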
def CrossValidation(self, cv_method=0, **args):
    """Select ncomp by the requested CV method."""
    validation = self.model["validation"].AsDataFrame()

    # method 0: select the fewest components with PRESS within one standard
    # deviation of the least PRESS (standard deviation estimated by the bootstrap)
    if cv_method == 0:
        # Use the bootstrap to find the standard deviation of the MSEP.
        # Get the leave-one-out CV error from R:
        columns = min(self.num_predictors, self.ncomp_max)
        cv = array.array("d", validation["pred"].AsVector())
        rows = len(cv) / columns
        cc = []
        for k in range(int(columns)):
            b = k * rows
            e = b + rows
            cc.append(array.array("d", cv[b:e]))
        cv = cc

        # PRESS = map(lambda x: sum((cv[:,x]-self.array_actual)**2), range(cv.shape[1]))
        PRESS = [sum([(cv[i][j] - self.actual[j]) ** 2 for j in range(rows)])
                 for i in range(int(columns))]

        # ncomp = np.argmin(PRESS)
        ncomp = [i for i in range(len(PRESS)) if PRESS[i] == min(PRESS)][0]

        # cv_squared_error = (cv[:,ncomp]-self.array_actual)**2
        cv_squared_error = [(cv[ncomp][j] - self.actual[j]) ** 2 for j in range(int(rows))]

        sample_space = xrange(rows)
        PRESS_stdev = list()

        # Cache the random number generator and int's constructor for a speed boost.
        _random, _int = random.random, int
        for i in range(100):
            PRESS_bootstrap = list()
            for j in range(100):
                # Resample the squared errors with replacement; the sum is one bootstrap PRESS.
                PRESS_bootstrap.append(
                    sum([cv_squared_error[_int(_random() * rows)] for _ in sample_space]))
            PRESS_stdev.append(utils.std(PRESS_bootstrap))
        med_stdev = utils.median(PRESS_stdev)

        # Maximum allowable PRESS is the minimum plus one standard deviation.
        good_ncomp = [i for i in range(len(PRESS)) if PRESS[i] < min(PRESS) + med_stdev]
        self.ncomp = int(min(good_ncomp) + 1)

    # method 1: select the fewest components with PRESS less than the minimum plus 4% of the range
    if cv_method == 1:
        # PRESS stands for predicted error sum of squares.
        PRESS0 = validation["PRESS0"][0]
        PRESS = list(validation["PRESS"])

        # The range is the difference between the greatest and least PRESS values.
        PRESS_range = abs(PRESS0 - min(PRESS))

        # Maximum allowable PRESS is the minimum plus a fraction (1/25 = 4%) of the range.
        max_CV_error = min(PRESS) + PRESS_range / 25
        good_ncomp = [i for i in range(len(PRESS)) if PRESS[i] < max_CV_error]

        # Choose the most parsimonious model that satisfies that criterion.
        self.ncomp = int(min(good_ncomp) + 1)
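# A minimal standalone sketch of the selection rule in method 0, assuming the PRESS values
# and a bootstrap estimate of their spread are already in hand (the numbers below are
# illustrative only, not taken from any fitted model):
PRESS = [14.2, 9.8, 9.1, 9.0, 9.3]   # one entry per candidate ncomp
press_spread = 0.4                   # e.g. the median bootstrap stdev of PRESS
cutoff = min(PRESS) + press_spread
ncomp = min(i for i, p in enumerate(PRESS) if p < cutoff) + 1  # fewest components under the cutoff
# With these numbers the cutoff is 9.4, so ncomp = 3 rather than the PRESS-minimizing 4.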
def AssignWeights(self, method=0):
    #Weight the observations in the training set based on their distance from the threshold.
    obs = self.data_dictionary[self.target]
    deviation = [(obs[i] - self.regulatory_threshold) / utils.std(obs) for i in range(len(obs))]

    #Integer weighting: weight is the observation's rounded-up whole number of standard deviations from the threshold.
    if method == 1:
        weights = [1 for k in range(len(deviation))]
        breaks = range(int(math.floor(min(deviation))), int(math.ceil(max(deviation))))
        for i in breaks:
            #find all observations that meet the upper and lower criteria, separately
            first_slice = [k for k in range(len(deviation)) if deviation[k] > i]
            second_slice = [k for k in range(len(deviation)) if deviation[k] < i + 1]

            #now find all the observations that meet both criteria simultaneously
            rows = filter(lambda x: x in first_slice, second_slice)
            rows = [int(r) for r in rows]

            #Decide how many times to replicate each slice of data
            if i < 0:
                replicates = (abs(i) - 1)
            else:
                replicates = i

            for r in rows:
                weights[r] = replicates + 1

    #Continuous weighting: weight is the observation's distance (in standard deviations) from the threshold.
    elif method == 2:
        weights = [abs(x) for x in deviation]

    #No weights: all weights are one.
    else:
        weights = [1.0 for k in range(len(deviation))]

    return weights
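# A compact standalone restatement of the integer-weighting scheme above, outside the class,
# assuming `utils.std` is a plain sample standard deviation (values are illustrative only).
# Each observation's weight is roughly the rounded-up number of standard deviations it sits
# from the threshold, with a floor of one:
obs = [0.2, 0.5, 1.1, 2.4]
threshold = 0.5
spread = utils.std(obs)
deviation = [(x - threshold) / spread for x in obs]
weights = [max(int(math.ceil(abs(d))), 1) for d in deviation]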
def snapshot():
    """Record summary statistics for the current graph into the global collector."""
    global collector
    collector.total_weight = total_weight()
    collector.path_length = path_length()
    edges_importances = gi.get_all_edges_importances()
    collector.mean_edges_importance = utils.mean(edges_importances)
    collector.std_edges_importance = utils.std(edges_importances)
    print collector.__dict__
def fit(self, X):
    """Record the per-feature mean and standard deviation of the training data."""
    self._is_trained = True
    self.std = []
    self.mean = []
    self.count_feature = len(X[0])
    for feature_idx in range(self.count_feature):
        data = [x[feature_idx] for x in X]
        curr_mean = mean(data)
        curr_std = std(data)
        self.mean.append(curr_mean)
        self.std.append(curr_std)
    return self
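# A hypothetical companion sketch (not part of the original class): one way the stored
# statistics could be applied to standardize new rows, assuming every fitted std is non-zero.
def transform_sketch(scaler, X):
    # scale each feature to zero mean and unit variance using the fitted statistics
    return [[(row[i] - scaler.mean[i]) / scaler.std[i]
             for i in range(scaler.count_feature)]
            for row in X]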
def GetInfluence(self):
    #Get the covariate names
    self.names = self.data_dictionary.keys()
    self.names.remove(self.target)

    #Now get the model coefficients from R.
    coefficients = self.Extract('coef').AsVector()

    #Get the standard deviations (from the data_dictionary) and package the influence in a dictionary.
    raw_influence = list()
    for i in range(len(self.names)):
        standard_deviation = utils.std(self.data_dictionary[self.names[i]])
        raw_influence.append(float(abs(standard_deviation * coefficients[i + 1])))

    self.influence = dict(zip([float(x / sum(raw_influence)) for x in raw_influence], self.names))
    return self.influence
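# A minimal standalone sketch of the influence measure above, assuming plain lists and a
# coefficient vector whose first entry is the intercept (values are illustrative only).
# For readability this sketch keys the result by covariate name rather than by the
# normalized influence value used in the method:
coefs = [0.3, 1.2, -0.7]  # [intercept, beta_1, beta_2]
covariates = {"turbidity": [1.0, 2.0, 4.0], "rainfall": [0.1, 0.4, 0.2]}
names = list(covariates)
raw = [abs(utils.std(covariates[n]) * coefs[k + 1]) for k, n in enumerate(names)]
influence = {n: r / sum(raw) for n, r in zip(names, raw)}  # share of total |std * coef|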
def GetInfluence(self):
    #Get the model terms from R's model object
    terms = self.Extract('terms')
    terms = str(terms)

    #Get the covariate names
    self.names = self.data_dictionary.keys()
    self.names.remove(self.target)

    #Now get the model coefficients from R.
    coefficients = array.array('d', self.Extract('coef'))

    #Get the standard deviations (from the data_dictionary) and package the influence in a dictionary.
    raw_influence = list()
    for i in range(len(self.names)):
        standard_deviation = utils.std(self.data_dictionary[self.names[i]])
        raw_influence.append(abs(standard_deviation * coefficients[i + 1]))

    self.influence = dict(zip([raw_influence[k] / sum(raw_influence) for k in range(len(raw_influence))], self.names))
def main():
    args = get_args()

    if not os.path.exists(args.out_dir):
        os.mkdir(args.out_dir)
    logfile = os.path.join(args.out_dir, 'output.log')
    if os.path.exists(logfile):
        os.remove(logfile)

    logging.basicConfig(format='[%(asctime)s] - %(message)s',
                        datefmt='%Y/%m/%d %H:%M:%S',
                        level=logging.INFO,
                        filename=os.path.join(args.out_dir, 'output.log'))
    logger.info(args)

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    epsilon = (args.epsilon / 255.) / std(args.dataset)
    alpha = (args.alpha / 255.) / std(args.dataset)
    pgd_alpha = (2 / 255.) / std(args.dataset)

    if args.cfg is not None:
        pruned_cfg = [
            int(x) for x in args.cfg.replace('[', '').replace(']', '').replace(' ', '').split(',')
            if x.isnumeric()
        ]
        print(pruned_cfg)
    else:
        pruned_cfg = torch.load(args.cfg_dir)['cfg']

    if args.model == 'vgg':
        if args.dataset == 'cifar10':
            train_loader, test_loader = get_loaders(args.data_dir, args.batch_size, 'cifar10')
            model = vgg(16, cfg=pruned_cfg, seed=0)
        elif args.dataset == 'cifar100':
            train_loader, test_loader = get_loaders(args.data_dir, args.batch_size, 'cifar100')
            model = vgg(16, cfg=pruned_cfg, dataset='cifar100', seed=0)
    if args.model == 'resnet':
        if args.dataset == 'cifar10':
            train_loader, test_loader = get_loaders(args.data_dir, args.batch_size, 'cifar10')
            model = resnet18(seed=0, cfg=pruned_cfg, num_classes=10)
        elif args.dataset == 'cifar100':
            train_loader, test_loader = get_loaders(args.data_dir, args.batch_size, 'cifar100')
            model = resnet18(seed=0, cfg=pruned_cfg, num_classes=100)

    model.load_state_dict(torch.load(args.model_dir))
    model.cuda()
    model.train()

    opt = torch.optim.SGD(model.parameters(),
                          lr=args.lr_max,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    amp_args = dict(opt_level=args.opt_level, loss_scale=args.loss_scale, verbosity=False)
    if args.opt_level == 'O2':
        amp_args['master_weights'] = args.master_weights
    model, opt = amp.initialize(model, opt, **amp_args)
    criterion = nn.CrossEntropyLoss()

    if args.delta_init == 'previous':
        delta = torch.zeros(args.batch_size, 3, 32, 32).cuda()

    lr_steps = args.epochs * len(train_loader)
    if args.lr_schedule == 'cyclic':
        scheduler = torch.optim.lr_scheduler.CyclicLR(opt,
                                                      base_lr=args.lr_min,
                                                      max_lr=args.lr_max,
                                                      step_size_up=lr_steps / 2,
                                                      step_size_down=lr_steps / 2)
    elif args.lr_schedule == 'multistep':
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            opt, milestones=[lr_steps / 2, lr_steps * 3 / 4], gamma=0.1)

    # Training
    prev_robust_acc = 0.
    start_train_time = time.time()
    logger.info('Epoch \t Seconds \t LR \t \t Train Loss \t Train Acc')
    for epoch in range(args.epochs):
        print(epoch)
        start_epoch_time = time.time()
        train_loss = 0
        train_acc = 0
        train_n = 0
        for i, (X, y) in enumerate(train_loader):
            X, y = X.cuda(), y.cuda()
            if i == 0:
                first_batch = (X, y)
            if args.delta_init != 'previous':
                delta = torch.zeros_like(X).cuda()
            if args.delta_init == 'random':
                for j in range(len(epsilon)):
                    delta[:, j, :, :].uniform_(-epsilon[j][0][0].item(), epsilon[j][0][0].item())
                delta.data = clamp(delta, lower_limit(args.dataset) - X, upper_limit(args.dataset) - X)
            delta.requires_grad = True
            output = model(X + delta[:X.size(0)])
            loss = F.cross_entropy(output, y)
            with amp.scale_loss(loss, opt) as scaled_loss:
                scaled_loss.backward()
            grad = delta.grad.detach()
            delta.data = clamp(delta + alpha * torch.sign(grad), -epsilon, epsilon)
            delta.data[:X.size(0)] = clamp(delta[:X.size(0)], lower_limit(args.dataset) - X, upper_limit(args.dataset) - X)
            delta = delta.detach()
            output = model(X + delta[:X.size(0)])
            loss = criterion(output, y)
            opt.zero_grad()
            with amp.scale_loss(loss, opt) as scaled_loss:
                scaled_loss.backward()
            opt.step()
            train_loss += loss.item() * y.size(0)
            train_acc += (output.max(1)[1] == y).sum().item()
            train_n += y.size(0)
            scheduler.step()
        if args.early_stop:
            # Check current PGD robustness of model using random minibatch
            X, y = first_batch
            pgd_delta = attack_pgd(model, X, y, epsilon, pgd_alpha, 5, 1, opt, args.dataset)
            with torch.no_grad():
                output = model(clamp(X + pgd_delta[:X.size(0)], lower_limit(args.dataset), upper_limit(args.dataset)))
            robust_acc = (output.max(1)[1] == y).sum().item() / y.size(0)
            if robust_acc - prev_robust_acc < -0.2:
                break
            prev_robust_acc = robust_acc
            best_state_dict = copy.deepcopy(model.state_dict())
        epoch_time = time.time()
        lr = scheduler.get_lr()[0]
        pgd_loss, pgd_acc = evaluate_pgd(test_loader, model, 7, 5, args.dataset)
        test_loss, test_acc = evaluate_standard(test_loader, model)
        logger.info(f'epoch {epoch}: {pgd_acc} {test_acc} {pgd_loss} {test_loss}')
        logger.info('%d \t %.1f \t \t %.4f \t %.4f \t %.4f', epoch,
                    epoch_time - start_epoch_time, lr, train_loss / train_n, train_acc / train_n)
    train_time = time.time()
    if not args.early_stop:
        best_state_dict = model.state_dict()
    torch.save(best_state_dict, os.path.join(args.out_dir, 'model.pth'))
    logger.info('Total train time: %.4f minutes', (train_time - start_train_time) / 60)
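# A minimal standalone sketch of the single-step (FGSM-style) perturbation update used in
# the loop above, with plain tensors and a scalar epsilon instead of the per-channel limits
# and AMP machinery; purely illustrative, not the training script's API.
import torch

def fgsm_step(model, X, y, epsilon, alpha):
    delta = torch.zeros_like(X, requires_grad=True)
    loss = torch.nn.functional.cross_entropy(model(X + delta), y)
    loss.backward()
    # move in the sign of the gradient, then project back into the epsilon ball
    delta = (delta + alpha * delta.grad.sign()).clamp(-epsilon, epsilon)
    return (X + delta).detach()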
def one_sample_t(A, mu):
    """Return the one-sample t statistic and the Student-t CDF evaluated at it."""
    n = len(A)
    df = n - 1
    z = (np.mean(A) - mu) / std(A)
    t = z * np.sqrt(n)
    return t, stdtr(df, t)
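# A usage sketch, assuming `std` is the sample standard deviation (e.g. numpy's std with
# ddof=1), `np` is numpy, and `stdtr` comes from scipy.special; the values are illustrative:
#   import numpy as np
#   from scipy.special import stdtr
sample = [5.1, 4.8, 5.3, 5.0, 4.9]
t_stat, left_tail = one_sample_t(sample, mu=5.0)
p_two_sided = 2 * min(left_tail, 1 - left_tail)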
def AssignWeights(self, method=0):
    '''Weight the observations in the training set based on their distance from the threshold.'''
    std = utils.std(self.actual)
    print std
    print self.regulatory_threshold
    print self.actual
    deviation = [(x - self.regulatory_threshold) / std for x in self.actual]
    print 'deviation: ' + str(deviation)

    #Integer weighting: weight is the observation's rounded-up whole number of standard deviations from the threshold.
    if method == 1:
        weights = [1 for i in deviation]
        breaks = range(int(math.floor(min(deviation))), int(math.ceil(max(deviation))))
        for i in breaks:
            #Find all the observations that meet both criteria simultaneously
            rows = [j for j in range(len(deviation)) if deviation[j] >= i and deviation[j] < i + 1]

            #Decide how many times to replicate each slice of data
            if i <= 0:
                replicates = 0
            else:
                replicates = 2 * i

            weights = [replicates + 1 if k in rows else weights[k] for k in range(len(weights))]

    #Continuous weighting: weight is the observation's distance (in standard deviations) from the threshold.
    elif method == 2:
        weights = [abs(x) for x in deviation]

    #put more weight on exceedances
    elif method == 3:
        #initialize all weights to one.
        weights = [1 for i in deviation]

        #apply weight to the exceedances
        rows = [i for i in range(len(deviation)) if deviation[i] > 0]
        weights = [self.cost[1] if i in rows else weights[i] for i in range(len(weights))]

        #apply weight to the non-exceedances
        rows = [i for i in range(len(deviation)) if deviation[i] <= 0]
        weights = [self.cost[0] if i in rows else weights[i] for i in range(len(weights))]

    #put more weight on exceedances AND downweight near the threshold
    elif method == 4:
        #initialize all weights to one.
        weights = [1 for i in deviation]

        #apply weight to the exceedances
        rows = [i for i in range(len(deviation)) if deviation[i] > 0]
        weights = [self.cost[1] if i in rows else weights[i] for i in range(len(weights))]

        #apply weight to the non-exceedances
        rows = [i for i in range(len(deviation)) if deviation[i] <= 0]
        weights = [self.cost[0] if i in rows else weights[i] for i in range(len(weights))]

        #downweight near the threshold
        rows = [i for i in range(len(deviation)) if abs(deviation[i]) <= 0.25]
        weights = [weights[i] / 4. if i in rows else weights[i] for i in range(len(weights))]

    #No weights: all weights are one.
    else:
        weights = [1 for i in deviation]

    return array.array('d', weights)
def extract_dataframes():
    for pid in pids:
        print()
        print('pid: ', pid)
        tac_reading = pd.read_csv('clean_tac/' + pid + '_clean_TAC.csv')
        acc_data = pd.read_csv('accelerometer/accelerometer_' + pid + '.csv')
        tac_labels = []
        for feat_no, feature in enumerate(features):
            print(' feature:', feature)
            array_long = []
            for ind, row in tac_reading.iterrows():
                if ind != 0:
                    t1, t2 = prev_row['timestamp'], row['timestamp']
                    long_data = acc_data[(acc_data['time'] / 1000 >= t1) & (acc_data['time'] / 1000 < t2)]
                    if not long_data.empty:
                        if feat_no == 0:
                            if prev_row['TAC_Reading'] >= 0.08:
                                tac_labels.append(1)
                            else:
                                tac_labels.append(0)
                        if feature == 'rms':
                            lt = []
                            for axis in ['x', 'y', 'z']:
                                lt.append(utils.rms(long_data[axis]))
                            lt = np.array(lt)
                            array_long.append(lt)
                        else:
                            short_datas = np.array_split(long_data, 300)
                            # stores the features for every 1 second in 10 second segment
                            array_short = []
                            for short_seg, short_data in enumerate(short_datas):
                                # data_short = data_long[data_long['short_segment']==short_seg]
                                lt = []
                                for axis in ['x', 'y', 'z']:
                                    data_axis = np.array(short_data[axis])
                                    if feature == 'mean':
                                        lt.append(utils.mean_feature(data_axis))
                                    elif feature == 'std':
                                        lt.append(utils.std(data_axis))
                                    elif feature == 'median':
                                        lt.append(utils.median(data_axis))
                                    elif feature == 'crossing_rate':
                                        lt.append(utils.crossing_rate(data_axis))
                                    elif feature == 'max_abs':
                                        lt.append(utils.max_abs(data_axis))
                                    elif feature == 'min_abs':
                                        lt.append(utils.min_abs(data_axis))
                                    elif feature == 'max_raw':
                                        lt.append(utils.max_raw(data_axis))
                                    elif feature == 'min_raw':
                                        lt.append(utils.min_raw(data_axis))
                                    elif feature == 'spec_entrp_freq':
                                        lt.append(utils.spectral_entropy_freq(data_axis))
                                    elif feature == 'spec_entrp_time':
                                        lt.append(utils.spectral_entropy_time(data_axis))
                                    elif feature == 'spec_centroid':
                                        lt.append(utils.spectral_centroid(data_axis))
                                    elif feature == 'spec_spread':
                                        lt.append(utils.spectral_spread(data_axis))
                                    elif feature == 'spec_rolloff':
                                        lt.append(utils.spectral_rolloff(data_axis))
                                    elif feature == 'max_freq':
                                        lt.append(utils.max_freq(data_axis))
                                    elif feature == 'spec_flux':
                                        if short_seg == 0:
                                            lt.append(utils.spectral_flux(data_axis, np.zeros(len(data_axis))))
                                            if axis == 'x':
                                                x = data_axis
                                            elif axis == 'y':
                                                y = data_axis
                                            elif axis == 'z':
                                                z = data_axis
                                        else:
                                            if axis == 'x':
                                                if len(data_axis) > len(x):
                                                    zeros = np.zeros(len(data_axis) - len(x))
                                                    x = np.append(x, zeros)
                                                elif len(data_axis) < len(x):
                                                    zeros = np.zeros(len(x) - len(data_axis))
                                                    data_axis = np.append(data_axis, zeros)
                                                lt.append(utils.spectral_flux(data_axis, x))
                                            elif axis == 'y':
                                                if len(data_axis) > len(y):
                                                    zeros = np.zeros(len(data_axis) - len(y))
                                                    y = np.append(y, zeros)
                                                elif len(data_axis) < len(y):
                                                    zeros = np.zeros(len(y) - len(data_axis))
                                                    data_axis = np.append(data_axis, zeros)
                                                lt.append(utils.spectral_flux(data_axis, y))
                                            elif axis == 'z':
                                                if len(data_axis) > len(z):
                                                    zeros = np.zeros(len(data_axis) - len(z))
                                                    z = np.append(z, zeros)
                                                elif len(data_axis) < len(z):
                                                    zeros = np.zeros(len(z) - len(data_axis))
                                                    data_axis = np.append(data_axis, zeros)
                                                lt.append(utils.spectral_flux(data_axis, z))
                                array_short.append(np.array(lt))
                            short_metric = np.array(array_short)
                            array_long.append(short_metric)
                prev_row = row
            if feature == 'rms':
                df = pd.DataFrame(columns=['Rms_x', 'Rms_y', 'Rms_z'])
                long_metric = np.array(array_long)
                df['Rms_x'] = long_metric[:, 0:1].flatten()
                df['Rms_y'] = long_metric[:, 1:2].flatten()
                df['Rms_z'] = long_metric[:, 2:].flatten()
                df.to_csv('features/' + feature + '_feature.csv', index=False)
            else:
                long_metric = np.array(array_long)
                summary_stats(long_metric, feature, pid)
        print(' tac_labels: ', len(tac_labels))
        rename_column_and_concat(pid, tac_labels)
entries = utils.list_bundle(bundle)
random.shuffle(entries)
sample = entries[:int(args.ratio * len(entries))]
graphs, _ = utils.load_bundle(bundle, chunk=sample)
testset.extend(graphs)

if args.mode == -1:
    modes = siggi.modes.items()
else:
    modes = [(args.mode, siggi.modes[args.mode])]

print "= Benchmarking modes for %g seconds" % args.time
for mode, fname in modes:
    times = []
    while sum(times) < args.time:
        start = time.time()
        graph = random.choice(testset)

        # Compute feature hashing
        func = getattr(siggi, fname)
        bag = func(graph)
        fvec = siggi.bag_to_fvec(bag)
        fvec = siggi.fvec_norm(fvec)

        times.append(time.time() - start)

    speed = float(len(times)) / sum(times)
    print "  Mode: %d | %5.0f graphs/s | %7.2f ms/graph | +/- %5.2f" % (
        mode, speed, 1000 * utils.mean(times), 1000 * utils.std(times))