import pickle


def initiate_data():
    """Build the data dict, pickle it, and pickle each first-letter subtree."""
    test_dict = {}
    # test_load_data(test_dict)
    load_data(test_dict)
    with open('data_dict.obj', 'wb') as fp:
        pickle.dump(test_dict, fp)
    storage_tree = build_tree(test_dict)
    # Each child of the root is keyed by its starting letter; store each one separately
    for letter, node in storage_tree.children.items():
        with open('subtrees/sub_tree_' + letter + '.obj', 'wb') as fp:
            pickle.dump(node, fp)
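# A minimal sketch (not part of the original) of reading one pickled subtree
# back, assuming the files written by initiate_data above:
def load_subtree(letter):
    with open('subtrees/sub_tree_' + letter + '.obj', 'rb') as fp:
        return pickle.load(fp)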
def __init__(self):
    self.weather_data = np.array(
        data_parser.load_data(
            os.path.abspath(os.path.dirname(sys.argv[0])) + "/Data/MonthlyDurham.csv"))
def main():
    datasets = load_all_files()
    features_names = load_features_names()
    diseases_names = load_diseases_names()
    data = load_data(datasets)
    data.print_classes_strength()
    data.print_combined_strength()
    show_distribution(datasets, features_names)

    features = np.concatenate([*datasets])
    classes = np.concatenate([
        np.full(dataset.shape[0], diseases_names[index])
        for (index, dataset) in enumerate(datasets)
    ])

    # Rank features by their chi-squared statistic against the class labels
    scores, p_values = chi2(features, classes)
    features_with_values = pd.concat([
        pd.DataFrame(features_names, columns=['Features']),
        pd.DataFrame(scores, columns=['Scores']),
        pd.DataFrame(p_values, columns=['P_values'])
    ], axis=1)
    print(features_with_values.sort_values('Scores', ascending=False).round(3))

    # Reorder the feature columns by descending chi-squared score
    ordered_features = features.copy()
    for idx, feature_idx in enumerate(
            features_with_values.sort_values('Scores', ascending=False).index):
        ordered_features[:, idx:idx + 1] = features[:, feature_idx:feature_idx + 1]

    print('==============================================================')

    # Drop features whose p-value exceeds the significance level
    alpha = 0.05
    for index, row in features_with_values.iterrows():
        p_value = row['P_values']
        if p_value > alpha:
            features_with_values.drop(index, inplace=True)
    print(features_with_values.sort_values('Scores', ascending=False).round(3))

    # Grid search over k and metric with repeated stratified cross-validation
    rskf = RepeatedStratifiedKFold(n_repeats=5, n_splits=2, random_state=1)
    n_neighbors_variants = [1, 5, 10]
    metric_variants = ['manhattan', 'euclidean']
    df_columns = [
        'n_features', 'n_neighbors', 'metric', 'scores', 'mean_accuracy',
        'mean_confusion_matrix'
    ]
    results_df = pd.DataFrame(columns=df_columns)
    # number_of_features = 8
    number_of_features = ordered_features.shape[1] + 1  # or set to 8

    print('Training models. Please wait...')
    for n_features in range(1, number_of_features):
        for n_neighbors in n_neighbors_variants:
            for metric in metric_variants:
                knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric)
                current_iteration_scores = []
                current_iteration_confusion_matrices = np.zeros(shape=(5, 5))
                number_of_iterations = 0
                for train, test in rskf.split(ordered_features[:, 0:n_features], classes):
                    knn.fit(ordered_features[:, 0:n_features][train], classes[train])
                    current_score = knn.score(ordered_features[:, 0:n_features][test], classes[test])
                    current_iteration_scores.append(current_score)
                    y_pred = knn.predict(ordered_features[:, 0:n_features][test])
                    current_confusion_matrix = confusion_matrix(classes[test], y_pred=y_pred)
                    current_iteration_confusion_matrices += current_confusion_matrix
                    number_of_iterations += 1
                results_df.loc[len(results_df)] = [
                    n_features, n_neighbors, metric, current_iteration_scores,
                    np.array(current_iteration_scores).mean().round(3),
                    (current_iteration_confusion_matrices / number_of_iterations)
                ]

    results_df = results_df.sort_values('mean_accuracy')
    j = 0
    print('Best mean models scoreboard:')
    for i, row in results_df.iterrows():
        j = j + 1
        print(f'[{len(results_df) - j}] Mean score for n_neighbors={row["n_neighbors"]}, '
              f'metric={row["metric"]}, n_features={row["n_features"]}: {row["mean_accuracy"]}')

    # compare_every_model_paired(results_df)

    # Compare the two best models (indexed from 0, where 0 is the best model)
    print('Compare two best models:')
    compare_two_models(0, 1, results_df)
    print('Best statistically significant model:')
    find_best_statistically_significant_model(results_df)

    best_model_params = results_df.sort_values('mean_accuracy', ascending=False).iloc[0]
    print(f'\nBest score: {best_model_params["mean_accuracy"]}')
    print(f'Best parameters: metric - {best_model_params["metric"]}, '
          f'n_neighbors - {best_model_params["n_neighbors"]}, '
          f'number of features - {best_model_params["n_features"]}')
    show_best_score_confusion_matrix(best_model_params['mean_confusion_matrix'], diseases_names)
    show_summarising_plots(number_of_features=number_of_features, results_df=results_df,
                           metric_variants=metric_variants, n_neighbors_variants=n_neighbors_variants)
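# A minimal, self-contained illustration (not from the original) of the
# chi-squared ranking used above; note that chi2 requires non-negative features:
import numpy as np
from sklearn.feature_selection import chi2

rng = np.random.RandomState(0)
X = np.abs(rng.randn(100, 4))       # 100 samples, 4 non-negative features
y = rng.randint(0, 2, size=100)     # binary class labels
scores, p_values = chi2(X, y)
ranking = np.argsort(scores)[::-1]  # feature indices, highest score first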
file_name += '_latent_size_' + str(hyper_params['latent_size'])

# Paths for storing the log file and the model file
log_file_root = "saved_logs/"
model_file_root = "saved_models/"
if not os.path.isdir(log_file_root):
    os.mkdir(log_file_root)
if not os.path.isdir(model_file_root):
    os.mkdir(model_file_root)
hyper_params['log_file'] = log_file_root + hyper_params['project_name'] + '_log' + file_name + '.txt'
hyper_params['model_file_name'] = model_file_root + hyper_params['project_name'] + '_model' + file_name + '.pt'

# Load the processed data and get the reader classes for the training, validation, and test sets
train_reader, val_reader, test_reader, total_items = load_data(hyper_params)
hyper_params['total_items'] = total_items
hyper_params['testing_batch_limit'] = test_reader.num_b

file_write(hyper_params['log_file'], "\n\nSimulation run on: " + str(dt.datetime.now()) + "\n\n")
file_write(hyper_params['log_file'], "Data reading complete!")
file_write(hyper_params['log_file'], "Number of train batches: {:4d}".format(train_reader.num_b))
file_write(hyper_params['log_file'], "Number of validation batches: {:4d}".format(val_reader.num_b))
file_write(hyper_params['log_file'], "Number of test batches: {:4d}".format(test_reader.num_b))
file_write(hyper_params['log_file'], "Total Items: " + str(total_items) + "\n")

# Instantiate the model
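# file_write is called above but not defined in this excerpt. A minimal sketch
# of what such a helper could look like (an assumption, not the original code):
def file_write(log_file, s):
    """Append a line to the log file and echo it to stdout (assumed behavior)."""
    print(s)
    with open(log_file, 'a') as f:
        f.write(s + '\n')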
def main(args):
    # Ensure reproducibility
    numpy.random.seed(10)
    random.seed(10)

    print('Loading data ...')
    data = load_data(args.task, args.paths, args.feature_set, args.emo_dim, args.normalize,
                     args.norm_opts, args.win_len, args.hop_len, save=args.cache)
    data_loader = {}
    for partition in data.keys():  # one DataLoader for each partition
        dataset = MuSeDataset(data, partition)
        batch_size = args.batch_size if partition == 'train' else 1
        shuffle = True if partition == 'train' else False  # shuffle only for the train partition
        data_loader[partition] = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
                                                             num_workers=4, worker_init_fn=seed_worker)

    args.d_in = data_loader['train'].dataset.get_feature_dim()

    if args.task == 'sent':
        args.n_targets = max([x[0, 0] for x in data['train']['label']]) + 1  # number of classes
        criterion = CrossEntropyLoss()
        score_str = 'Macro-F1'
    else:
        args.n_targets = 1
        criterion = CCCLoss()
        score_str = 'CCC'

    if args.eval_model is None:  # Train and validate for each seed
        seeds = range(args.seed, args.seed + args.n_seeds)
        val_losses, val_scores, best_model_files, test_scores = [], [], [], []
        for seed in seeds:
            torch.manual_seed(seed)
            model = Model(args)

            print('=' * 50)
            print('Training model... [seed {}]'.format(seed))

            val_loss, val_score, best_model_file = train_model(args.task, model, data_loader, args.epochs,
                                                               args.lr, args.paths['model'], seed, args.use_gpu,
                                                               criterion, regularization=args.regularization)
            if not args.predict:  # run evaluation only if test labels are available
                test_loss, test_score = evaluate(args.task, model, data_loader['test'], criterion, args.use_gpu)
                test_scores.append(test_score)
                if args.task in ['physio', 'stress', 'wilder']:
                    print(f'[Test CCC]: {test_score:7.4f}')

            val_losses.append(val_loss)
            val_scores.append(val_score)
            best_model_files.append(best_model_file)

        best_idx = val_scores.index(max(val_scores))  # find the best-performing seed
        print('=' * 50)
        print(f'Best {score_str} on [Val] for seed {seeds[best_idx]}: '
              f'[Val {score_str}]: {val_scores[best_idx]:7.4f}'
              f"{f' | [Test {score_str}]: {test_scores[best_idx]:7.4f}' if not args.predict else ''}")
        print('=' * 50)
        model_file = best_model_files[best_idx]  # best model across all seeds

    else:  # Evaluate an existing model (no training)
        model_file = args.eval_model
        model = torch.load(model_file)
        _, valid_score = evaluate(args.task, model, data_loader['devel'], criterion, args.use_gpu)
        print(f'Evaluating {model_file}:')
        print(f'[Val {score_str}]: {valid_score:7.4f}')
        if not args.predict:
            _, test_score = evaluate(args.task, model, data_loader['test'], criterion, args.use_gpu)
            print(f'[Test {score_str}]: {test_score:7.4f}')

    if args.predict:  # Make predictions for the test partition; set when there are no test labels
        print('Predicting test samples...')
        best_model = torch.load(model_file)
        evaluate(args.task, best_model, data_loader['test'], criterion, args.use_gpu, predict=True,
                 prediction_path=args.paths['predict'])

    if args.save:  # Save predictions for all partitions (needed to subsequently do late fusion)
        print('Save all predictions...')
        seed = int(model_file.split('_')[-1].split('.')[0])
        torch.manual_seed(seed)
        best_model = torch.load(model_file)
        # Load the data again without any segmentation
        data = load_data(args.task, args.paths, args.feature_set, args.emo_dim, args.normalize, args.norm_opts,
                         args.win_len, args.hop_len, save=args.cache, apply_segmentation=False)
        for partition in data.keys():
            dl = torch.utils.data.DataLoader(MuSeDataset(data, partition), batch_size=1, shuffle=False,
                                             worker_init_fn=seed_worker)
            evaluate(args.task, best_model, dl, criterion, args.use_gpu, predict=True,
                     prediction_path=args.paths['save'])

    # Delete the model if the save option is not set.
    if not args.save and not args.eval_model:
        if os.path.exists(model_file):
            os.remove(model_file)

    print('Done.')
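# seed_worker is referenced above but not defined in this excerpt. The standard
# PyTorch recipe for reproducible DataLoader workers looks like this (a sketch,
# assuming the project follows that convention):
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    numpy.random.seed(worker_seed)
    random.seed(worker_seed)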
def main(args):
    print("Save prediction: " + str(args.save))
    # Ensure reproducibility
    numpy.random.seed(10)
    random.seed(10)

    print('Loading data ...')
    encoding_position = True
    data = load_data(args.task, args.paths, args.feature_set, args.emo_dim, args.normalize, args.norm_opts,
                     args.win_len, args.hop_len, save=args.cache, encoding_position=encoding_position)
    data_loader = {}
    for partition in data.keys():  # one DataLoader for each partition
        dataset = MuSeDataset(data, partition)
        batch_size = args.batch_size if partition == 'train' else 1
        shuffle = True if partition == 'train' else False  # shuffle only for the train partition
        data_loader[partition] = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
                                                             num_workers=4, worker_init_fn=seed_worker)

    # args.d_in = data_loader['train'].dataset.get_feature_dim()
    args.n_targets = 1
    criterion = CCCLoss()
    score_str = 'CCC'

    # Per-feature-set dimensions
    d_in = {'vggface': 512, 'egemaps': 88, 'bert-4': 768, 'fau_intensity': 17,
            'deepspectrum': 4096, 'vggish': 128}
    d_rnn_in = {'vggface': 256, 'egemaps': 128, 'bert-4': 512, 'fau_intensity': 16,
                'deepspectrum': 512, 'vggish': 128}
    # rnn_out = {'visual': 256, 'audio': 128, 'text': 256, 'bio': 128}
    # args.rnn_in = rnn_in
    # args.rnn_out = rnn_out
    # tcn_ins = {'vggface': 512, 'egemaps': 88, 'bert-4': 768, 'fau_intensity': 17,
    #            'deepspectrum': 4096, 'vggish': 128}
    tcn_channels = {'vggface': (256,), 'egemaps': (128,), 'bert-4': (512,), 'fau_intensity': (16,),
                    'deepspectrum': (512,), 'vggish': (128,)}
    d_rnn = {'vggface': 128, 'egemaps': 64, 'bert-4': 128, 'fau_intensity': 16,
             'deepspectrum': 512, 'vggish': 128}

    # model_file = 'egemaps_model_101.pth'
    # if args.save:  # Save predictions for all partitions (needed to subsequently do late fusion)
    #     print('Save all predictions...')
    #     # seed = int(model_file.split('_')[-3])
    #     seed = 101
    #     torch.manual_seed(seed)
    #     print(torch.cuda.is_available())
    #     best_model = torch.load(model_file)
    #     # Load the data again without any segmentation
    #     data = load_data(args.task, args.paths, args.feature_set, args.emo_dim, args.normalize, args.norm_opts,
    #                      args.win_len, args.hop_len, save=args.cache, apply_segmentation=False)
    #     for partition in data.keys():
    #         dl = torch.utils.data.DataLoader(MuSeDataset(data, partition), batch_size=1, shuffle=False,
    #                                          worker_init_fn=seed_worker)
    #         evaluate(args.task, best_model, dl, criterion, args.use_gpu, predict=True,
    #                  prediction_path=args.paths['save'])
    #     return

    if args.eval_model is None:  # Train and validate for each seed
        seeds = range(args.seed, args.seed + args.n_seeds)
        val_losses, val_scores, best_model_files, test_scores = [], [], [], []
        for seed in seeds:
            torch.manual_seed(seed)

            # Parameter settings
            if encoding_position:
                args.d_in = d_in[args.feature_set[0]] + 1
            else:
                args.d_in = d_in[args.feature_set[0]]
            args.fea_dr = 0.
            # TCN parameters
            args.tcn_layer = 1
            args.tcn_channels = tcn_channels[args.feature_set[0]]
            args.num_dilations = 4
            args.tcn_kernel_size = 3
            args.tcn_dr = 0.1
            args.tcn_norm = True
            # RNN parameters
            args.d_rnn_in = d_rnn_in[args.feature_set[0]]
            args.d_rnn = d_rnn[args.feature_set[0]]
            args.rnn_dr = 0.2
            # Attention parameters
            args.attn_layer = 1
            args.n_heads = 1
            args.attn_dr = 0.2

            # model = Model(args)
            # model = MuseModel(args, num_outputs=1, tcn_in=tcn_in, tcn_channels=bio_signal_tcn_channel,
            #                   num_dilations=8, dropout=0.2, use_norm=True, features_dropout=0.)
            # model = MuseModelBiCrossAttention(args, dropout=0.2, features_dropout=None, attn_dropout=0.2,
            #                                   num_last_regress=32, d_attention_out=32)
            model = MuseModelWithSelfAttention(args, num_outputs=1, num_last_regress=64)
            # model = MuseModel2(args, num_outputs=1, tcn_in1=tcn_in1, tcn_in2=tcn_in2,
            #                    tcn_channels1=tcn_channels[args.feature_set[0]],
            #                    tcn_channels2=tcn_channels[args.feature_set[0]], num_dilations=8,
            #                    dropout=0.2, use_norm=True, features_dropout=0.2, d_rnn1=d_rnn1, d_rnn2=d_rnn2)

            print('=' * 50)
            print('Training model... [seed {}]'.format(seed))

            val_loss, val_score, best_model_file = train_model(args.task, model, data_loader, args.epochs,
                                                               args.lr, args.paths['model'], seed, args.use_gpu,
                                                               criterion, regularization=args.regularization)
            if not args.predict:  # run evaluation only if test labels are available
                test_loss, test_score = evaluate(args.task, model, data_loader['test'], criterion, args.use_gpu)
                test_scores.append(test_score)
                if args.task in ['physio', 'stress', 'wilder']:
                    print(f'[Test CCC]: {test_score:7.4f}')

            # val_losses.append(val_loss)
            val_scores.append(val_score)
            best_model_files.append(best_model_file)

            # epochs = range(1, 51)
            # plt.plot(epochs, tran_l, 'g', label='Training Loss')
            # plt.plot(epochs, val_l, 'b', label='Validation Loss')
            # plt.plot(epochs, val_s, 'r', label='Validation CCC')
            # plt.title(f'Training/Validation Loss and Score seed{seed}')
            # plt.xlabel('Epochs')
            # plt.ylabel('Value')
            # plt.legend()
            # plt.show()

        best_idx = val_scores.index(max(val_scores))  # find the best-performing seed
        print('=' * 50)
        print(f'Best {score_str} on [Val] for seed {seeds[best_idx]}: '
              f'[Val {score_str}]: {val_scores[best_idx]:7.4f}'
              f"{f' | [Test {score_str}]: {test_scores[best_idx]:7.4f}' if not args.predict else ''}")
        print('=' * 50)
        model_file = best_model_files[best_idx]  # best model across all seeds

    else:  # Evaluate an existing model (no training)
        model_file = args.eval_model
        model = torch.load(model_file)
        _, valid_score = evaluate(args.task, model, data_loader['devel'], criterion, args.use_gpu)
        print(f'Evaluating {model_file}:')
        print(f'[Val {score_str}]: {valid_score:7.4f}')
        if not args.predict:
            _, test_score = evaluate(args.task, model, data_loader['test'], criterion, args.use_gpu)
            print(f'[Test {score_str}]: {test_score:7.4f}')

    if args.predict:  # Make predictions for the test partition; set when there are no test labels
        print('Predicting test samples...')
        best_model = torch.load(model_file)
        evaluate(args.task, best_model, data_loader['test'], criterion, args.use_gpu, predict=True,
                 prediction_path=args.paths['predict'])

    if args.save:  # Save predictions for all partitions (needed to subsequently do late fusion)
        print('Save all predictions...')
        seed = int(model_file.split('_')[-3])
        torch.manual_seed(seed)
        best_model = torch.load(model_file)
        # Load the data again without any segmentation
        data = load_data(args.task, args.paths, args.feature_set, args.emo_dim, args.normalize, args.norm_opts,
                         args.win_len, args.hop_len, save=args.cache, apply_segmentation=False)
        for partition in data.keys():
            dl = torch.utils.data.DataLoader(MuSeDataset(data, partition), batch_size=1, shuffle=False,
                                             worker_init_fn=seed_worker)
            evaluate(args.task, best_model, dl, criterion, args.use_gpu, predict=True,
                     prediction_path=args.paths['save'])

    # Delete the model if the save option is not set.
    if not args.save and not args.eval_model:
        if os.path.exists(model_file):
            os.remove(model_file)

    print('Done.')
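# encoding_position=True adds one extra input channel (hence d_in + 1 above).
# The data loader is not shown, so this is only a sketch of the assumed
# behavior: append a normalized time index to each frame's feature vector.
import numpy

def append_position_channel(features):
    """features: (seq_len, d) array -> (seq_len, d + 1), position scaled to [0, 1]."""
    seq_len = features.shape[0]
    position = numpy.linspace(0., 1., num=seq_len)[:, None]
    return numpy.concatenate([features, position], axis=1)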
                    help='If the data set was already pre-processed by the Prober lab Perl script.')
args = parser.parse_args()

# Specify output
bokeh.io.output_file(args.html_file, title='fish sleep explorer')

# Parse data frames
if args.tidy:
    df = pd.read_csv(args.activity_file)
elif args.perl_processed:
    df_gt = data_parser.load_gtype(args.gtype_file)
    df = data_parser.load_perl_processed_activity(args.activity_file, df_gt)
else:
    df = data_parser.load_data(args.activity_file, args.gtype_file, args.lights_on,
                               args.lights_off, int(args.day_in_the_life))

# Resample the data
df_resampled = data_parser.resample(df, int(args.ind_win))

# Get the approximate time interval covered by each averaged point
inds = df_resampled.fish == df_resampled.fish.unique()[0]
zeit = np.sort(df_resampled.loc[inds, 'zeit'].values)
dt = np.mean(np.diff(zeit)) * 60  # mean spacing in minutes (zeit is in hours)

# Make the y-axis label
y_axis_label = 'sec. of act. in {0:.1f} min.'.format(dt)

# Get the summary statistic
if args.summary_trace in ['none', 'None']:
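# data_parser.resample is not shown here. Assuming it aggregates activity over
# windows of ind_win consecutive samples per fish (the y-axis label above
# suggests seconds of activity per interval), a hypothetical pandas sketch:
import pandas as pd

def resample(df, ind_win):
    """Sum an assumed 'activity' column over blocks of ind_win rows per fish."""
    out = []
    for fish, grp in df.groupby('fish'):
        grp = grp.sort_values('zeit').reset_index(drop=True)
        block = grp.index // ind_win
        out.append(grp.groupby(block).agg(
            {'fish': 'first', 'zeit': 'first', 'activity': 'sum'}))
    return pd.concat(out, ignore_index=True)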
# Run script
# Myles Scholz
import data_parser as dp
import Network as nw

dp.make_data()
tr_d, te_d, va_d = dp.load_data()
print(len(tr_d))

net = nw.Network([28, 25, 25, 28])
# Presumably: train for 30 epochs with mini-batch size 10 and learning rate 1.0
net.SGD(tr_d, 30, 10, 1.0, test_data=te_d)
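# The Network/SGD call above matches the classic Nielsen-style interface
# (SGD(training_data, epochs, mini_batch_size, eta, test_data)). Assuming that
# convention, a trained net could be queried on one example like this sketch:
# x, y = va_d[0]                 # x is a (28, 1) column vector in that convention
# output = net.feedforward(x)    # forward pass through the trained network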
def __init__(self, scheduleType="Fixed", wake_up=7.0, sleep=11.0, typical=1, num_outings=4,
             variation_weekday=1, variation_weekend=1, abrupt_change_per_year=2, test_id=0,
             progression_rate=0):
    self.scheduleType = scheduleType
    self.wake_up = wake_up
    self.sleep = sleep
    self.typical = typical
    self.variation_weekday = variation_weekday
    self.variation_weekend = variation_weekend
    self.abrupt_change_per_year = abrupt_change_per_year
    self.progression_rate = progression_rate
    self.functions = {
        'Test': self.getTimesTest,
        'Fixed': self.getTimesFixed,
        'Progressing': self.getTimesProgressing,
    }

    # Load the outing activities and append their cluster ids as an extra column
    self.activities_data = np.array(data_parser.load_data(
        os.path.abspath(os.path.dirname(sys.argv[0])) + "/Data/not_at_home_activities.csv"))
    idx = np.r_[0:1, 50001:75002]
    self.activities_data = self.activities_data[idx]
    self.activities_data = np.append(
        self.activities_data,
        np.array(data_parser.load_data(
            os.path.abspath(os.path.dirname(sys.argv[0])) + "/Data/clusters.csv")),
        axis=1)

    # Keep only the rows that were assigned to a cluster (column 7 holds the cluster id)
    self.clustered_data = []
    for row in self.activities_data:
        if row[7] != '0':
            self.clustered_data.append(row)

    self.test_id = test_id
    self.rand_id = np.random.randint(1, max(self.activities_data.shape))
    self.num_outings = num_outings

    # Sample weekday outings: num_outings rows drawn from distinct clusters
    self.base_start_times = []
    self.base_durations = []
    self.clust_id = np.random.randint(1, len(self.clustered_data))
    self.past_clust_id = [0]
    for i in range(num_outings):
        while self.clustered_data[self.clust_id][7] in self.past_clust_id:
            self.clust_id = np.random.randint(1, len(self.clustered_data))
        self.past_clust_id.append(self.clustered_data[self.clust_id][7])
        self.base_start_times.append(float(self.clustered_data[self.clust_id][4]))
        self.base_durations.append(float(self.clustered_data[self.clust_id][3]))

    # Sample weekend outings the same way
    self.base_weekend_start_times = []
    self.base_weekend_durations = []
    self.clust_id = np.random.randint(1, len(self.clustered_data))
    self.past_clust_id = [0]
    for i in range(num_outings):
        while self.clustered_data[self.clust_id][7] in self.past_clust_id:
            self.clust_id = np.random.randint(1, len(self.clustered_data))
        self.past_clust_id.append(self.clustered_data[self.clust_id][7])
        self.base_weekend_start_times.append(float(self.clustered_data[self.clust_id][4]))
        self.base_weekend_durations.append(float(self.clustered_data[self.clust_id][3]))
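# The two sampling loops above are identical apart from the target lists. A
# hypothetical helper (not in the original) could fold them into one routine:
import numpy as np

def sample_outings(clustered_data, num_outings, rng=np.random):
    """Pick num_outings rows from distinct clusters (column 7 holds the cluster id)."""
    start_times, durations, seen = [], [], set()
    while len(start_times) < num_outings:
        row = clustered_data[rng.randint(1, len(clustered_data))]
        if row[7] not in seen:
            seen.add(row[7])
            start_times.append(float(row[4]))  # column 4: outing start time
            durations.append(float(row[3]))    # column 3: outing duration
    return start_times, durations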