def load_data_input(path_to_params):
    params = TransformParameterParser(path_to_params).parse_params()
    print(params)
    set_random_seeds(params)
    data_input = DataInput(params['data_params'])
    data_input.split_data()
    return data_input

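# Usage sketch (not part of the original scripts; the YAML path below is a
# hypothetical placeholder). It only assumes DataInput exposes x_tr / x_te
# after split_data(), which matches how the other functions here use it.
#
#     data_input = load_data_input('configs/example_umap_run.yaml')
#     print('%d train / %d test samples' % (len(data_input.x_tr), len(data_input.x_te)))
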
def init_data_input(params, transformer_path):
    data_input = DataInput(params['data_params'])
    data_input.split_data()
    with open(transformer_path, 'rb') as f:
        data_transformer = pickle.load(f)
    print(data_transformer)
    data_input.embed_data(
        data_transformer,
        params['transform_params']['cells_to_subsample'],
        params['transform_params']['num_cells_for_transformer']
    )
    data_input.normalize_data()
    data_input.prepare_data_for_training()
    return data_input

def load_and_prepare_data_input(params):
    data_input = DataInput(params['data_params'])
    data_input.split_data()
    print('%d samples in the training data' % len(data_input.x_tr))
    with open(os.path.join(params['save_dir'], 'transformer.pkl'), 'rb') as f:
        data_transformer = pickle.load(f)
    # for debugging
    # params['transform_params']['cells_to_subsample'] = 2
    data_input.embed_data(
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    data_input.normalize_data()
    data_input.convert_all_data_to_tensors()
    return data_input

def main(params):
    start_time = time.time()
    # eventually uncomment this; leaving as-is for now to keep the same results as before for comparison
    set_random_seeds(params)
    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])
    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)
    data_input = DataInput(params['data_params'])
    data_input.split_data()
    print('%d samples in the training data' % len(data_input.x_tr))

    # force identity for the first transform
    data_transformer = DataTransformerFactory(
        {'transform_type': 'identity'},
        params['random_seed']).manufacture_transformer()
    data_input.embed_data_and_fit_transformer(
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        num_cells_for_transformer=params['transform_params']['num_cells_for_transformer'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    data_input.normalize_data()

    # gates aren't plotted because we're in n dimensions
    unused_cluster_gate_inits = init_gates(data_input, params)
    # data_input.convert_all_data_to_tensors()

    figscale = 8
    fig, axs = plt.subplots(
        nrows=len(unused_cluster_gate_inits),
        figsize=(figscale, len(unused_cluster_gate_inits) * figscale))
    print("initializing model")
    for gate, ax in zip(unused_cluster_gate_inits, axs):
        dataset = torch.utils.data.TensorDataset(
            torch.tensor(data_input.x_tr, dtype=torch.float),
            torch.tensor(data_input.y_tr, dtype=torch.float))
        trainloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)
        criterion = torch.nn.BCEWithLogitsLoss()
        model = SingleGateModel(params, gate)
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-7, weight_decay=1e-2)
        print("initializing LR finder")
        lr_finder = LRFinder(model, optimizer, criterion)
        lr_finder.range_test(trainloader, end_lr=1e4, num_iter=100)
        lr_finder.plot(ax=ax)
        print("LR History:", lr_finder.history)
    plt.savefig(os.path.join(params['save_dir'], 'lr_find.png'))
    print('Complete main loop took %.4f seconds' % (time.time() - start_time))
    return

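# A minimal follow-up sketch, assuming the LRFinder above is the one from the
# davidtvs pytorch-lr-finder package (range_test/plot/history/reset API): after
# range_test() the model and optimizer sit at the last tested learning rate, so
# reset() restores their initial state before retraining with a rate read off
# the plot. chosen_lr below is a hypothetical example value, not a recommendation.
def retrain_gate_after_lr_range_test(lr_finder, model, chosen_lr=1e-3):
    # restore the model/optimizer state saved by LRFinder before the range test
    lr_finder.reset()
    # rebuild the optimizer with the learning rate picked from the loss curve
    return torch.optim.Adam(model.parameters(), lr=chosen_lr, weight_decay=1e-2)
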
def cross_validate_accuracy_over_saved_results(path_to_results, stepsize,
                                               n_steps, nfolds=20,
                                               starting_fold=30):
    path_to_params = os.path.join(path_to_results, 'params.yaml')
    params = TransformParameterParser(path_to_params).parse_params()
    print(params)
    cur_params = deepcopy(params)
    # eventually uncomment this; leaving as-is for now to keep the same results as before for comparison
    set_random_seeds(params)
    data_input = DataInput(params['data_params'])

    te_accs = []
    pushed_gates_per_fold = []
    starting_gates_per_fold = []
    diffs_per_fold = []

    # advance the splitter so the first evaluated split matches starting_fold
    for fold in range(starting_fold):
        data_input.split_data()

    for fold in range(starting_fold, nfolds + starting_fold):
        print('Running fold %d' % fold)
        cur_params['save_dir'] = os.path.join(params['save_dir'], 'run%d' % fold)
        data_input.split_data()
        best_tr_acc, starting_gate, best_gate = \
            push_converged_boundaries_given_data_input_and_params(
                cur_params, data_input, stepsize, n_steps, path_to_params)
        model = DepthOneModel(
            [[['D1', best_gate[0], best_gate[1]],
              ['D2', best_gate[2], best_gate[3]]]],
            params['model_params'])
        fit_classifier_params(
            model, data_input,
            params['train_params']['learning_rate_classifier'])
        te_acc = compute_te_acc(model, data_input)
        print('te acc for fold %d is %.3f' % (fold, te_acc))
        te_accs.append(te_acc)
        pushed_gates_per_fold.append(best_gate)
        starting_gates_per_fold.append(starting_gate)
        diffs_per_fold.append(get_diff_between_gates(starting_gate, best_gate))
        print('Diff: ', get_diff_between_gates(starting_gate, best_gate))

    print('Te accs:', te_accs)
    print('Diffs per fold:', diffs_per_fold)
    with open(os.path.join(path_to_results,
              'expanded_boundaries_te_accs_per_fold.pkl'), 'wb') as f:
        pickle.dump(te_accs, f)
    with open(os.path.join(path_to_results,
              'expanded_boundaries_diffs_per_fold.pkl'), 'wb') as f:
        pickle.dump(diffs_per_fold, f)
    with open(os.path.join(path_to_results,
              'expanded_boundaries_best_pushed_gates_per_fold.pkl'), 'wb') as f:
        pickle.dump(pushed_gates_per_fold, f)

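# A small sketch (not in the original code) for consuming the pickles written
# above; it assumes only the two filenames used in
# cross_validate_accuracy_over_saved_results and that the accuracies are floats.
def summarize_expanded_boundary_results(path_to_results):
    with open(os.path.join(path_to_results,
              'expanded_boundaries_te_accs_per_fold.pkl'), 'rb') as f:
        te_accs = pickle.load(f)
    with open(os.path.join(path_to_results,
              'expanded_boundaries_diffs_per_fold.pkl'), 'rb') as f:
        diffs = pickle.load(f)
    te_accs = np.array(te_accs)
    print('Mean te acc %.3f (std %.3f) over %d folds'
          % (np.mean(te_accs), np.std(te_accs), len(te_accs)))
    print('Per-fold gate diffs:', diffs)
    return te_accs, diffs
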
def run_once_with_fixed_size(params, size, run, data_transformer):
    start_time = time.time()
    # set_random_seeds(params)
    # For some reason calling set_random_seeds here produces a different UMAP
    # embedding (likely a bug in the UMAP package in use), so the seed is set
    # in DataInput instead to get consistent splits.
    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])
    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)
    data_input = DataInput(params['data_params'])
    data_input.split_data(split_seed=params['random_seed'])
    data_input.embed_data(
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
    )
    # data_input.save_transformer(params['save_dir'])
    data_input.normalize_data()

    init_gate_tree = get_init_gate_in_disc_region(size)
    model = initialize_model(params['model_params'], init_gate_tree)
    # this line fixes the size
    model.fix_size_params(size)

    data_input.convert_all_data_to_tensors()
    trackers_per_step = []
    performance_tracker = run_train_model(model, params['train_params'], data_input)
    check_size_stayed_constant(model, size)
    make_and_save_plot_to_check_umap_stays_same(model, data_input, run, params)

    model_save_path = os.path.join(params['save_dir'], 'model.pkl')
    torch.save(model.state_dict(), model_save_path)
    tracker_save_path = os.path.join(params['save_dir'], 'tracker.pkl')
    with open(tracker_save_path, 'wb') as f:
        pickle.dump(performance_tracker, f)
    print('Complete main loop took %.4f seconds' % (time.time() - start_time))
    return model, performance_tracker, data_transformer

def load_saved_model_and_matching_data_input(path_to_params):
    def set_random_seeds(params):
        torch.manual_seed(params['random_seed'])
        np.random.seed(params['random_seed'])

    start_time = time.time()
    params = TransformParameterParser(path_to_params).parse_params()
    print(params)
    # eventually uncomment this; leaving as-is for now to keep the same results as before for comparison
    set_random_seeds(params)
    data_input = DataInput(params['data_params'])
    data_input.split_data()
    print('%d samples in the training data' % len(data_input.x_tr))
    with open(os.path.join(params['save_dir'], 'trackers.pkl'), 'rb') as f:
        trackers = pickle.load(f)
    with open(os.path.join(params['save_dir'], 'transformer.pkl'), 'rb') as f:
        umapper = pickle.load(f)
    # FOR DEBUGGING ONLY
    # params['transform_params']['cells_to_subsample'] = 10
    data_input.embed_data(
        umapper,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    data_input.normalize_data()
    data_input.convert_all_data_to_tensors()
    model = DepthOneModel([[['D1', 0, 0], ['D2', 0, 0]]], params['model_params'])
    model.load_state_dict(
        torch.load(os.path.join(params['save_dir'], 'model.pkl')))
    return params, model, data_input, umapper

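# Usage sketch (hypothetical path), relying only on the return signature of
# load_saved_model_and_matching_data_input and on compute_te_acc, which the
# cross-validation code here already calls with (model, data_input):
#
#     params, model, data_input, umapper = \
#         load_saved_model_and_matching_data_input('results/run0/params.yaml')
#     print('test accuracy: %.3f' % compute_te_acc(model, data_input))
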
def make_umap_plots_for_incorrect_and_correct_samples(
        results_path, plot_expanded_data=True, path_to_true_features=None,
        BALL=False):
    with open(os.path.join(results_path, 'configs.pkl'), 'rb') as f:
        params = pickle.load(f)
    with open(os.path.join(results_path, 'transformer.pkl'), 'rb') as f:
        umapper = pickle.load(f)
    sample_names_to_true_features = None
    if path_to_true_features:
        with open(path_to_true_features, 'rb') as f:
            sample_names_to_true_features = pickle.load(f)
    set_random_seeds(params)
    model = DepthOneModel([[['D1', 0, 0], ['D2', 0, 0]]], params['model_params'])
    model.load_state_dict(torch.load(os.path.join(results_path, 'model.pkl')))
    try:
        print(params['data_params']['use_presplit_data'])
    except KeyError:
        params['data_params']['use_presplit_data'] = False
    data_input = DataInput(params['data_params'])
    # splitting because the codebase currently requires a split
    data_input.split_data()
    print('embedding data')
    # for debugging only
    # params['transform_params']['cells_to_subsample'] = 2
    data_input.embed_data(
        umapper,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    data_input.normalize_data()
    data_input.convert_all_data_to_tensors()

    # gate expansion using kde
    if plot_expanded_data:
        print(model.get_gates()[0])
        kde_expander = KDEGateExpander(data_input.x_tr, model.get_gates()[0],
                                       sigma_thresh_factor=.5)
        kde_expander.expand_gates()
        kde_expander.collect_expanded_cells_per_sample()
        tr_expanded_data = kde_expander.expanded_data_per_sample
        te_expanded_data = kde_expander.get_expanded_data_new_samples(data_input.x_te)
    else:
        tr_expanded_data = None
        te_expanded_data = None

    output_tr = model(data_input.x_tr, data_input.y_tr)
    output_te = model(data_input.x_te, data_input.y_te)

    # sort training samples by predicted probability and split them into
    # correct/incorrect, true-positive/true-negative groups
    matching_tr = [
        ((output_tr['y_pred'].cpu().detach().numpy() >= .5)[i] * 1.0
         == data_input.y_tr[i])
        for i in range(len(data_input.y_tr))
    ]
    pos_probs_tr = np.array([prob.cpu().detach().numpy()
                             for prob in output_tr['y_pred']])
    sorted_idxs_tr = np.argsort(pos_probs_tr)
    correct_idxs_tr = [data_input.idxs_tr[i] for i in sorted_idxs_tr
                       if matching_tr[i]]
    correct_idxs_true_pos_tr = [
        idx for idx in correct_idxs_tr
        if data_input.y_tr[data_input.idxs_tr.index(idx)] == 1]
    correct_idxs_true_neg_tr = [
        idx for idx in correct_idxs_tr
        if data_input.y_tr[data_input.idxs_tr.index(idx)] == 0]
    incorrect_idxs_tr = [data_input.idxs_tr[i] for i in sorted_idxs_tr
                         if not matching_tr[i]]
    incorrect_idxs_true_pos_tr = [
        idx for idx in incorrect_idxs_tr
        if data_input.y_tr[data_input.idxs_tr.index(idx)] == 1]
    incorrect_idxs_true_neg_tr = [
        idx for idx in incorrect_idxs_tr
        if data_input.y_tr[data_input.idxs_tr.index(idx)] == 0]
    # fraction of training samples classified correctly
    print(len(correct_idxs_tr) / len(data_input.x_tr))

    # same bookkeeping for the test samples
    matching_te = [
        ((output_te['y_pred'].cpu().detach().numpy() >= .5)[i] * 1.0
         == data_input.y_te[i])
        for i in range(len(data_input.y_te))
    ]
    pos_probs_te = np.array([prob.cpu().detach().numpy()
                             for prob in output_te['y_pred']])
    sorted_idxs_te = np.argsort(pos_probs_te)
    correct_idxs_te = [data_input.idxs_te[i] for i in sorted_idxs_te
                       if matching_te[i]]
    correct_idxs_true_pos_te = [
        idx for idx in correct_idxs_te
        if data_input.y_te[data_input.idxs_te.index(idx)] == 1]
    correct_idxs_true_neg_te = [
        idx for idx in correct_idxs_te
        if data_input.y_te[data_input.idxs_te.index(idx)] == 0]
    incorrect_idxs_te = [data_input.idxs_te[i] for i in sorted_idxs_te
                         if not matching_te[i]]
    incorrect_idxs_true_pos_te = [
        idx for idx in incorrect_idxs_te
        if data_input.y_te[data_input.idxs_te.index(idx)] == 1]
    incorrect_idxs_true_neg_te = [
        idx for idx in incorrect_idxs_te
        if data_input.y_te[data_input.idxs_te.index(idx)] == 0]
    print('correct te idxs:', correct_idxs_te,
          'incorrect te idxs', incorrect_idxs_te)
    print(incorrect_idxs_true_neg_te)

    # pooled background cells (train + test) for the per-sample plots
    background_data_to_plot_neg = np.concatenate(
        [data for i, data in enumerate(data_input.x_tr) if data_input.y_tr[i] == 0])
    try:
        background_data_to_plot_neg = np.concatenate([
            background_data_to_plot_neg,
            np.concatenate([data for i, data in enumerate(data_input.x_te)
                            if data_input.y_te[i] == 0])
        ])
    except ValueError:
        # no negative samples in the test split
        pass
    background_data_to_plot_pos = np.concatenate(
        [data for i, data in enumerate(data_input.x_tr) if data_input.y_tr[i]])
    background_data_to_plot_pos = np.concatenate([
        background_data_to_plot_pos,
        np.concatenate([data for i, data in enumerate(data_input.x_te)
                        if data_input.y_te[i]])
    ])
    full_background_data_to_plot = np.concatenate(
        [background_data_to_plot_pos, background_data_to_plot_neg])

    # CHANGE SAVENAME IF YOU USE VAL DATA HERE
    plots_per_row_BALL = 9
    make_umap_plots_per_sample(
        model, data_input, incorrect_idxs_true_pos_tr,
        savename='true_pos_incorrect_dev_tr.png',
        plots_per_row=plots_per_row_BALL,
        background_data_to_plot=full_background_data_to_plot,
        expanded_data_per_sample=tr_expanded_data,
        sample_names_to_true_features=sample_names_to_true_features,
        BALL=BALL)
    make_umap_plots_per_sample(
        model, data_input, incorrect_idxs_true_neg_tr,
        savename='true_neg_incorrect_dev_tr.png',
        plots_per_row=plots_per_row_BALL,
        background_data_to_plot=full_background_data_to_plot,
        expanded_data_per_sample=tr_expanded_data,
        sample_names_to_true_features=sample_names_to_true_features,
        BALL=BALL)
    make_umap_plots_per_sample(
        model, data_input, correct_idxs_true_pos_tr,
        savename='true_pos_correct_dev_tr.png',
        plots_per_row=plots_per_row_BALL,
        background_data_to_plot=full_background_data_to_plot,
        expanded_data_per_sample=tr_expanded_data,
        sample_names_to_true_features=sample_names_to_true_features,
        BALL=BALL)
    make_umap_plots_per_sample(
        model, data_input, correct_idxs_true_neg_tr,
        savename='true_neg_correct_dev_tr.png',
        plots_per_row=plots_per_row_BALL,
        background_data_to_plot=full_background_data_to_plot,
        expanded_data_per_sample=tr_expanded_data,
        sample_names_to_true_features=sample_names_to_true_features,
        BALL=BALL)
    make_umap_plots_per_sample(
        model, data_input, incorrect_idxs_true_pos_te,
        savename='true_pos_incorrect_dev_te.png',
        plots_per_row=plots_per_row_BALL,
        background_data_to_plot=full_background_data_to_plot,
        expanded_data_per_sample=te_expanded_data,
        sample_names_to_true_features=sample_names_to_true_features,
        BALL=BALL)
    make_umap_plots_per_sample(
        model, data_input, incorrect_idxs_true_neg_te,
        savename='true_neg_incorrect_dev_te.png',
        plots_per_row=plots_per_row_BALL,
        background_data_to_plot=full_background_data_to_plot,
        expanded_data_per_sample=te_expanded_data,
        sample_names_to_true_features=sample_names_to_true_features,
        BALL=BALL)
    make_umap_plots_per_sample(
        model, data_input, correct_idxs_true_pos_te,
        savename='true_pos_correct_dev_te.png',
        plots_per_row=plots_per_row_BALL,
        background_data_to_plot=full_background_data_to_plot,
        expanded_data_per_sample=te_expanded_data,
        sample_names_to_true_features=sample_names_to_true_features,
        BALL=BALL)
    make_umap_plots_per_sample(
        model, data_input, correct_idxs_true_neg_te,
        savename='true_neg_correct_dev_te.png',
        plots_per_row=plots_per_row_BALL,
        background_data_to_plot=full_background_data_to_plot,
        expanded_data_per_sample=te_expanded_data,
        sample_names_to_true_features=sample_names_to_true_features,
        BALL=BALL)

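# Usage sketch (hypothetical results directory):
# make_umap_plots_for_incorrect_and_correct_samples only needs a results
# directory containing configs.pkl, transformer.pkl, and model.pkl, as loaded
# at the top of the function.
#
#     make_umap_plots_for_incorrect_and_correct_samples(
#         'results/run0',
#         plot_expanded_data=False,
#         path_to_true_features=None,
#     )
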
def cross_validate(path_to_params, n_runs, start_seed=0):
    start_time = time.time()
    params = TransformParameterParser(path_to_params).parse_params()
    print(params)
    check_consistency_of_params(params)
    # eventually uncomment this; leaving as-is for now to keep the same results as before for comparison
    set_random_seeds(params)
    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])
    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)
    data_input = DataInput(params['data_params'])
    te_accs = []
    tr_accs = []
    # advance the splitter to the correct split at the start
    for i in range(start_seed):
        data_input.split_data()
    for run in range(start_seed, n_runs):
        if not os.path.exists(os.path.join(params['save_dir'], 'run%d' % run)):
            os.makedirs(os.path.join(params['save_dir'], 'run%d' % run))
        savepath = os.path.join(params['save_dir'], 'run%d' % run)
        data_input.split_data()
        print(data_input.idxs_tr)
        data_transformer = DataTransformerFactory(
            params['transform_params'],
            params['random_seed']).manufacture_transformer()
        data_input.embed_data_and_fit_transformer(
            data_transformer,
            cells_to_subsample=params['transform_params']['cells_to_subsample'],
            num_cells_for_transformer=params['transform_params']['num_cells_for_transformer'],
            use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
        )
        data_input.save_transformer(savepath)
        data_input.normalize_data()
        unused_cluster_gate_inits = init_plot_and_save_gates(data_input, params)

        # everything below differs from the other main_UMAP
        data_input.convert_all_data_to_tensors()
        init_gate_tree, unused_cluster_gate_inits = get_next_gate_tree(
            unused_cluster_gate_inits, data_input, params, model=None)
        model = initialize_model(params['model_params'], [init_gate_tree])
        performance_tracker = run_train_model(model, params['train_params'], data_input)

        model_save_path = os.path.join(savepath, 'model.pkl')
        torch.save(model.state_dict(), model_save_path)
        tracker_save_path = os.path.join(savepath, 'tracker.pkl')
        with open(tracker_save_path, 'wb') as f:
            pickle.dump(performance_tracker, f)

        results_plotter = DataAndGatesPlotterDepthOne(
            model, np.concatenate(data_input.x_tr))
        results_plotter.plot_data_with_gates(
            np.array(
                np.concatenate([
                    data_input.y_tr[i]
                    * torch.ones([data_input.x_tr[i].shape[0], 1])
                    for i in range(len(data_input.x_tr))
                ])))
        plt.savefig(os.path.join(savepath, 'final_gates.png'))
        with open(os.path.join(savepath, 'configs.pkl'), 'wb') as f:
            pickle.dump(params, f)

        print('Complete main loop for run %d took %.4f seconds'
              % (run, time.time() - start_time))
        start_time = time.time()
        print('Accuracy tr %.3f, te %.3f'
              % (performance_tracker.metrics['tr_acc'][-1],
                 performance_tracker.metrics['te_acc'][-1]))
        te_accs.append(performance_tracker.metrics['te_acc'][-1])
        tr_accs.append(performance_tracker.metrics['tr_acc'][-1])

    tr_accs = np.array(tr_accs)
    te_accs = np.array(te_accs)
    print('Average tr acc: %.3f, te acc %.3f' % (np.mean(tr_accs), np.mean(te_accs)))
    print('Std dev tr acc: %.3f, te acc %.3f' % (np.std(tr_accs), np.std(te_accs)))

def push_converged_boundaries(path_to_params, stepsize, n_steps):
    start_time = time.time()
    params = TransformParameterParser(path_to_params).parse_params()
    print(params)
    # eventually uncomment this; leaving as-is for now to keep the same results as before for comparison
    set_random_seeds(params)
    data_input = DataInput(params['data_params'])
    data_input.split_data()
    print('%d samples in the training data' % len(data_input.x_tr))
    with open(os.path.join(params['save_dir'], 'trackers.pkl'), 'rb') as f:
        trackers = pickle.load(f)
    with open(os.path.join(params['save_dir'], 'transformer.pkl'), 'rb') as f:
        umapper = pickle.load(f)
    # FOR DEBUGGING ONLY
    # params['transform_params']['cells_to_subsample'] = 10
    data_input.embed_data(
        umapper,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    data_input.normalize_data()
    data_input.convert_all_data_to_tensors()
    model = DepthOneModel([[['D1', 0, 0], ['D2', 0, 0]]], params['model_params'])
    model.load_state_dict(
        torch.load(os.path.join(params['save_dir'], 'model.pkl')))

    init_acc = trackers[0].metrics['tr_acc'][-1]
    cur_best_acc = init_acc
    starting_gate = model.get_gates()[0]
    cur_gate = copy.deepcopy(starting_gate)
    cur_best_gate = copy.deepcopy(cur_gate)
    print('Starting gate:', starting_gate)
    counter = 0
    # grid search: push each of the four gate boundaries outward by multiples of stepsize
    for left_step in range(n_steps):
        cur_gate[0] = starting_gate[0] - left_step * stepsize
        for right_step in range(n_steps):
            cur_gate[1] = starting_gate[1] + right_step * stepsize
            for down_step in range(n_steps):
                cur_gate[2] = starting_gate[2] - down_step * stepsize
                for up_step in range(n_steps):
                    cur_gate[3] = starting_gate[3] + up_step * stepsize
                    model = DepthOneModel(
                        [[['D1', cur_gate[0], cur_gate[1]],
                          ['D2', cur_gate[2], cur_gate[3]]]],
                        params['model_params'])
                    fit_classifier_params(
                        model, data_input,
                        params['train_params']['learning_rate_classifier'])
                    cur_acc = compute_tr_acc(model, data_input)
                    counter += 1
                    print(counter)
                    print(cur_gate)
                    print(cur_acc)
                    if cur_acc > cur_best_acc:
                        cur_best_acc = cur_acc
                        cur_best_gate = copy.deepcopy(cur_gate)
    print('Final acc %.3f, Initial acc %.3f' % (cur_best_acc, init_acc))
    print('Init/final gates', starting_gate, cur_best_gate)

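# A hedged sketch (this is not the codebase's get_diff_between_gates): it just
# reports how far each of the four boundaries moved, assuming the gate layout
# used above, i.e. [D1_low, D1_high, D2_low, D2_high].
def report_boundary_movement(starting_gate, pushed_gate):
    names = ['D1 low', 'D1 high', 'D2 low', 'D2 high']
    for name, before, after in zip(names, starting_gate, pushed_gate):
        print('%s moved by %.4f (from %.4f to %.4f)'
              % (name, after - before, before, after))
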
def main(path_to_params):
    start_time = time.time()
    params = TransformParameterParser(path_to_params).parse_params()
    print(params)
    check_consistency_of_params(params)
    # eventually uncomment this; leaving as-is for now to keep the same results as before for comparison
    set_random_seeds(params)
    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])
    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)
    data_input = DataInput(params['data_params'])
    data_input.split_data()
    print('%d samples in the training data' % len(data_input.x_tr))
    data_transformer = DataTransformerFactory(
        params['transform_params'],
        params['random_seed']).manufacture_transformer()
    data_input.embed_data_and_fit_transformer(
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        num_cells_for_transformer=params['transform_params']['num_cells_for_transformer'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    # can't pickle opentsne objects
    if not params['transform_params']['transform_type'] == 'tsne':
        data_input.save_transformer(params['save_dir'])
    data_input.normalize_data()
    potential_gates = get_all_potential_gates(data_input, params)
    data_input.convert_all_data_to_tensors()
    model = initialize_model(params['model_params'], potential_gates)
    if params['train_params']['fix_gates']:
        model.freeze_gate_params()
    tracker = run_train_model(model, params['train_params'], data_input)

    model_save_path = os.path.join(params['save_dir'], 'model.pkl')
    torch.save(model.state_dict(), model_save_path)
    tracker_save_path = os.path.join(params['save_dir'], 'tracker.pkl')
    with open(tracker_save_path, 'wb') as f:
        pickle.dump(tracker, f)

    if params['plot_umap_reflection']:
        # reflection is about x=.5 since the data is already in umap space here
        reflected_data = []
        for data in data_input.x_tr:
            data[:, 0] = 1 - data[:, 0]
            reflected_data.append(data)
        data_input.x_tr = reflected_data
        gate_tree = model.get_gate_tree()
        reflected_gates = []
        for gate in gate_tree:
            print(gate)
            # low/high switch order since we reflect over x=.5
            low_reflected = 1 - gate[0][2]
            high_reflected = 1 - gate[0][1]
            gate[0][1] = low_reflected
            gate[0][2] = high_reflected
            print(gate)
            reflected_gates.append(gate)
        model.init_nodes(reflected_gates)
        print(model.init_nodes)
        print(model.get_gates())

    results_plotter = DataAndGatesPlotterDepthOne(
        model, np.concatenate(data_input.x_tr))
    if params['transform_params']['embed_dim'] == 2:
        results_plotter.plot_data_with_gates(
            np.array(
                np.concatenate([
                    data_input.y_tr[i]
                    * torch.ones([data_input.x_tr[i].shape[0], 1])
                    for i in range(len(data_input.x_tr))
                ])))
        plt.savefig(os.path.join(params['save_dir'], 'final_gates.png'))
    else:
        fig_pos, ax_pos, fig_neg, ax_neg = results_plotter.plot_data_with_gates(
            np.array(
                np.concatenate([
                    data_input.y_tr[i]
                    * torch.ones([data_input.x_tr[i].shape[0], 1])
                    for i in range(len(data_input.x_tr))
                ])))
        with open(os.path.join(params['save_dir'], 'final_gates_pos_3d.pkl'), 'wb') as f:
            pickle.dump(fig_pos, f)
        with open(os.path.join(params['save_dir'], 'final_gates_neg_3d.pkl'), 'wb') as f:
            pickle.dump(fig_neg, f)

    with open(os.path.join(params['save_dir'], 'configs.pkl'), 'wb') as f:
        pickle.dump(params, f)
    print('Learned weights:', model.linear.weight)
    print('Complete main loop took %.4f seconds' % (time.time() - start_time))

def single_run_single_gate(params):
    start_time = time.time()
    # eventually uncomment this; leaving as-is for now to keep the same results as before for comparison
    # set_random_seeds(params)
    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])
    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)
    data_input = DataInput(params['data_params'])
    data_input.split_data(split_seed=params['random_seed'])
    data_transformer = DataTransformerFactory(
        params['transform_params'],
        params['random_seed']).manufacture_transformer()
    data_input.embed_data_and_fit_transformer(
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        num_cells_for_transformer=params['transform_params']['num_cells_for_transformer']
    )
    data_input.save_transformer(params['save_dir'])
    data_input.normalize_data()
    unused_cluster_gate_inits = init_plot_and_save_gates(data_input, params)

    # everything below differs from the other main_UMAP
    data_input.convert_all_data_to_tensors()
    init_gate_tree, unused_cluster_gate_inits = get_next_gate_tree(
        unused_cluster_gate_inits, data_input, params, model=None)
    model = initialize_model(params['model_params'], [init_gate_tree])
    performance_tracker = run_train_model(model, params['train_params'], data_input)

    model_save_path = os.path.join(params['save_dir'], 'model.pkl')
    torch.save(model.state_dict(), model_save_path)
    trackers_save_path = os.path.join(params['save_dir'], 'last_CV_rounds_tracker.pkl')
    with open(trackers_save_path, 'wb') as f:
        pickle.dump(performance_tracker, f)

    results_plotter = DataAndGatesPlotterDepthOne(
        model, np.concatenate(data_input.x_tr))
    results_plotter.plot_data_with_gates(
        np.array(
            np.concatenate([
                data_input.y_tr[i]
                * torch.ones([data_input.x_tr[i].shape[0], 1])
                for i in range(len(data_input.x_tr))
            ])))
    plt.savefig(os.path.join(params['save_dir'], 'final_gates.png'))
    with open(os.path.join(params['save_dir'], 'configs.pkl'), 'wb') as f:
        pickle.dump(params, f)
    print('Complete main loop took %.4f seconds' % (time.time() - start_time))
    return performance_tracker, model

def main(params):
    start_time = time.time()
    # eventually uncomment this; leaving as-is for now to keep the same results as before for comparison
    set_random_seeds(params)
    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])
    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)
    data_input = DataInput(params['data_params'])
    data_input.split_data()
    print('%d samples in the training data' % len(data_input.x_tr))

    # force identity for the first transform
    data_transformer = DataTransformerFactory(
        {'transform_type': 'identity'},
        params['random_seed']).manufacture_transformer()
    data_input.embed_data_and_fit_transformer(
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        num_cells_for_transformer=params['transform_params']['num_cells_for_transformer'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    # can't pickle opentsne objects
    if not params['transform_params']['transform_type'] == 'tsne':
        data_input.save_transformer(params['save_dir'])
    data_input.normalize_data()

    # gates aren't plotted because we're in n dimensions
    unused_cluster_gate_inits = init_gates(data_input, params)
    data_input.convert_all_data_to_tensors()
    init_gate_tree, unused_cluster_gate_inits = get_next_gate_tree(
        unused_cluster_gate_inits, data_input, params, model=None)
    model = initialize_model(params['model_params'], [init_gate_tree])
    trackers_per_round = []
    num_gates_left = len(unused_cluster_gate_inits)
    for i in range(num_gates_left + 1):
        performance_tracker = run_train_model(model, params['train_params'], data_input)
        trackers_per_round.append(performance_tracker.get_named_tuple_rep())
        if i == params['train_params']['num_gates_to_learn'] - 1:
            break
        if not i == num_gates_left:
            next_gate_tree, unused_cluster_gate_inits = get_next_gate_tree(
                unused_cluster_gate_inits, data_input, params, model=model)
            model.add_node(next_gate_tree)

    model_save_path = os.path.join(params['save_dir'], 'model.pkl')
    torch.save(model.state_dict(), model_save_path)
    trackers_save_path = os.path.join(params['save_dir'], 'trackers.pkl')
    with open(trackers_save_path, 'wb') as f:
        pickle.dump(trackers_per_round, f)

    if params['plot_umap_reflection']:
        # reflection is about x=.5 since the data is already in umap space here
        reflected_data = []
        for data in data_input.x_tr:
            data[:, 0] = 1 - data[:, 0]
            reflected_data.append(data)
        data_input.x_tr = reflected_data
        gate_tree = model.get_gate_tree()
        reflected_gates = []
        for gate in gate_tree:
            print(gate)
            # low/high switch order since we reflect over x=.5
            low_reflected = 1 - gate[0][2]
            high_reflected = 1 - gate[0][1]
            gate[0][1] = low_reflected
            gate[0][2] = high_reflected
            print(gate)
            reflected_gates.append(gate)
        model.init_nodes(reflected_gates)
        print(model.init_nodes)
        print(model.get_gates())

    # refit the transformer on the raw (untransformed) data for plotting in feature space
    data_transformer = DataTransformerFactory(
        params['transform_params'],
        params['random_seed']).manufacture_transformer()
    data_input.convert_all_data_to_numpy()
    data_input.x_tr = data_input.x_tr_raw
    data_input.x_te = data_input.x_te_raw
    old_scale = data_input.scale
    old_offset = data_input.offset
    print("fitting projection")
    data_input.embed_data_and_fit_transformer(
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        num_cells_for_transformer=params['transform_params']['num_cells_for_transformer'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )

    results_plotter = MultidimDataAndGatesPlotter(
        model, np.concatenate(data_input.x_tr),
        np.concatenate(data_input.untransformed_matched_x_tr),
        old_scale, old_offset, data_input.transformer)
    results_plotter.plot_in_feature_space(
        np.array(
            np.concatenate([
                data_input.y_tr[i]
                * torch.ones([data_input.x_tr[i].shape[0], 1])
                for i in range(len(data_input.x_tr))
            ])))
    plt.savefig(os.path.join(params['save_dir'], 'feature_results.png'))

    if params['transform_params']['embed_dim'] == 2:
        results_plotter.plot_data_with_gates(
            np.array(
                np.concatenate([
                    data_input.y_tr[i]
                    * torch.ones([data_input.x_tr[i].shape[0], 1])
                    for i in range(len(data_input.x_tr))
                ])))
        plt.savefig(os.path.join(params['save_dir'], 'final_gates.png'))
    else:
        fig_pos, ax_pos, fig_neg, ax_neg = results_plotter.plot_data_with_gates(
            np.array(
                np.concatenate([
                    data_input.y_tr[i]
                    * torch.ones([data_input.x_tr[i].shape[0], 1])
                    for i in range(len(data_input.x_tr))
                ])))
        with open(os.path.join(params['save_dir'], 'final_gates_pos_3d.pkl'), 'wb') as f:
            pickle.dump(fig_pos, f)
        with open(os.path.join(params['save_dir'], 'final_gates_neg_3d.pkl'), 'wb') as f:
            pickle.dump(fig_neg, f)

    with open(os.path.join(params['save_dir'], 'configs.pkl'), 'wb') as f:
        pickle.dump(params, f)
    print('Complete main loop took %.4f seconds' % (time.time() - start_time))
    return trackers_per_round[-1]

def main(path_to_params):
    start_time = time.time()
    params = TransformParameterParser(path_to_params).parse_params()
    print(params)
    check_consistency_of_params(params)
    # eventually uncomment this; leaving as-is for now to keep the same results as before for comparison
    set_random_seeds(params)
    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])
    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)
    data_input = DataInput(params['data_params'])
    data_input.split_data()
    data_transformer = DataTransformerFactory(
        params['transform_params'],
        params['random_seed']).manufacture_transformer()
    data_input.embed_data_and_fit_transformer(
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        num_cells_for_transformer=params['transform_params']['num_cells_for_transformer'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    data_input.save_transformer(params['save_dir'])
    data_input.normalize_data()
    unused_cluster_gate_inits = init_plot_and_save_gates(data_input, params)
    data_input.convert_all_data_to_tensors()
    init_gate_tree, unused_cluster_gate_inits = get_next_gate_tree(
        unused_cluster_gate_inits, data_input, params, model=None)
    model1 = initialize_model(params['model_params'], [init_gate_tree])
    performance_tracker1 = run_train_model(model1, params['train_params'], data_input)
    model1_save_path = os.path.join(params['save_dir'], 'model1.pkl')
    torch.save(model1.state_dict(), model1_save_path)
    tracker1_save_path = os.path.join(params['save_dir'], 'tracker1.pkl')
    with open(tracker1_save_path, 'wb') as f:
        pickle.dump(performance_tracker1, f)

    # now select the data inside the learned model1 gate and re-run umap
    data_input.filter_data_inside_first_model_gate(model1)
    unused_cluster_gate_inits = init_plot_and_save_gates(data_input, params)
    data_transformer = DataTransformerFactory(
        params['transform_params'],
        params['random_seed']).manufacture_transformer()
    data_input.embed_data_and_fit_transformer(
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        num_cells_for_transformer=params['transform_params']['num_cells_for_transformer'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    data_input.save_transformer(params['save_dir'])
    data_input.convert_all_data_to_tensors()
    init_gate_tree, _ = get_next_gate_tree(unused_cluster_gate_inits,
                                           data_input, params, model=None)
    model2 = initialize_model(params['model_params'], [init_gate_tree])
    performance_tracker2 = run_train_model(model2, params['train_params'], data_input)
    model2_save_path = os.path.join(params['save_dir'], 'model2.pkl')
    torch.save(model2.state_dict(), model2_save_path)
    tracker2_save_path = os.path.join(params['save_dir'], 'tracker2.pkl')
    with open(tracker2_save_path, 'wb') as f:
        pickle.dump(performance_tracker2, f)

    results_plotter = DataAndGatesPlotterDepthOne(
        model2, np.concatenate(data_input.x_tr))
    results_plotter.plot_data_with_gates(
        np.array(
            np.concatenate([
                data_input.y_tr[i]
                * torch.ones([data_input.x_tr[i].shape[0], 1])
                for i in range(len(data_input.x_tr))
            ])))
    plt.savefig(os.path.join(params['save_dir'], 'final_gates.png'))
    with open(os.path.join(params['save_dir'], 'configs.pkl'), 'wb') as f:
        pickle.dump(params, f)
    print('Complete main loop took %.4f seconds' % (time.time() - start_time))

def main(path_to_params):
    start_time = time.time()
    params = TransformParameterParser(path_to_params).parse_params()
    print(params)
    check_consistency_of_params(params)
    set_random_seeds(params)
    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])
    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)
    data_input = DataInput(params['data_params'])
    data_input.split_data()
    data_transformer = DataTransformerFactory(
        params['transform_params'],
        params['random_seed']).manufacture_transformer()
    data_input.embed_data_and_fit_transformer(
        data_transformer,
        params['transform_params']['cells_to_subsample'],
        params['transform_params']['num_cells_for_transformer']
    )
    data_input.save_transformer(params['save_dir'])
    data_input.normalize_data()

    # everything below differs from the other main_UMAP
    multi_gate_initializer = MultipleGateInitializerHeuristic(
        data_input, params['model_params']['node_type'],
        params['gate_init_multi_heuristic_params'])
    init_gate_tree = [multi_gate_initializer.init_next_gate()]
    model = initialize_model(params['model_params'], init_gate_tree)
    data_input.prepare_data_for_training()

    trackers_per_step = []
    num_gates = params['gate_init_multi_heuristic_params']['num_gates']
    for i in range(num_gates):
        performance_tracker = run_train_model(model, params['train_params'], data_input)
        multi_gate_initializer.gates = model.get_gates()
        if not (i == num_gates - 1):
            print(model.get_gates())
            next_gate = multi_gate_initializer.init_next_gate()
            if next_gate is None:
                print('There are no non-overlapping initializations left to try!')
                break
            model.add_node(next_gate)

    model_save_path = os.path.join(params['save_dir'], 'model.pkl')
    torch.save(model.state_dict(), model_save_path)
    tracker_save_path = os.path.join(params['save_dir'], 'tracker.pkl')
    with open(tracker_save_path, 'wb') as f:
        pickle.dump(performance_tracker, f)

    results_plotter = DataAndGatesPlotterDepthOne(
        model, np.concatenate(data_input.x_tr))
    results_plotter.plot_data_with_gates(
        np.array(
            np.concatenate([
                data_input.y_tr[i]
                * torch.ones([data_input.x_tr[i].shape[0], 1])
                for i in range(len(data_input.x_tr))
            ])))
    plt.savefig(os.path.join(params['save_dir'], 'final_gates.png'))
    print('Complete main loop took %.4f seconds' % (time.time() - start_time))

def main(path_to_params):
    params = TransformParameterParser(path_to_params).parse_params()
    start_time = time.time()
    print(params)
    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])
    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)
    data_input = DataInput(params['data_params'])
    data_input.split_data()
    data_transformer = DataTransformerFactory(
        params['transform_params']).manufacture_transformer()
    # cells_to_subsample should change to a transformer param instead
    data_input.embed_data(
        data_transformer,
        params['transform_params']['cells_to_subsample'],
        params['transform_params']['num_cells_for_transformer']
    )
    data_input.save_transformer(params['save_dir'])
    data_input.normalize_data()
    init_gate_tree = init_plot_and_save_gates(data_input, params)
    model = initialize_model(params['model_params'], init_gate_tree)
    data_input.prepare_data_for_training()
    performance_tracker = run_train_model(model, params['train_params'], data_input)
    model_save_path = os.path.join(params['save_dir'], 'model.pkl')
    torch.save(model.state_dict(), model_save_path)
    tracker_save_path = os.path.join(params['save_dir'], 'tracker.pkl')
    with open(tracker_save_path, 'wb') as f:
        pickle.dump(performance_tracker, f)
    results_plotter = DataAndGatesPlotterDepthOne(
        model, np.concatenate(data_input.x_tr))
    results_plotter.plot_data_with_gates(
        np.array(
            np.concatenate([
                data_input.y_tr[i]
                * torch.ones([data_input.x_tr[i].shape[0], 1])
                for i in range(len(data_input.x_tr))
            ])))
    plt.savefig(os.path.join(params['save_dir'], 'final_gates.png'))
    print('Complete main loop took %.4f seconds' % (time.time() - start_time))

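# A minimal command-line entry point sketch; the original scripts invoke their
# main() functions elsewhere, so this argparse wrapper is an assumption about
# how one of the path_to_params mains might be launched.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('path_to_params',
                        help='path to a YAML config parsed by TransformParameterParser')
    args = parser.parse_args()
    main(args.path_to_params)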