import copy
import os
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import torch

# Project-local names used below (DataInput, TransformParameterParser,
# DataTransformerFactory, DepthOneModel, KDEGateExpander,
# DataAndGatesPlotterDepthOne, initialize_model, run_train_model,
# init_plot_and_save_gates, get_init_gate_in_disc_region, set_random_seeds,
# fit_classifier_params, compute_tr_acc, check_size_stayed_constant,
# make_and_save_plot_to_check_umap_stays_same, make_umap_plots_per_sample)
# are assumed to be imported from elsewhere in this codebase.


def init_data_input(params, transformer_path):
    data_input = DataInput(params['data_params'])
    data_input.split_data()
    with open(transformer_path, 'rb') as f:
        data_transformer = pickle.load(f)
    print(data_transformer)
    data_input.embed_data(
        data_transformer,
        params['transform_params']['cells_to_subsample'],
        params['transform_params']['num_cells_for_transformer']
    )
    data_input.normalize_data()
    data_input.prepare_data_for_training()
    return data_input
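
# Hedged usage sketch (not part of the original file): shows how
# init_data_input pairs a parsed params file with a transformer pickled by an
# earlier run. The YAML path is a hypothetical placeholder.
def _example_init_data_input():
    params = TransformParameterParser('path/to/params.yaml').parse_params()
    transformer_path = os.path.join(params['save_dir'], 'transformer.pkl')
    return init_data_input(params, transformer_path)
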
def main(path_to_params):
    params = TransformParameterParser(path_to_params).parse_params()
    start_time = time.time()
    print(params)

    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])
    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)

    data_input = DataInput(params['data_params'])
    data_input.split_data()

    data_transformer = DataTransformerFactory(
        params['transform_params']).manufacture_transformer()

    # cells_to_subsample should become a transformer param instead
    data_input.embed_data(
        data_transformer,
        params['transform_params']['cells_to_subsample'],
        params['transform_params']['num_cells_for_transformer']
    )
    data_input.save_transformer(params['save_dir'])
    data_input.normalize_data()

    init_gate_tree = init_plot_and_save_gates(data_input, params)
    model = initialize_model(params['model_params'], init_gate_tree)

    data_input.prepare_data_for_training()
    performance_tracker = run_train_model(
        model, params['train_params'], data_input)

    model_save_path = os.path.join(params['save_dir'], 'model.pkl')
    torch.save(model.state_dict(), model_save_path)

    tracker_save_path = os.path.join(params['save_dir'], 'tracker.pkl')
    with open(tracker_save_path, 'wb') as f:
        pickle.dump(performance_tracker, f)

    results_plotter = DataAndGatesPlotterDepthOne(
        model, np.concatenate(data_input.x_tr))
    # fig, axes = plt.subplots(params['gate_init_params']['n_clusters'],
    #     figsize=(1 * params['gate_init_params']['n_clusters'],
    #              3 * params['gate_init_params']['n_clusters']))
    results_plotter.plot_data_with_gates(
        np.array(np.concatenate([
            data_input.y_tr[i] * torch.ones([data_input.x_tr[i].shape[0], 1])
            for i in range(len(data_input.x_tr))
        ])))
    plt.savefig(os.path.join(params['save_dir'], 'final_gates.png'))

    print('Complete main loop took %.4f seconds' % (time.time() - start_time))
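
# Hedged invocation sketch; the config path is a hypothetical placeholder.
# main() leaves params.pkl, the fitted transformer, model.pkl, tracker.pkl,
# and final_gates.png in params['save_dir']:
#
#     main('path/to/params.yaml')
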
def load_and_prepare_data_input(params):
    data_input = DataInput(params['data_params'])
    data_input.split_data()
    print('%d samples in the training data' % len(data_input.x_tr))

    with open(os.path.join(params['save_dir'], 'transformer.pkl'), 'rb') as f:
        data_transformer = pickle.load(f)

    # for debugging:
    # params['transform_params']['cells_to_subsample'] = 2
    data_input.embed_data(
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    data_input.normalize_data()
    data_input.convert_all_data_to_tensors()
    return data_input
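
# Hedged usage sketch: load_and_prepare_data_input expects params['save_dir']
# to already contain transformer.pkl from a previous run; the path below is a
# hypothetical placeholder.
#
#     params = TransformParameterParser('path/to/params.yaml').parse_params()
#     data_input = load_and_prepare_data_input(params)
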
def run_once_with_fixed_size(params, size, run, data_transformer):
    start_time = time.time()
    # set_random_seeds(params) is deliberately not called here: for some reason
    # it produces a different UMAP embedding (likely a bug in the UMAP package
    # in use), so the seed is instead set in DataInput to get consistent splits.
    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])
    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)

    data_input = DataInput(params['data_params'])
    data_input.split_data(split_seed=params['random_seed'])
    data_input.embed_data(
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
    )
    # data_input.save_transformer(params['save_dir'])
    data_input.normalize_data()

    init_gate_tree = get_init_gate_in_disc_region(size)
    model = initialize_model(params['model_params'], init_gate_tree)
    # fix the size parameters so training cannot change them
    model.fix_size_params(size)

    data_input.convert_all_data_to_tensors()
    performance_tracker = run_train_model(
        model, params['train_params'], data_input)
    check_size_stayed_constant(model, size)
    make_and_save_plot_to_check_umap_stays_same(model, data_input, run, params)

    model_save_path = os.path.join(params['save_dir'], 'model.pkl')
    torch.save(model.state_dict(), model_save_path)

    tracker_save_path = os.path.join(params['save_dir'], 'tracker.pkl')
    with open(tracker_save_path, 'wb') as f:
        pickle.dump(performance_tracker, f)

    print('Complete main loop took %.4f seconds' % (time.time() - start_time))
    return model, performance_tracker, data_transformer
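
# Hedged sketch of a sweep over fixed gate sizes (an assumption about how
# callers use run_once_with_fixed_size; the sizes and run count are
# hypothetical). The same pre-fitted data_transformer is reused so every run
# shares one embedding.
def _example_size_sweep(params, data_transformer, sizes=(0.1, 0.25, 0.5),
                        n_runs=3):
    results = []
    for run in range(n_runs):
        for size in sizes:
            results.append(
                run_once_with_fixed_size(params, size, run, data_transformer))
    return results
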
def load_saved_model_and_matching_data_input(path_to_params):
    def set_random_seeds(params):
        torch.manual_seed(params['random_seed'])
        np.random.seed(params['random_seed'])

    params = TransformParameterParser(path_to_params).parse_params()
    print(params)

    # eventually uncomment this; leaving as-is in order to keep the same
    # results as before to compare.
    set_random_seeds(params)

    data_input = DataInput(params['data_params'])
    data_input.split_data()
    print('%d samples in the training data' % len(data_input.x_tr))

    with open(os.path.join(params['save_dir'], 'trackers.pkl'), 'rb') as f:
        trackers = pickle.load(f)
    with open(os.path.join(params['save_dir'], 'transformer.pkl'), 'rb') as f:
        umapper = pickle.load(f)

    # FOR DEBUGGING ONLY:
    # params['transform_params']['cells_to_subsample'] = 10
    data_input.embed_data(
        umapper,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    data_input.normalize_data()
    data_input.convert_all_data_to_tensors()

    model = DepthOneModel([[['D1', 0, 0], ['D2', 0, 0]]],
                          params['model_params'])
    model.load_state_dict(
        torch.load(os.path.join(params['save_dir'], 'model.pkl')))
    return params, model, data_input, umapper
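
# Hedged usage sketch; the path is a hypothetical placeholder. The returned
# objects mirror what push_converged_boundaries below reloads by hand:
#
#     params, model, data_input, umapper = \
#         load_saved_model_and_matching_data_input('path/to/params.yaml')
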
def make_umap_plots_for_incorrect_and_correct_samples(
        results_path, plot_expanded_data=True,
        path_to_true_features=None, BALL=False):
    with open(os.path.join(results_path, 'configs.pkl'), 'rb') as f:
        params = pickle.load(f)
    with open(os.path.join(results_path, 'transformer.pkl'), 'rb') as f:
        umapper = pickle.load(f)

    sample_names_to_true_features = None
    if path_to_true_features:
        with open(path_to_true_features, 'rb') as f:
            sample_names_to_true_features = pickle.load(f)

    set_random_seeds(params)

    model = DepthOneModel([[['D1', 0, 0], ['D2', 0, 0]]],
                          params['model_params'])
    model.load_state_dict(torch.load(os.path.join(results_path, 'model.pkl')))

    # older saved configs may lack this key
    params['data_params'].setdefault('use_presplit_data', False)

    data_input = DataInput(params['data_params'])
    # splitting because the codebase currently requires a split
    data_input.split_data()

    print('embedding data')
    # for debugging only:
    # params['transform_params']['cells_to_subsample'] = 2
    data_input.embed_data(
        umapper,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    data_input.normalize_data()
    data_input.convert_all_data_to_tensors()

    # gate expansion using KDE
    if plot_expanded_data:
        print(model.get_gates()[0])
        kde_expander = KDEGateExpander(
            data_input.x_tr, model.get_gates()[0], sigma_thresh_factor=.5)
        kde_expander.expand_gates()
        kde_expander.collect_expanded_cells_per_sample()
        tr_expanded_data = kde_expander.expanded_data_per_sample
        te_expanded_data = kde_expander.get_expanded_data_new_samples(
            data_input.x_te)
    else:
        tr_expanded_data = None
        te_expanded_data = None

    output_tr = model(data_input.x_tr, data_input.y_tr)
    output_te = model(data_input.x_te, data_input.y_te)

    # sort samples by predicted positive probability, then split into
    # correctly and incorrectly classified true-positive/true-negative groups
    matching_tr = [
        (output_tr['y_pred'].cpu().detach().numpy() >= .5)[i] * 1.0
        == data_input.y_tr[i]
        for i in range(len(data_input.y_tr))
    ]
    pos_probs_tr = np.array(
        [prob.cpu().detach().numpy() for prob in output_tr['y_pred']])
    sorted_idxs_tr = np.argsort(pos_probs_tr)

    correct_idxs_tr = [
        data_input.idxs_tr[i] for i in sorted_idxs_tr if matching_tr[i]]
    correct_idxs_true_pos_tr = [
        idx for idx in correct_idxs_tr
        if data_input.y_tr[data_input.idxs_tr.index(idx)] == 1]
    correct_idxs_true_neg_tr = [
        idx for idx in correct_idxs_tr
        if data_input.y_tr[data_input.idxs_tr.index(idx)] == 0]
    incorrect_idxs_tr = [
        data_input.idxs_tr[i] for i in sorted_idxs_tr if not matching_tr[i]]
    incorrect_idxs_true_pos_tr = [
        idx for idx in incorrect_idxs_tr
        if data_input.y_tr[data_input.idxs_tr.index(idx)] == 1]
    incorrect_idxs_true_neg_tr = [
        idx for idx in incorrect_idxs_tr
        if data_input.y_tr[data_input.idxs_tr.index(idx)] == 0]
    # fraction of training samples classified correctly
    print(len(correct_idxs_tr) / len(data_input.x_tr))

    matching_te = [
        (output_te['y_pred'].cpu().detach().numpy() >= .5)[i] * 1.0
        == data_input.y_te[i]
        for i in range(len(data_input.y_te))
    ]
    pos_probs_te = np.array(
        [prob.cpu().detach().numpy() for prob in output_te['y_pred']])
    sorted_idxs_te = np.argsort(pos_probs_te)

    correct_idxs_te = [
        data_input.idxs_te[i] for i in sorted_idxs_te if matching_te[i]]
    correct_idxs_true_pos_te = [
        idx for idx in correct_idxs_te
        if data_input.y_te[data_input.idxs_te.index(idx)] == 1]
    correct_idxs_true_neg_te = [
        idx for idx in correct_idxs_te
        if data_input.y_te[data_input.idxs_te.index(idx)] == 0]
    incorrect_idxs_te = [
        data_input.idxs_te[i] for i in sorted_idxs_te if not matching_te[i]]
    incorrect_idxs_true_pos_te = [
        idx for idx in incorrect_idxs_te
        if data_input.y_te[data_input.idxs_te.index(idx)] == 1]
    incorrect_idxs_true_neg_te = [
        idx for idx in incorrect_idxs_te
        if data_input.y_te[data_input.idxs_te.index(idx)] == 0]

    print('correct te idxs:', correct_idxs_te,
          'incorrect te idxs', incorrect_idxs_te)
    print(incorrect_idxs_true_neg_te)

    background_data_to_plot_neg = np.concatenate(
        [data for i, data in enumerate(data_input.x_tr)
         if data_input.y_tr[i] == 0])
    try:
        background_data_to_plot_neg = np.concatenate([
            background_data_to_plot_neg,
            np.concatenate([data for i, data in enumerate(data_input.x_te)
                            if data_input.y_te[i] == 0])
        ])
    except ValueError:
        # the test split may contain no negative samples
        pass
    background_data_to_plot_pos = np.concatenate(
        [data for i, data in enumerate(data_input.x_tr) if data_input.y_tr[i]])
    background_data_to_plot_pos = np.concatenate([
        background_data_to_plot_pos,
        np.concatenate([data for i, data in enumerate(data_input.x_te)
                        if data_input.y_te[i]])
    ])
    full_background_data_to_plot = np.concatenate(
        [background_data_to_plot_pos, background_data_to_plot_neg])

    ### CHANGE SAVENAME IF YOU USE VAL DATA HERE
    plots_per_row_BALL = 9
    plot_configs = [
        (incorrect_idxs_true_pos_tr, 'true_pos_incorrect_dev_tr.png', tr_expanded_data),
        (incorrect_idxs_true_neg_tr, 'true_neg_incorrect_dev_tr.png', tr_expanded_data),
        (correct_idxs_true_pos_tr, 'true_pos_correct_dev_tr.png', tr_expanded_data),
        (correct_idxs_true_neg_tr, 'true_neg_correct_dev_tr.png', tr_expanded_data),
        (incorrect_idxs_true_pos_te, 'true_pos_incorrect_dev_te.png', te_expanded_data),
        (incorrect_idxs_true_neg_te, 'true_neg_incorrect_dev_te.png', te_expanded_data),
        (correct_idxs_true_pos_te, 'true_pos_correct_dev_te.png', te_expanded_data),
        (correct_idxs_true_neg_te, 'true_neg_correct_dev_te.png', te_expanded_data),
    ]
    for idxs, savename, expanded_data in plot_configs:
        make_umap_plots_per_sample(
            model, data_input, idxs,
            savename=savename,
            plots_per_row=plots_per_row_BALL,
            background_data_to_plot=full_background_data_to_plot,
            expanded_data_per_sample=expanded_data,
            sample_names_to_true_features=sample_names_to_true_features,
            BALL=BALL)
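
# Hedged invocation sketch; results_path must contain configs.pkl,
# transformer.pkl, and model.pkl from a finished run. The directory name is a
# hypothetical placeholder:
#
#     make_umap_plots_for_incorrect_and_correct_samples(
#         'path/to/results_dir', plot_expanded_data=True, BALL=False)
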
def push_converged_boundaries(path_to_params, stepsize, n_steps):
    start_time = time.time()
    params = TransformParameterParser(path_to_params).parse_params()
    print(params)

    # eventually uncomment this; leaving as-is in order to keep the same
    # results as before to compare.
    set_random_seeds(params)

    data_input = DataInput(params['data_params'])
    data_input.split_data()
    print('%d samples in the training data' % len(data_input.x_tr))

    with open(os.path.join(params['save_dir'], 'trackers.pkl'), 'rb') as f:
        trackers = pickle.load(f)
    with open(os.path.join(params['save_dir'], 'transformer.pkl'), 'rb') as f:
        umapper = pickle.load(f)

    # FOR DEBUGGING ONLY:
    # params['transform_params']['cells_to_subsample'] = 10
    data_input.embed_data(
        umapper,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    data_input.normalize_data()
    data_input.convert_all_data_to_tensors()

    model = DepthOneModel([[['D1', 0, 0], ['D2', 0, 0]]],
                          params['model_params'])
    model.load_state_dict(
        torch.load(os.path.join(params['save_dir'], 'model.pkl')))

    init_acc = trackers[0].metrics['tr_acc'][-1]
    cur_best_acc = init_acc
    starting_gate = model.get_gates()[0]
    cur_gate = copy.deepcopy(starting_gate)
    cur_best_gate = copy.deepcopy(cur_gate)
    print('Starting gate:', starting_gate)

    # exhaustive grid search that pushes each converged gate boundary outward
    # in steps of `stepsize`, refitting only the classifier head each time
    counter = 0
    for left_step in range(n_steps):
        cur_gate[0] = starting_gate[0] - left_step * stepsize
        for right_step in range(n_steps):
            cur_gate[1] = starting_gate[1] + right_step * stepsize
            for down_step in range(n_steps):
                cur_gate[2] = starting_gate[2] - down_step * stepsize
                for up_step in range(n_steps):
                    cur_gate[3] = starting_gate[3] + up_step * stepsize
                    model = DepthOneModel(
                        [[['D1', cur_gate[0], cur_gate[1]],
                          ['D2', cur_gate[2], cur_gate[3]]]],
                        params['model_params'])
                    fit_classifier_params(
                        model, data_input,
                        params['train_params']['learning_rate_classifier'])
                    cur_acc = compute_tr_acc(model, data_input)
                    counter += 1
                    print(counter)
                    print(cur_gate)
                    print(cur_acc)
                    if cur_acc > cur_best_acc:
                        cur_best_acc = cur_acc
                        cur_best_gate = copy.deepcopy(cur_gate)

    print('Final acc %.3f, Initial acc %.3f' % (cur_best_acc, init_acc))
    print('Init/final gates', starting_gate, cur_best_gate)
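
# Hedged cost note and invocation sketch: the grid search above refits the
# classifier n_steps ** 4 times (e.g. 5 ** 4 = 625 fits), so modest values of
# n_steps and stepsize are advisable. The path and values are hypothetical:
#
#     push_converged_boundaries('path/to/params.yaml', stepsize=0.05, n_steps=5)
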