# Beispiel #1
def init_data_input(params, transformer_path):
    """Build a DataInput from `params`, embed it with a pickled transformer,
    then normalize and prepare it for training.

    params: parsed config dict with 'data_params' and 'transform_params'.
    transformer_path: path to a pickled data transformer.
    Returns the fully prepared DataInput.
    """
    transform_cfg = params['transform_params']

    prepared_input = DataInput(params['data_params'])
    prepared_input.split_data()

    with open(transformer_path, 'rb') as transformer_file:
        data_transformer = pickle.load(transformer_file)
    print(data_transformer)

    prepared_input.embed_data(
        data_transformer,
        transform_cfg['cells_to_subsample'],
        transform_cfg['num_cells_for_transformer'],
    )
    prepared_input.normalize_data()
    prepared_input.prepare_data_for_training()
    return prepared_input
# Beispiel #2
def main(path_to_params):
    """End-to-end training run: parse params, embed and normalize the data,
    fit the gating model, then save the model, tracker, and a gate plot.

    path_to_params: path to a config file readable by TransformParameterParser.
    Side effects: creates params['save_dir'] and writes params.pkl, the fitted
    transformer, model.pkl, tracker.pkl, and final_gates.png into it.
    """
    params = TransformParameterParser(path_to_params).parse_params()
    start_time = time.time()
    print(params)
    # Persist the parsed params next to the run outputs for reproducibility.
    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])
    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)
    data_input = DataInput(params['data_params'])
    data_input.split_data()

    # Fit a fresh transformer from the transform params and embed the data.
    data_transformer = DataTransformerFactory(
        params['transform_params']).manufacture_transformer()
    data_input.embed_data(
        data_transformer, params['transform_params']['cells_to_subsample'],
        params['transform_params']['num_cells_for_transformer']
    )  #cells to subsample should change to a transformer param instead
    data_input.save_transformer(params['save_dir'])

    data_input.normalize_data()
    # Initialize the gate tree from the (normalized) data and save its plot.
    init_gate_tree = init_plot_and_save_gates(data_input, params)

    model = initialize_model(params['model_params'], init_gate_tree)
    data_input.prepare_data_for_training()
    performance_tracker = run_train_model(model, params['train_params'],
                                          data_input)

    # Save trained weights and the training-performance tracker.
    model_save_path = os.path.join(params['save_dir'], 'model.pkl')
    torch.save(model.state_dict(), model_save_path)

    tracker_save_path = os.path.join(params['save_dir'], 'tracker.pkl')
    with open(tracker_save_path, 'wb') as f:
        pickle.dump(performance_tracker, f)
    results_plotter = DataAndGatesPlotterDepthOne(
        model, np.concatenate(data_input.x_tr))
    #fig, axes = plt.subplots(params['gate_init_params']['n_clusters'], figsize=(1 * params['gate_init_params']['n_clusters'], 3 * params['gate_init_params']['n_clusters']))
    # Per-cell labels: each sample's scalar label y_tr[i] is broadcast over a
    # column of ones sized to that sample's cell count, then all samples are
    # concatenated so the plotter can color individual cells by class.
    results_plotter.plot_data_with_gates(
        np.array(
            np.concatenate([
                data_input.y_tr[i] *
                torch.ones([data_input.x_tr[i].shape[0], 1])
                for i in range(len(data_input.x_tr))
            ])))

    plt.savefig(os.path.join(params['save_dir'], 'final_gates.png'))
    print('Complete main loop took %.4f seconds' % (time.time() - start_time))
def load_and_prepare_data_input(params):
    """Rebuild the embedded, normalized, tensorized DataInput for a saved run.

    Loads the transformer pickled in params['save_dir'] and applies the same
    embed/normalize/tensorize pipeline used at training time.
    """
    transform_cfg = params['transform_params']

    prepared_input = DataInput(params['data_params'])
    prepared_input.split_data()
    print('%d samples in the training data' % len(prepared_input.x_tr))

    transformer_path = os.path.join(params['save_dir'], 'transformer.pkl')
    with open(transformer_path, 'rb') as transformer_file:
        data_transformer = pickle.load(transformer_file)

    # for debugging
    #params['transform_params']['cells_to_subsample'] = 2
    prepared_input.embed_data(
        data_transformer,
        cells_to_subsample=transform_cfg['cells_to_subsample'],
        use_labels_to_transform_data=transform_cfg['use_labels_to_transform_data'],
    )
    prepared_input.normalize_data()
    prepared_input.convert_all_data_to_tensors()
    return prepared_input
# Beispiel #4
def run_once_with_fixed_size(params, size, run, data_transformer):
    """Train one model with its gate size parameters frozen to `size`.

    params: parsed config dict (data/transform/model/train params, save_dir,
        random_seed).
    size: fixed gate size passed to the init-gate helper and frozen in the model.
    run: run index, forwarded to the UMAP-consistency plot helper.
    data_transformer: pre-fitted transformer used to embed the data.
    Returns (model, performance_tracker, data_transformer).
    Side effects: creates params['save_dir'] and writes params.pkl, model.pkl,
    tracker.pkl and a UMAP-check plot into it.
    """
    start_time = time.time()

    #set_random_seeds(params) for some reason doing this produces a different UMAP embedding- likely a bug in the UMAP package I'm using, so have to set seed in data input to get consistent splits

    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])

    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)

    data_input = DataInput(params['data_params'])
    # Seed only the split (not global RNGs) to keep splits reproducible; see
    # the note above about global seeding changing the UMAP embedding.
    data_input.split_data(split_seed=params['random_seed'])

    data_input.embed_data(
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
    )
    #data_input.save_transformer(params['save_dir'])
    data_input.normalize_data()

    init_gate_tree = get_init_gate_in_disc_region(size)
    model = initialize_model(params['model_params'], init_gate_tree)
    # Freeze the size parameters so training only fits the remaining params.
    model.fix_size_params(size)
    data_input.convert_all_data_to_tensors()
    performance_tracker = run_train_model(model, params['train_params'],
                                          data_input)
    # Sanity checks: the fixed size must not have drifted during training and
    # the embedding should look the same across runs.
    check_size_stayed_constant(model, size)
    make_and_save_plot_to_check_umap_stays_same(model, data_input, run, params)

    model_save_path = os.path.join(params['save_dir'], 'model.pkl')
    torch.save(model.state_dict(), model_save_path)

    tracker_save_path = os.path.join(params['save_dir'], 'tracker.pkl')
    with open(tracker_save_path, 'wb') as f:
        pickle.dump(performance_tracker, f)
    print('Complete main loop took %.4f seconds' % (time.time() - start_time))
    return model, performance_tracker, data_transformer
def load_saved_model_and_matching_data_input(path_to_params):
    """Restore a trained DepthOneModel together with the DataInput it was
    trained on (same seeds, same transformer, same preprocessing).

    path_to_params: path to the run's config file.
    Returns (params, model, data_input, umapper).
    """
    def set_random_seeds(params):
        # Seed both torch and numpy so the data split/embedding is repeatable.
        torch.manual_seed(params['random_seed'])
        np.random.seed(params['random_seed'])

    start_time = time.time()

    params = TransformParameterParser(path_to_params).parse_params()
    print(params)

    #evauntually uncomment this leaving asis in order ot keep the same results as before to compare.
    set_random_seeds(params)

    data_input = DataInput(params['data_params'])
    data_input.split_data()
    print('%d samples in the training data' % len(data_input.x_tr))

    # NOTE(review): trackers is loaded here but never used or returned in this
    # function — confirm whether this read (and its file dependency) is needed.
    with open(os.path.join(params['save_dir'], 'trackers.pkl'), 'rb') as f:
        trackers = pickle.load(f)

    with open(os.path.join(params['save_dir'], 'transformer.pkl'), 'rb') as f:
        umapper = pickle.load(f)
    # FOR DEBUGGING ONLY
    #params['transform_params']['cells_to_subsample'] = 10
    data_input.embed_data(\
        umapper,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    data_input.normalize_data()
    data_input.convert_all_data_to_tensors()

    # Model is built with placeholder gates, then overwritten by the saved
    # state dict so the restored gates match the trained ones.
    model = DepthOneModel([[['D1', 0, 0], ['D2', 0, 0]]],
                          params['model_params'])
    model.load_state_dict(
        torch.load(os.path.join(params['save_dir'], 'model.pkl')))
    return params, model, data_input, umapper
# Beispiel #6
def make_umap_plots_for_incorrect_and_correct_samples(
    results_path, plot_expanded_data=True, path_to_true_features=None,
    BALL=False):
    """Plot per-sample UMAP views for train/test samples, split into correct
    and incorrect predictions and further into true-positive / true-negative
    groups (eight plot files total).

    results_path: directory with configs.pkl, transformer.pkl, and model.pkl.
    plot_expanded_data: if True, KDE-expand the learned gate and overlay the
        expanded cells on each sample plot.
    path_to_true_features: optional pickle mapping sample names to reference
        features, forwarded to the plot helper.
    BALL: flag forwarded to make_umap_plots_per_sample — semantics defined
        there, not visible here.
    """
    with open(os.path.join(results_path, 'configs.pkl'), 'rb') as f:
        params = pickle.load(f)

    with open(os.path.join(results_path, 'transformer.pkl'), 'rb') as f:
        umapper = pickle.load(f)

    sample_names_to_true_features = None
    if path_to_true_features:
        with open(path_to_true_features, 'rb') as f:
            sample_names_to_true_features = pickle.load(f)

    set_random_seeds(params)    

    # Placeholder gates; real gates come from the saved state dict below.
    model = DepthOneModel([[['D1', 0, 0], ['D2', 0, 0]]], params['model_params'])
    model.load_state_dict(torch.load(os.path.join(results_path, 'model.pkl')))
    # Older saved configs may lack this key; default it to False.
    # NOTE(review): bare except — would also mask unrelated errors; consider
    # params['data_params'].setdefault('use_presplit_data', False) instead.
    try: 
        print(params['data_params']['use_presplit_data'])
    except:
        params['data_params']['use_presplit_data'] = False
    data_input = DataInput(params['data_params'])
    # splitting because codebase requires a split currently
    data_input.split_data()
    print('embedding data')
    # only for debuggin
    #params['transform_params']['cells_to_subsample'] = 2
    data_input.embed_data(
        umapper,
        cells_to_subsample = params['transform_params']['cells_to_subsample'],
        use_labels_to_transform_data = params['transform_params']['use_labels_to_transform_data']
    )

    data_input.normalize_data()
    data_input.convert_all_data_to_tensors()

    # gate expansion using kde
    if plot_expanded_data:
        print(model.get_gates()[0])
        kde_expander = KDEGateExpander(data_input.x_tr, model.get_gates()[0], sigma_thresh_factor=.5)
        kde_expander.expand_gates()
        kde_expander.collect_expanded_cells_per_sample()
        tr_expanded_data = kde_expander.expanded_data_per_sample
        te_expanded_data = kde_expander.get_expanded_data_new_samples(data_input.x_te)
    else:
        tr_expanded_data = None
        te_expanded_data = None
    output_tr = model(data_input.x_tr, data_input.y_tr)
    output_te = model(data_input.x_te, data_input.y_te)
    # Per-sample correctness: prediction thresholded at .5 vs. true label.
    matching_tr = [( (output_tr['y_pred'].cpu().detach().numpy() >= .5)[i] * 1.0 == data_input.y_tr[i] ) for i in range(len(data_input.y_tr))]
    pos_probs_tr = np.array([prob.cpu().detach().numpy() for prob in output_tr['y_pred']])
    # Sort samples by predicted positive probability so plots are ordered by
    # model confidence rather than dataset order.
    sorted_idxs_tr = np.argsort(pos_probs_tr)

    #correct_idxs_tr = [data_input.idxs_tr[i]  for i in range(len(data_input.y_tr)) if matching_tr[i]]
    correct_idxs_tr = [data_input.idxs_tr[i]  for i in sorted_idxs_tr if matching_tr[i]]

    # Split correct predictions by their true class.
    correct_idxs_true_pos_tr = [idx for idx in correct_idxs_tr if data_input.y_tr[data_input.idxs_tr.index(idx)] == 1]
    correct_idxs_true_neg_tr = [idx for idx in correct_idxs_tr if data_input.y_tr[data_input.idxs_tr.index(idx)] == 0]

    #incorrect_idxs_tr = [data_input.idxs_tr[i]  for i in range(len(data_input.y_tr)) if not matching_tr[i]]
    incorrect_idxs_tr = [data_input.idxs_tr[i]  for i in sorted_idxs_tr if not matching_tr[i]]
    incorrect_idxs_true_pos_tr = [idx for idx in incorrect_idxs_tr if data_input.y_tr[data_input.idxs_tr.index(idx)] == 1]
    incorrect_idxs_true_neg_tr = [idx for idx in incorrect_idxs_tr if data_input.y_tr[data_input.idxs_tr.index(idx)] == 0]


    # NOTE(review): this sums the index *values*, not a count — if an accuracy
    # was intended, len(correct_idxs_tr)/len(data_input.x_tr) seems more likely.
    print(np.sum(correct_idxs_tr)/len(data_input.x_tr))

    # Same bookkeeping as above, for the test split.
    matching_te = [( (output_te['y_pred'].cpu().detach().numpy() >= .5)[i] * 1.0 == data_input.y_te[i] ) for i in range(len(data_input.y_te))]
    pos_probs_te = np.array([prob.cpu().detach().numpy() for prob in output_te['y_pred']])
    sorted_idxs_te = np.argsort(pos_probs_te)

    #correct_idxs_te = [data_input.idxs_te[i]  for i in range(len(data_input.y_te)) if matching_te[i]]
    correct_idxs_te = [data_input.idxs_te[i]  for i in sorted_idxs_te if matching_te[i]]
    correct_idxs_true_pos_te = [idx for idx in correct_idxs_te if data_input.y_te[data_input.idxs_te.index(idx)] == 1]
    correct_idxs_true_neg_te = [idx for idx in correct_idxs_te if data_input.y_te[data_input.idxs_te.index(idx)] == 0]

    #incorrect_idxs_te = [data_input.idxs_te[i]  for i in range(len(data_input.y_te)) if not matching_te[i]]
    incorrect_idxs_te = [data_input.idxs_te[i]  for i in sorted_idxs_te if not matching_te[i]]
    incorrect_idxs_true_pos_te = [idx for idx in incorrect_idxs_te if data_input.y_te[data_input.idxs_te.index(idx)] == 1]
    incorrect_idxs_true_neg_te = [idx for idx in incorrect_idxs_te if data_input.y_te[data_input.idxs_te.index(idx)] == 0]
    print('correct te idxs:', correct_idxs_te, 'incorrect te idxs', incorrect_idxs_te)
    print(incorrect_idxs_true_neg_te)




    # Background point clouds pooled over train+test, per true class. The
    # try/except tolerates a test split with no negative samples (empty
    # np.concatenate raises); the positive-class pool has no such guard.
    background_data_to_plot_neg = np.concatenate([data for i, data in enumerate(data_input.x_tr)  if data_input.y_tr[i] == 0])
    try:
        background_data_to_plot_neg = np.concatenate([background_data_to_plot_neg, np.concatenate([data for i, data in enumerate(data_input.x_te)  if data_input.y_te[i] == 0])])
    except:
        pass


    background_data_to_plot_pos = np.concatenate([data for i, data in enumerate(data_input.x_tr)  if data_input.y_tr[i]])
    background_data_to_plot_pos = np.concatenate([background_data_to_plot_pos, np.concatenate([data for i, data in enumerate(data_input.x_te)  if data_input.y_te[i]])])

    full_background_data_to_plot = np.concatenate([background_data_to_plot_pos, background_data_to_plot_neg])

    ### CHANGE SAVENAME IF YOU USE VAL DATA HERE
    plots_per_row_BALL = 9
    # One plot file per (split x correctness x true-class) combination.
    make_umap_plots_per_sample(model, data_input, incorrect_idxs_true_pos_tr, savename='true_pos_incorrect_dev_tr.png', plots_per_row=plots_per_row_BALL, background_data_to_plot=full_background_data_to_plot, expanded_data_per_sample=tr_expanded_data, sample_names_to_true_features=sample_names_to_true_features, BALL=BALL)
    make_umap_plots_per_sample(model, data_input, incorrect_idxs_true_neg_tr, savename='true_neg_incorrect_dev_tr.png', plots_per_row=plots_per_row_BALL, background_data_to_plot=full_background_data_to_plot, expanded_data_per_sample=tr_expanded_data, sample_names_to_true_features=sample_names_to_true_features, BALL=BALL)
    make_umap_plots_per_sample(model, data_input, correct_idxs_true_pos_tr, savename='true_pos_correct_dev_tr.png', plots_per_row=plots_per_row_BALL, background_data_to_plot=full_background_data_to_plot, expanded_data_per_sample=tr_expanded_data, sample_names_to_true_features=sample_names_to_true_features, BALL=BALL)
    make_umap_plots_per_sample(model, data_input, correct_idxs_true_neg_tr, savename='true_neg_correct_dev_tr.png', plots_per_row=plots_per_row_BALL, background_data_to_plot=full_background_data_to_plot, expanded_data_per_sample=tr_expanded_data, sample_names_to_true_features=sample_names_to_true_features, BALL=BALL)


    make_umap_plots_per_sample(model, data_input, incorrect_idxs_true_pos_te, savename='true_pos_incorrect_dev_te.png', plots_per_row=plots_per_row_BALL, background_data_to_plot=full_background_data_to_plot, expanded_data_per_sample=te_expanded_data, sample_names_to_true_features=sample_names_to_true_features, BALL=BALL)
    make_umap_plots_per_sample(model, data_input, incorrect_idxs_true_neg_te, savename='true_neg_incorrect_dev_te.png', plots_per_row=plots_per_row_BALL, background_data_to_plot=full_background_data_to_plot, expanded_data_per_sample=te_expanded_data, sample_names_to_true_features=sample_names_to_true_features, BALL=BALL)
    make_umap_plots_per_sample(model, data_input, correct_idxs_true_pos_te, savename='true_pos_correct_dev_te.png', plots_per_row=plots_per_row_BALL, background_data_to_plot=full_background_data_to_plot, expanded_data_per_sample=te_expanded_data, sample_names_to_true_features=sample_names_to_true_features, BALL=BALL)
    make_umap_plots_per_sample(model, data_input, correct_idxs_true_neg_te, savename='true_neg_correct_dev_te.png', plots_per_row=plots_per_row_BALL, background_data_to_plot=full_background_data_to_plot, expanded_data_per_sample=te_expanded_data, sample_names_to_true_features=sample_names_to_true_features, BALL=BALL)
# Beispiel #7
def push_converged_boundaries(path_to_params, stepsize, n_steps):
    """Grid-search expansions of a converged gate's four boundaries and report
    the best training accuracy found.

    Starting from the trained model's gate, each boundary is pushed outward
    (left/down decrease, right/up increase) in `n_steps` increments of
    `stepsize`; for each candidate gate a fresh model is built, only its
    classifier is refit, and training accuracy is compared against the
    tracker's converged accuracy. n_steps**4 candidates total.
    """
    start_time = time.time()

    params = TransformParameterParser(path_to_params).parse_params()
    print(params)

    #evauntually uncomment this leaving asis in order ot keep the same results as before to compare.
    set_random_seeds(params)

    data_input = DataInput(params['data_params'])
    data_input.split_data()
    print('%d samples in the training data' % len(data_input.x_tr))

    with open(os.path.join(params['save_dir'], 'trackers.pkl'), 'rb') as f:
        trackers = pickle.load(f)

    with open(os.path.join(params['save_dir'], 'transformer.pkl'), 'rb') as f:
        umapper = pickle.load(f)
    # FOR DEBUGGING ONLY
    #params['transform_params']['cells_to_subsample'] = 10
    data_input.embed_data(\
        umapper,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    data_input.normalize_data()
    data_input.convert_all_data_to_tensors()

    # Placeholder gates; the saved state dict below restores the trained ones.
    model = DepthOneModel([[['D1', 0, 0], ['D2', 0, 0]]],
                          params['model_params'])
    model.load_state_dict(
        torch.load(os.path.join(params['save_dir'], 'model.pkl')))

    # Baseline: final training accuracy recorded at convergence.
    init_acc = trackers[0].metrics['tr_acc'][-1]
    cur_best_acc = init_acc
    starting_gate = model.get_gates()[0]
    # Gate layout (from the DepthOneModel construction below):
    # [0]/[1] = D1 low/high boundary, [2]/[3] = D2 low/high boundary.
    cur_gate = copy.deepcopy(starting_gate)
    cur_best_gate = copy.deepcopy(cur_gate)
    print('Starting gate:', starting_gate)
    counter = 0
    for left_step in range(n_steps):
        cur_gate[0] = starting_gate[0] - left_step * stepsize
        for right_step in range(n_steps):
            cur_gate[1] = starting_gate[1] + right_step * stepsize
            for down_step in range(n_steps):
                cur_gate[2] = starting_gate[2] - down_step * stepsize
                for up_step in range(n_steps):
                    cur_gate[3] = starting_gate[3] + up_step * stepsize
                    # Rebuild the model with the candidate gate and refit only
                    # the classifier head, keeping the gate fixed.
                    model = DepthOneModel([[['D1', cur_gate[0], cur_gate[1]],
                                            ['D2', cur_gate[2], cur_gate[3]]]],
                                          params['model_params'])
                    fit_classifier_params(
                        model, data_input,
                        params['train_params']['learning_rate_classifier'])
                    #                    model.nodes = None
                    #                    model.init_nodes([[['D1', cur_gate[0], cur_gate[1]], ['D2', cur_gate[2], cur_gate[3]]]])
                    cur_acc = compute_tr_acc(model, data_input)
                    #cur_acc = performance_tracker.metrics['tr_acc'][-1]
                    counter += 1
                    print(counter)
                    print(cur_gate)
                    print(cur_acc)
                    # Deep-copy: cur_gate is mutated in place each iteration.
                    if cur_acc > cur_best_acc:
                        cur_best_acc = cur_acc
                        cur_best_gate = copy.deepcopy(cur_gate)

    print('Final acc %.3f, Initial acc %.3f' % (cur_best_acc, init_acc))
    print('Init/final gates', starting_gate, cur_best_gate)