def load_data_input(path_to_params):
    params = TransformParameterParser(path_to_params).parse_params()
    print(params)

    set_random_seeds(params)

    data_input = DataInput(params['data_params'])
    data_input.split_data()
    return data_input
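A minimal usage sketch, assuming this module's imports; the YAML path is hypothetical, not from the source:

data_input = load_data_input('configs/example_params.yaml')  # hypothetical config path
print('%d samples in the training data' % len(data_input.x_tr))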
Example #2
def init_data_input(params, transformer_path):
    data_input = DataInput(params['data_params'])
    data_input.split_data()
    with open(transformer_path, 'rb') as f:
        data_transformer = pickle.load(f)
    print(data_transformer)
    data_input.embed_data(data_transformer, \
        params['transform_params']['cells_to_subsample'], 
        params['transform_params']['num_cells_for_transformer']
    )
    data_input.normalize_data()
    data_input.prepare_data_for_training() 
    return data_input
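For reference, a hedged sketch pairing init_data_input with a transformer pickled by an earlier run (the config path is illustrative):

params = TransformParameterParser('configs/example_params.yaml').parse_params()
transformer_path = os.path.join(params['save_dir'], 'transformer.pkl')
data_input = init_data_input(params, transformer_path)
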
def load_and_prepare_data_input(params):
    data_input = DataInput(params['data_params'])
    data_input.split_data()
    print('%d samples in the training data' % len(data_input.x_tr))

    with open(os.path.join(params['save_dir'], 'transformer.pkl'), 'rb') as f:
        data_transformer = pickle.load(f)

    # for debugging
    #params['transform_params']['cells_to_subsample'] = 2
    data_input.embed_data(\
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    data_input.normalize_data()
    data_input.convert_all_data_to_tensors()
    return data_input
Example #4
def main(params):
    start_time = time.time()

    # eventually uncomment this; leaving as-is for now to keep the same results as before for comparison.
    set_random_seeds(params)

    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])

    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)

    data_input = DataInput(params['data_params'])
    data_input.split_data()
    print('%d samples in the training data' % len(data_input.x_tr))
    # force identity for the first transform
    data_transformer = DataTransformerFactory({'transform_type': 'identity'}, params['random_seed']).manufacture_transformer()

    data_input.embed_data_and_fit_transformer(\
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        num_cells_for_transformer=params['transform_params']['num_cells_for_transformer'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )

    data_input.normalize_data()

    # gates aren't plotted because we're in n dimensions
    unused_cluster_gate_inits = init_gates(data_input, params)

    # data_input.convert_all_data_to_tensors()
    figscale = 8
    fig, axs = plt.subplots(nrows=len(unused_cluster_gate_inits), figsize=(figscale, len(unused_cluster_gate_inits)*figscale))
    # plt.subplots returns a bare Axes rather than an array when nrows == 1
    axs = np.atleast_1d(axs)

    print("initializing model")
    for gate, ax in zip(unused_cluster_gate_inits, axs):
        dataset = torch.utils.data.TensorDataset(torch.tensor(data_input.x_tr, dtype=torch.float),
                                                 torch.tensor(data_input.y_tr, dtype=torch.float))
        trainloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)
        criterion = torch.nn.BCEWithLogitsLoss()
        model = SingleGateModel(params, gate)

        optimizer = torch.optim.Adam(model.parameters(), lr=1e-7, weight_decay=1e-2)

        print("initializing LR finder")
        lr_finder = LRFinder(model, optimizer, criterion)
        lr_finder.range_test(trainloader, end_lr=1e4, num_iter=100)
        lr_finder.plot(ax=ax)
        print("LR History:", lr_finder.history)
    plt.savefig(os.path.join(params['save_dir'], 'lr_find.png'))

    print('Complete main loop took %.4f seconds' % (time.time() - start_time))
    return
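One caveat: in the davidtvs pytorch-lr-finder package, range_test mutates the model and optimizer, and the library exposes a reset() method to restore their initial state. A hedged sketch of the loop body with that call added, assuming that package is the LRFinder in use:

lr_finder = LRFinder(model, optimizer, criterion)
lr_finder.range_test(trainloader, end_lr=1e4, num_iter=100)
lr_finder.plot(ax=ax)
lr_finder.reset()  # restore model/optimizer state before the next gate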
Example #5
def cross_validate_accuracy_over_saved_results(path_to_results,
                                               stepsize,
                                               n_steps,
                                               nfolds=20,
                                               starting_fold=30):
    path_to_params = os.path.join(path_to_results, 'params.yaml')

    params = TransformParameterParser(path_to_params).parse_params()
    print(params)
    cur_params = deepcopy(params)

    # eventually uncomment this; leaving as-is for now to keep the same results as before for comparison.
    set_random_seeds(params)
    data_input = DataInput(params['data_params'])
    te_accs = []
    pushed_gates_per_fold = []
    starting_gates_per_fold = []
    diffs_per_fold = []

    # advance the data splits so this run starts at starting_fold
    for fold in range(starting_fold):
        data_input.split_data()

    for fold in range(starting_fold, nfolds + starting_fold):
        print('Running fold %d' % fold)
        cur_params['save_dir'] = os.path.join(params['save_dir'],
                                              'run%d' % fold)
        data_input.split_data()
        best_tr_acc, starting_gate, best_gate = push_converged_boundaries_given_data_input_and_params(
            cur_params, data_input, stepsize, n_steps, path_to_params)

        model = DepthOneModel([[['D1', best_gate[0], best_gate[1]],
                                ['D2', best_gate[2], best_gate[3]]]],
                              params['model_params'])
        fit_classifier_params(
            model, data_input,
            params['train_params']['learning_rate_classifier'])
        te_acc = compute_te_acc(model, data_input)
        print('te acc for fold %d is %.3f' % (fold, te_acc))
        te_accs.append(te_acc)
        pushed_gates_per_fold.append(best_gate)
        starting_gates_per_fold.append(starting_gate)
        diffs_per_fold.append(get_diff_between_gates(starting_gate, best_gate))
        print('Diff: ', get_diff_between_gates(starting_gate, best_gate))

    print('Te accs:', te_accs)
    print('Diffs per fold:', diffs_per_fold)
    with open(
            os.path.join(path_to_results,
                         'expanded_boundaries_te_accs_per_fold.pkl'),
            'wb') as f:
        pickle.dump(te_accs, f)
    with open(
            os.path.join(path_to_results,
                         'expanded_boundaries_diffs_per_fold.pkl'), 'wb') as f:
        pickle.dump(diffs_per_fold, f)
    with open(
            os.path.join(path_to_results,
                         'expanded_boundaries_best_pushed_gates_per_fold.pkl'),
            'wb') as f:
        pickle.dump(pushed_gates_per_fold, f)
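A short follow-up sketch for summarizing the per-fold test accuracies saved above (the pickle name matches the one written in the code):

with open(os.path.join(path_to_results, 'expanded_boundaries_te_accs_per_fold.pkl'), 'rb') as f:
    te_accs = pickle.load(f)
print('mean te acc %.3f, std %.3f' % (np.mean(te_accs), np.std(te_accs)))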
Example #6
def run_once_with_fixed_size(params, size, run, data_transformer):
    start_time = time.time()

    # set_random_seeds(params): for some reason calling this produces a different UMAP embedding (likely a bug in the UMAP package in use), so the seed is set in DataInput instead to get consistent splits

    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])

    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)

    data_input = DataInput(params['data_params'])
    data_input.split_data(split_seed=params['random_seed'])

    data_input.embed_data(\
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
    )
    #data_input.save_transformer(params['save_dir'])
    data_input.normalize_data()

    init_gate_tree = get_init_gate_in_disc_region(size)
    model = initialize_model(params['model_params'], init_gate_tree)
    # freeze the size parameters so the gate size stays constant during training
    model.fix_size_params(size)
    data_input.convert_all_data_to_tensors()
    trackers_per_step = []
    performance_tracker = run_train_model(model, params['train_params'],
                                          data_input)
    check_size_stayed_constant(model, size)
    make_and_save_plot_to_check_umap_stays_same(model, data_input, run, params)

    model_save_path = os.path.join(params['save_dir'], 'model.pkl')
    torch.save(model.state_dict(), model_save_path)

    tracker_save_path = os.path.join(params['save_dir'], 'tracker.pkl')
    with open(tracker_save_path, 'wb') as f:
        pickle.dump(performance_tracker, f)
    print('Complete main loop took %.4f seconds' % (time.time() - start_time))
    return model, performance_tracker, data_transformer
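A hedged sketch of sweeping several fixed sizes with a shared, already-fitted transformer (the size values are placeholders, not from the source):

for run, size in enumerate([0.05, 0.1, 0.2]):  # placeholder sizes
    model, tracker, data_transformer = run_once_with_fixed_size(params, size, run, data_transformer)
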
def load_saved_model_and_matching_data_input(path_to_params):
    def set_random_seeds(params):
        torch.manual_seed(params['random_seed'])
        np.random.seed(params['random_seed'])

    start_time = time.time()

    params = TransformParameterParser(path_to_params).parse_params()
    print(params)

    # eventually uncomment this; leaving as-is for now to keep the same results as before for comparison.
    set_random_seeds(params)

    data_input = DataInput(params['data_params'])
    data_input.split_data()
    print('%d samples in the training data' % len(data_input.x_tr))

    with open(os.path.join(params['save_dir'], 'trackers.pkl'), 'rb') as f:
        trackers = pickle.load(f)

    with open(os.path.join(params['save_dir'], 'transformer.pkl'), 'rb') as f:
        umapper = pickle.load(f)
    # FOR DEBUGGING ONLY
    #params['transform_params']['cells_to_subsample'] = 10
    data_input.embed_data(\
        umapper,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    data_input.normalize_data()
    data_input.convert_all_data_to_tensors()

    model = DepthOneModel([[['D1', 0, 0], ['D2', 0, 0]]],
                          params['model_params'])
    model.load_state_dict(
        torch.load(os.path.join(params['save_dir'], 'model.pkl')))
    return params, model, data_input, umapper
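A hedged sketch of reloading a finished run and scoring the test split (the config path is hypothetical; compute_te_acc is the helper used elsewhere in these examples):

params, model, data_input, umapper = load_saved_model_and_matching_data_input('configs/example_params.yaml')
print('te acc: %.3f' % compute_te_acc(model, data_input))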
Example #8
def make_umap_plots_for_incorrect_and_correct_samples(
    results_path, plot_expanded_data=True, path_to_true_features=None,
    BALL=False):
    with open(os.path.join(results_path, 'configs.pkl'), 'rb') as f:
        params = pickle.load(f)

    with open(os.path.join(results_path, 'transformer.pkl'), 'rb') as f:
        umapper = pickle.load(f)

    sample_names_to_true_features = None
    if path_to_true_features:
        with open(path_to_true_features, 'rb') as f:
            sample_names_to_true_features = pickle.load(f)

    set_random_seeds(params)    

    model = DepthOneModel([[['D1', 0, 0], ['D2', 0, 0]]], params['model_params'])
    model.load_state_dict(torch.load(os.path.join(results_path, 'model.pkl')))
    # older configs may lack this key; default to False
    if 'use_presplit_data' not in params['data_params']:
        params['data_params']['use_presplit_data'] = False
    print(params['data_params']['use_presplit_data'])
    data_input = DataInput(params['data_params'])
    # splitting because the codebase currently requires a split
    data_input.split_data()
    print('embedding data')
    # for debugging only
    #params['transform_params']['cells_to_subsample'] = 2
    data_input.embed_data(
        umapper,
        cells_to_subsample = params['transform_params']['cells_to_subsample'],
        use_labels_to_transform_data = params['transform_params']['use_labels_to_transform_data']
    )

    data_input.normalize_data()
    data_input.convert_all_data_to_tensors()

    # gate expansion using kde
    if plot_expanded_data:
        print(model.get_gates()[0])
        kde_expander = KDEGateExpander(data_input.x_tr, model.get_gates()[0], sigma_thresh_factor=.5)
        kde_expander.expand_gates()
        kde_expander.collect_expanded_cells_per_sample()
        tr_expanded_data = kde_expander.expanded_data_per_sample
        te_expanded_data = kde_expander.get_expanded_data_new_samples(data_input.x_te)
    else:
        tr_expanded_data = None
        te_expanded_data = None
    output_tr = model(data_input.x_tr, data_input.y_tr)
    output_te = model(data_input.x_te, data_input.y_te)
    preds_tr = (output_tr['y_pred'].cpu().detach().numpy() >= .5) * 1.0
    matching_tr = [preds_tr[i] == data_input.y_tr[i] for i in range(len(data_input.y_tr))]
    pos_probs_tr = np.array([prob.cpu().detach().numpy() for prob in output_tr['y_pred']])
    sorted_idxs_tr = np.argsort(pos_probs_tr)

    #correct_idxs_tr = [data_input.idxs_tr[i]  for i in range(len(data_input.y_tr)) if matching_tr[i]]
    correct_idxs_tr = [data_input.idxs_tr[i]  for i in sorted_idxs_tr if matching_tr[i]]

    correct_idxs_true_pos_tr = [idx for idx in correct_idxs_tr if data_input.y_tr[data_input.idxs_tr.index(idx)] == 1]
    correct_idxs_true_neg_tr = [idx for idx in correct_idxs_tr if data_input.y_tr[data_input.idxs_tr.index(idx)] == 0]

    #incorrect_idxs_tr = [data_input.idxs_tr[i]  for i in range(len(data_input.y_tr)) if not matching_tr[i]]
    incorrect_idxs_tr = [data_input.idxs_tr[i]  for i in sorted_idxs_tr if not matching_tr[i]]
    incorrect_idxs_true_pos_tr = [idx for idx in incorrect_idxs_tr if data_input.y_tr[data_input.idxs_tr.index(idx)] == 1]
    incorrect_idxs_true_neg_tr = [idx for idx in incorrect_idxs_tr if data_input.y_tr[data_input.idxs_tr.index(idx)] == 0]


    # fraction of training samples classified correctly
    print(len(correct_idxs_tr) / len(data_input.x_tr))

    preds_te = (output_te['y_pred'].cpu().detach().numpy() >= .5) * 1.0
    matching_te = [preds_te[i] == data_input.y_te[i] for i in range(len(data_input.y_te))]
    pos_probs_te = np.array([prob.cpu().detach().numpy() for prob in output_te['y_pred']])
    sorted_idxs_te = np.argsort(pos_probs_te)

    #correct_idxs_te = [data_input.idxs_te[i]  for i in range(len(data_input.y_te)) if matching_te[i]]
    correct_idxs_te = [data_input.idxs_te[i]  for i in sorted_idxs_te if matching_te[i]]
    correct_idxs_true_pos_te = [idx for idx in correct_idxs_te if data_input.y_te[data_input.idxs_te.index(idx)] == 1]
    correct_idxs_true_neg_te = [idx for idx in correct_idxs_te if data_input.y_te[data_input.idxs_te.index(idx)] == 0]

    #incorrect_idxs_te = [data_input.idxs_te[i]  for i in range(len(data_input.y_te)) if not matching_te[i]]
    incorrect_idxs_te = [data_input.idxs_te[i]  for i in sorted_idxs_te if not matching_te[i]]
    incorrect_idxs_true_pos_te = [idx for idx in incorrect_idxs_te if data_input.y_te[data_input.idxs_te.index(idx)] == 1]
    incorrect_idxs_true_neg_te = [idx for idx in incorrect_idxs_te if data_input.y_te[data_input.idxs_te.index(idx)] == 0]
    print('correct te idxs:', correct_idxs_te, 'incorrect te idxs', incorrect_idxs_te)
    print(incorrect_idxs_true_neg_te)




    background_data_to_plot_neg = np.concatenate([data for i, data in enumerate(data_input.x_tr)  if data_input.y_tr[i] == 0])
    try:
        background_data_to_plot_neg = np.concatenate([background_data_to_plot_neg, np.concatenate([data for i, data in enumerate(data_input.x_te) if data_input.y_te[i] == 0])])
    except ValueError:
        # no negative samples in the test split; keep the training-only background
        pass


    background_data_to_plot_pos = np.concatenate([data for i, data in enumerate(data_input.x_tr)  if data_input.y_tr[i]])
    background_data_to_plot_pos = np.concatenate([background_data_to_plot_pos, np.concatenate([data for i, data in enumerate(data_input.x_te)  if data_input.y_te[i]])])

    full_background_data_to_plot = np.concatenate([background_data_to_plot_pos, background_data_to_plot_neg])

    ### CHANGE SAVENAME IF YOU USE VAL DATA HERE
    plots_per_row_BALL = 9
    make_umap_plots_per_sample(model, data_input, incorrect_idxs_true_pos_tr, savename='true_pos_incorrect_dev_tr.png', plots_per_row=plots_per_row_BALL, background_data_to_plot=full_background_data_to_plot, expanded_data_per_sample=tr_expanded_data, sample_names_to_true_features=sample_names_to_true_features, BALL=BALL)
    make_umap_plots_per_sample(model, data_input, incorrect_idxs_true_neg_tr, savename='true_neg_incorrect_dev_tr.png', plots_per_row=plots_per_row_BALL, background_data_to_plot=full_background_data_to_plot, expanded_data_per_sample=tr_expanded_data, sample_names_to_true_features=sample_names_to_true_features, BALL=BALL)
    make_umap_plots_per_sample(model, data_input, correct_idxs_true_pos_tr, savename='true_pos_correct_dev_tr.png', plots_per_row=plots_per_row_BALL, background_data_to_plot=full_background_data_to_plot, expanded_data_per_sample=tr_expanded_data, sample_names_to_true_features=sample_names_to_true_features, BALL=BALL)
    make_umap_plots_per_sample(model, data_input, correct_idxs_true_neg_tr, savename='true_neg_correct_dev_tr.png', plots_per_row=plots_per_row_BALL, background_data_to_plot=full_background_data_to_plot, expanded_data_per_sample=tr_expanded_data, sample_names_to_true_features=sample_names_to_true_features, BALL=BALL)


    make_umap_plots_per_sample(model, data_input, incorrect_idxs_true_pos_te, savename='true_pos_incorrect_dev_te.png', plots_per_row=plots_per_row_BALL, background_data_to_plot=full_background_data_to_plot, expanded_data_per_sample=te_expanded_data, sample_names_to_true_features=sample_names_to_true_features, BALL=BALL)
    make_umap_plots_per_sample(model, data_input, incorrect_idxs_true_neg_te, savename='true_neg_incorrect_dev_te.png', plots_per_row=plots_per_row_BALL, background_data_to_plot=full_background_data_to_plot, expanded_data_per_sample=te_expanded_data, sample_names_to_true_features=sample_names_to_true_features, BALL=BALL)
    make_umap_plots_per_sample(model, data_input, correct_idxs_true_pos_te, savename='true_pos_correct_dev_te.png', plots_per_row=plots_per_row_BALL, background_data_to_plot=full_background_data_to_plot, expanded_data_per_sample=te_expanded_data, sample_names_to_true_features=sample_names_to_true_features, BALL=BALL)
    make_umap_plots_per_sample(model, data_input, correct_idxs_true_neg_te, savename='true_neg_correct_dev_te.png', plots_per_row=plots_per_row_BALL, background_data_to_plot=full_background_data_to_plot, expanded_data_per_sample=te_expanded_data, sample_names_to_true_features=sample_names_to_true_features, BALL=BALL)
def cross_validate(path_to_params, n_runs, start_seed=0):
    start_time = time.time()

    params = TransformParameterParser(path_to_params).parse_params()
    print(params)
    check_consistency_of_params(params)

    # eventually uncomment this; leaving as-is for now to keep the same results as before for comparison.
    set_random_seeds(params)

    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])

    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)

    data_input = DataInput(params['data_params'])
    te_accs = []
    tr_accs = []
    # to get to the correct new split at start
    for i in range(start_seed):
        data_input.split_data()

    for run in range(start_seed, n_runs):
        if not os.path.exists(os.path.join(params['save_dir'], 'run%d' % run)):
            os.makedirs(os.path.join(params['save_dir'], 'run%d' % run))
        savepath = os.path.join(params['save_dir'], 'run%d' % run)
        data_input.split_data()
        print(data_input.idxs_tr)

        data_transformer = DataTransformerFactory(
            params['transform_params'],
            params['random_seed']).manufacture_transformer()

        data_input.embed_data_and_fit_transformer(\
            data_transformer,
            cells_to_subsample=params['transform_params']['cells_to_subsample'],
            num_cells_for_transformer=params['transform_params']['num_cells_for_transformer'],
            use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
        )
        data_input.save_transformer(savepath)
        data_input.normalize_data()
        unused_cluster_gate_inits = init_plot_and_save_gates(
            data_input, params)
        #everything below differs from the other main_UMAP
        data_input.convert_all_data_to_tensors()
        init_gate_tree, unused_cluster_gate_inits = get_next_gate_tree(
            unused_cluster_gate_inits, data_input, params, model=None)
        model = initialize_model(params['model_params'], [init_gate_tree])
        performance_tracker = run_train_model(model, params['train_params'],
                                              data_input)

        model_save_path = os.path.join(savepath, 'model.pkl')
        torch.save(model.state_dict(), model_save_path)

        tracker_save_path = os.path.join(savepath, 'tracker.pkl')
        with open(tracker_save_path, 'wb') as f:
            pickle.dump(performance_tracker, f)
        results_plotter = DataAndGatesPlotterDepthOne(
            model, np.concatenate(data_input.x_tr))
        #fig, axes = plt.subplots(params['gate_init_params']['n_clusters'], figsize=(1 * params['gate_init_params']['n_clusters'], 3 * params['gate_init_params']['n_clusters']))
        results_plotter.plot_data_with_gates(
            np.array(
                np.concatenate([
                    data_input.y_tr[i] *
                    torch.ones([data_input.x_tr[i].shape[0], 1])
                    for i in range(len(data_input.x_tr))
                ])))

        plt.savefig(os.path.join(savepath, 'final_gates.png'))

        with open(os.path.join(savepath, 'configs.pkl'), 'wb') as f:
            pickle.dump(params, f)

        print('Complete main loop for run %d took %.4f seconds' %
              (run, time.time() - start_time))
        start_time = time.time()
        print('Accuracy tr %.3f, te %.3f' %
              (performance_tracker.metrics['tr_acc'][-1],
               performance_tracker.metrics['te_acc'][-1]))
        te_accs.append(performance_tracker.metrics['te_acc'][-1])
        tr_accs.append(performance_tracker.metrics['tr_acc'][-1])
    tr_accs = np.array(tr_accs)
    te_accs = np.array(te_accs)
    print('Average tr acc: %.3f, te acc %.3f' %
          (np.mean(tr_accs), np.mean(te_accs)))
    print('Std dev tr acc: %.3f, te_acc %.3f' %
          (np.std(tr_accs), np.std(te_accs)))
Example #10
def push_converged_boundaries(path_to_params, stepsize, n_steps):
    start_time = time.time()

    params = TransformParameterParser(path_to_params).parse_params()
    print(params)

    # eventually uncomment this; leaving as-is for now to keep the same results as before for comparison.
    set_random_seeds(params)

    data_input = DataInput(params['data_params'])
    data_input.split_data()
    print('%d samples in the training data' % len(data_input.x_tr))

    with open(os.path.join(params['save_dir'], 'trackers.pkl'), 'rb') as f:
        trackers = pickle.load(f)

    with open(os.path.join(params['save_dir'], 'transformer.pkl'), 'rb') as f:
        umapper = pickle.load(f)
    # FOR DEBUGGING ONLY
    #params['transform_params']['cells_to_subsample'] = 10
    data_input.embed_data(\
        umapper,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    data_input.normalize_data()
    data_input.convert_all_data_to_tensors()

    model = DepthOneModel([[['D1', 0, 0], ['D2', 0, 0]]],
                          params['model_params'])
    model.load_state_dict(
        torch.load(os.path.join(params['save_dir'], 'model.pkl')))

    init_acc = trackers[0].metrics['tr_acc'][-1]
    cur_best_acc = init_acc
    starting_gate = model.get_gates()[0]
    cur_gate = copy.deepcopy(starting_gate)
    cur_best_gate = copy.deepcopy(cur_gate)
    print('Starting gate:', starting_gate)
    counter = 0
    for left_step in range(n_steps):
        cur_gate[0] = starting_gate[0] - left_step * stepsize
        for right_step in range(n_steps):
            cur_gate[1] = starting_gate[1] + right_step * stepsize
            for down_step in range(n_steps):
                cur_gate[2] = starting_gate[2] - down_step * stepsize
                for up_step in range(n_steps):
                    cur_gate[3] = starting_gate[3] + up_step * stepsize
                    model = DepthOneModel([[['D1', cur_gate[0], cur_gate[1]],
                                            ['D2', cur_gate[2], cur_gate[3]]]],
                                          params['model_params'])
                    fit_classifier_params(
                        model, data_input,
                        params['train_params']['learning_rate_classifier'])
                    #                    model.nodes = None
                    #                    model.init_nodes([[['D1', cur_gate[0], cur_gate[1]], ['D2', cur_gate[2], cur_gate[3]]]])
                    cur_acc = compute_tr_acc(model, data_input)
                    #cur_acc = performance_tracker.metrics['tr_acc'][-1]
                    counter += 1
                    print(counter)
                    print(cur_gate)
                    print(cur_acc)
                    if cur_acc > cur_best_acc:
                        cur_best_acc = cur_acc
                        cur_best_gate = copy.deepcopy(cur_gate)

    print('Final acc %.3f, Initial acc %.3f' % (cur_best_acc, init_acc))
    print('Init/final gates', starting_gate, cur_best_gate)
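The four nested loops above enumerate every combination of left/right/down/up pushes; an equivalent, more compact sketch using itertools.product (not the author's code, but the behavior should match):

import itertools

for left, right, down, up in itertools.product(range(n_steps), repeat=4):
    cur_gate = [starting_gate[0] - left * stepsize,
                starting_gate[1] + right * stepsize,
                starting_gate[2] - down * stepsize,
                starting_gate[3] + up * stepsize]
    # ...then fit a DepthOneModel on cur_gate and keep the best tr accuracy, as above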
Example #11
def main(path_to_params):
    start_time = time.time()

    params = TransformParameterParser(path_to_params).parse_params()
    print(params)
    check_consistency_of_params(params)

    # eventually uncomment this; leaving as-is for now to keep the same results as before for comparison.
    set_random_seeds(params)

    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])

    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)

    data_input = DataInput(params['data_params'])
    data_input.split_data()
    print('%d samples in the training data' % len(data_input.x_tr))
    data_transformer = DataTransformerFactory(
        params['transform_params'],
        params['random_seed']).manufacture_transformer()

    data_input.embed_data_and_fit_transformer(\
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        num_cells_for_transformer=params['transform_params']['num_cells_for_transformer'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    # can't pickle opentsne objects
    if params['transform_params']['transform_type'] != 'tsne':
        data_input.save_transformer(params['save_dir'])
    data_input.normalize_data()

    potential_gates = get_all_potential_gates(data_input, params)
    data_input.convert_all_data_to_tensors()
    model = initialize_model(params['model_params'], potential_gates)

    if params['train_params']['fix_gates']:
        model.freeze_gate_params()
    tracker = run_train_model(\
        model, params['train_params'], data_input
    )

    #   if params['transform_params']['embed_dim'] == 3:
    #       unused_cluster_gate_inits = init_gates(data_input, params)
    #   else:
    #       unused_cluster_gate_inits = init_plot_and_save_gates(data_input, params)
    #   #everything below differs from the other main_UMAP
    #   data_input.convert_all_data_to_tensors()
    #   init_gate_tree, unused_cluster_gate_inits = get_next_gate_tree(unused_cluster_gate_inits, data_input, params, model=None)
    #   model = initialize_model(params['model_params'], [init_gate_tree])
    #   trackers_per_round = []
    #   num_gates_left = len(unused_cluster_gate_inits)
    #   #print(num_gates_left, 'asdfasdfasdfasdfasdfasdfas')
    #   for i in range(num_gates_left + 1):
    #       performance_tracker = run_train_model(model, params['train_params'], data_input)
    #       trackers_per_round.append(performance_tracker.get_named_tuple_rep())
    #       if i == params['train_params']['num_gates_to_learn'] - 1:
    #           break
    #       if not i == num_gates_left:
    #           next_gate_tree, unused_cluster_gate_inits = get_next_gate_tree(unused_cluster_gate_inits, data_input, params, model=model)
    #           model.add_node(next_gate_tree)

    model_save_path = os.path.join(params['save_dir'], 'model.pkl')
    torch.save(model.state_dict(), model_save_path)

    tracker_save_path = os.path.join(params['save_dir'], 'tracker.pkl')
    #    trackers_per_round = [tracker.get_named_tuple_rep() for tracker in trackers_per_round]
    with open(tracker_save_path, 'wb') as f:
        pickle.dump(tracker, f)
    if params['plot_umap_reflection']:
        # reflection is about x=.5 since the data is already in umap space here
        reflected_data = []
        for data in data_input.x_tr:
            data[:, 0] = 1 - data[:, 0]
            reflected_data.append(data)
        data_input.x_tr = reflected_data
        gate_tree = model.get_gate_tree()
        reflected_gates = []
        for gate in gate_tree:
            print(gate)
            #order switches since reflected over x=.5
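            # e.g. an x-interval [0.2, 0.4] reflects to [0.6, 0.8]: the new low comes from the old high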
            low_reflected = 1 - gate[0][2]
            high_reflected = 1 - gate[0][1]
            gate[0][1] = low_reflected
            gate[0][2] = high_reflected
            print(gate)

            reflected_gates.append(gate)
        model.init_nodes(reflected_gates)
        print(model.init_nodes)
        print(model.get_gates())
    results_plotter = DataAndGatesPlotterDepthOne(
        model, np.concatenate(data_input.x_tr))
    #fig, axes = plt.subplots(params['gate_init_params']['n_clusters'], figsize=(1 * params['gate_init_params']['n_clusters'], 3 * params['gate_init_params']['n_clusters']))

    if params['transform_params']['embed_dim'] == 2:
        results_plotter.plot_data_with_gates(
            np.array(
                np.concatenate([
                    data_input.y_tr[i] *
                    torch.ones([data_input.x_tr[i].shape[0], 1])
                    for i in range(len(data_input.x_tr))
                ])))
        plt.savefig(os.path.join(params['save_dir'], 'final_gates.png'))
    else:
        fig_pos, ax_pos, fig_neg, ax_neg = results_plotter.plot_data_with_gates(
            np.array(
                np.concatenate([
                    data_input.y_tr[i] *
                    torch.ones([data_input.x_tr[i].shape[0], 1])
                    for i in range(len(data_input.x_tr))
                ])))
        with open(os.path.join(params['save_dir'], 'final_gates_pos_3d.pkl'),
                  'wb') as f:
            pickle.dump(fig_pos, f)

        with open(os.path.join(params['save_dir'], 'final_gates_neg_3d.pkl'),
                  'wb') as f:
            pickle.dump(fig_neg, f)

    with open(os.path.join(params['save_dir'], 'configs.pkl'), 'wb') as f:
        pickle.dump(params, f)

    print('Learned weights:', model.linear.weight)
    print('Complete main loop took %.4f seconds' % (time.time() - start_time))
Example #12
def single_run_single_gate(params):
    start_time = time.time()

    # eventually uncomment this; leaving as-is for now to keep the same results as before for comparison.
    #set_random_seeds(params)

    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])

    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)

    data_input = DataInput(params['data_params'])
    data_input.split_data(split_seed=params['random_seed'])

    data_transformer = DataTransformerFactory(
        params['transform_params'],
        params['random_seed']).manufacture_transformer()

    data_input.embed_data_and_fit_transformer(\
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        num_cells_for_transformer=params['transform_params']['num_cells_for_transformer']
    )
    data_input.save_transformer(params['save_dir'])
    data_input.normalize_data()
    unused_cluster_gate_inits = init_plot_and_save_gates(data_input, params)
    #everything below differs from the other main_UMAP
    data_input.convert_all_data_to_tensors()
    init_gate_tree, unused_cluster_gate_inits = get_next_gate_tree(
        unused_cluster_gate_inits, data_input, params, model=None)
    model = initialize_model(params['model_params'], [init_gate_tree])
    performance_tracker = run_train_model(model, params['train_params'],
                                          data_input)

    model_save_path = os.path.join(params['save_dir'], 'model.pkl')
    torch.save(model.state_dict(), model_save_path)

    trackers_save_path = os.path.join(params['save_dir'],
                                      'last_CV_rounds_tracker.pkl')
    with open(trackers_save_path, 'wb') as f:
        pickle.dump(performance_tracker, f)
    results_plotter = DataAndGatesPlotterDepthOne(
        model, np.concatenate(data_input.x_tr))
    #fig, axes = plt.subplots(params['gate_init_params']['n_clusters'], figsize=(1 * params['gate_init_params']['n_clusters'], 3 * params['gate_init_params']['n_clusters']))
    results_plotter.plot_data_with_gates(
        np.array(
            np.concatenate([
                data_input.y_tr[i] *
                torch.ones([data_input.x_tr[i].shape[0], 1])
                for i in range(len(data_input.x_tr))
            ])))

    plt.savefig(os.path.join(params['save_dir'], 'final_gates.png'))

    with open(os.path.join(params['save_dir'], 'configs.pkl'), 'wb') as f:
        pickle.dump(params, f)

    print('Complete main loop took %.4f seconds' % (time.time() - start_time))
    return performance_tracker, model
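A hedged usage sketch (the config path is hypothetical; parsing mirrors the other examples):

params = TransformParameterParser('configs/example_params.yaml').parse_params()
performance_tracker, model = single_run_single_gate(params)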
Example #13
def main(params):
    start_time = time.time()

    # eventually uncomment this; leaving as-is for now to keep the same results as before for comparison.
    set_random_seeds(params)

    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])

    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)

    data_input = DataInput(params['data_params'])
    data_input.split_data()
    print('%d samples in the training data' % len(data_input.x_tr))
    # force identity for the first transform
    data_transformer = DataTransformerFactory({
        'transform_type': 'identity'
    }, params['random_seed']).manufacture_transformer()

    data_input.embed_data_and_fit_transformer(\
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        num_cells_for_transformer=params['transform_params']['num_cells_for_transformer'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    # can't pickle opentsne objects
    if params['transform_params']['transform_type'] != 'tsne':
        data_input.save_transformer(params['save_dir'])
    data_input.normalize_data()

    # gates aren't plotted because we're in n dimensions
    unused_cluster_gate_inits = init_gates(data_input, params)

    data_input.convert_all_data_to_tensors()
    init_gate_tree, unused_cluster_gate_inits = get_next_gate_tree(
        unused_cluster_gate_inits, data_input, params, model=None)
    model = initialize_model(params['model_params'], [init_gate_tree])
    trackers_per_round = []
    num_gates_left = len(unused_cluster_gate_inits)
    for i in range(num_gates_left + 1):
        performance_tracker = run_train_model(model, params['train_params'],
                                              data_input)
        trackers_per_round.append(performance_tracker.get_named_tuple_rep())
        if i == params['train_params']['num_gates_to_learn'] - 1:
            break
        if not i == num_gates_left:
            next_gate_tree, unused_cluster_gate_inits = get_next_gate_tree(
                unused_cluster_gate_inits, data_input, params, model=model)
            model.add_node(next_gate_tree)

    model_save_path = os.path.join(params['save_dir'], 'model.pkl')
    torch.save(model.state_dict(), model_save_path)

    trackers_save_path = os.path.join(params['save_dir'], 'trackers.pkl')
    #    trackers_per_round = [tracker.get_named_tuple_rep() for tracker in trackers_per_round]
    with open(trackers_save_path, 'wb') as f:
        pickle.dump(trackers_per_round, f)
    if params['plot_umap_reflection']:
        # reflection is about x=.5 since the data is already in umap space here
        reflected_data = []
        for data in data_input.x_tr:
            data[:, 0] = 1 - data[:, 0]
            reflected_data.append(data)
        data_input.x_tr = reflected_data
        gate_tree = model.get_gate_tree()
        reflected_gates = []
        for gate in gate_tree:
            print(gate)
            #order switches since reflected over x=.5
            low_reflected = 1 - gate[0][2]
            high_reflected = 1 - gate[0][1]
            gate[0][1] = low_reflected
            gate[0][2] = high_reflected
            print(gate)

            reflected_gates.append(gate)
        model.init_nodes(reflected_gates)
        print(model.init_nodes)
        print(model.get_gates())
    data_transformer = DataTransformerFactory(
        params['transform_params'],
        params['random_seed']).manufacture_transformer()
    data_input.convert_all_data_to_numpy()
    data_input.x_tr = data_input.x_tr_raw
    data_input.x_te = data_input.x_te_raw
    old_scale = data_input.scale
    old_offset = data_input.offset
    print("fitting projection")
    data_input.embed_data_and_fit_transformer(\
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        num_cells_for_transformer=params['transform_params']['num_cells_for_transformer'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    results_plotter = MultidimDataAndGatesPlotter(
        model, np.concatenate(data_input.x_tr),
        np.concatenate(data_input.untransformed_matched_x_tr), old_scale,
        old_offset, data_input.transformer)

    results_plotter.plot_in_feature_space(
        np.array(
            np.concatenate([
                data_input.y_tr[i] *
                torch.ones([data_input.x_tr[i].shape[0], 1])
                for i in range(len(data_input.x_tr))
            ])))
    plt.savefig(os.path.join(params['save_dir'], 'feature_results.png'))

    if params['transform_params']['embed_dim'] == 2:
        results_plotter.plot_data_with_gates(
            np.array(
                np.concatenate([
                    data_input.y_tr[i] *
                    torch.ones([data_input.x_tr[i].shape[0], 1])
                    for i in range(len(data_input.x_tr))
                ])))
        plt.savefig(os.path.join(params['save_dir'], 'final_gates.png'))
    else:
        fig_pos, ax_pos, fig_neg, ax_neg = results_plotter.plot_data_with_gates(
            np.array(
                np.concatenate([
                    data_input.y_tr[i] *
                    torch.ones([data_input.x_tr[i].shape[0], 1])
                    for i in range(len(data_input.x_tr))
                ])))
        with open(os.path.join(params['save_dir'], 'final_gates_pos_3d.pkl'),
                  'wb') as f:
            pickle.dump(fig_pos, f)

        with open(os.path.join(params['save_dir'], 'final_gates_neg_3d.pkl'),
                  'wb') as f:
            pickle.dump(fig_neg, f)

    with open(os.path.join(params['save_dir'], 'configs.pkl'), 'wb') as f:
        pickle.dump(params, f)

    print('Complete main loop took %.4f seconds' % (time.time() - start_time))
    return trackers_per_round[-1]
def main(path_to_params):
    start_time = time.time()

    params = TransformParameterParser(path_to_params).parse_params()
    print(params)
    check_consistency_of_params(params)

    # eventually uncomment this; leaving as-is for now to keep the same results as before for comparison.
    set_random_seeds(params)

    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])

    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)

    data_input = DataInput(params['data_params'])
    data_input.split_data()

    data_transformer = DataTransformerFactory(
        params['transform_params'],
        params['random_seed']).manufacture_transformer()

    data_input.embed_data_and_fit_transformer(\
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        num_cells_for_transformer=params['transform_params']['num_cells_for_transformer'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    data_input.save_transformer(params['save_dir'])
    data_input.normalize_data()
    unused_cluster_gate_inits = init_plot_and_save_gates(data_input, params)

    data_input.convert_all_data_to_tensors()

    init_gate_tree, unused_cluster_gate_inits = get_next_gate_tree(
        unused_cluster_gate_inits, data_input, params, model=None)
    model1 = initialize_model(params['model_params'], [init_gate_tree])

    performance_tracker1 = run_train_model(model1, params['train_params'],
                                           data_input)

    model1_save_path = os.path.join(params['save_dir'], 'model1.pkl')
    torch.save(model1.state_dict(), model1_save_path)

    tracker1_save_path = os.path.join(params['save_dir'], 'tracker1.pkl')
    with open(tracker1_save_path, 'wb') as f:
        pickle.dump(performance_tracker1, f)

    # now select the data inside the learned model1 gate and re-run umap
    data_input.filter_data_inside_first_model_gate(model1)
    unused_cluster_gate_inits = init_plot_and_save_gates(data_input, params)

    data_transformer = DataTransformerFactory(
        params['transform_params'],
        params['random_seed']).manufacture_transformer()

    data_input.embed_data_and_fit_transformer(\
        data_transformer,
        cells_to_subsample=params['transform_params']['cells_to_subsample'],
        num_cells_for_transformer=params['transform_params']['num_cells_for_transformer'],
        use_labels_to_transform_data=params['transform_params']['use_labels_to_transform_data']
    )
    data_input.save_transformer(params['save_dir'])
    data_input.convert_all_data_to_tensors()

    init_gate_tree, _ = get_next_gate_tree(unused_cluster_gate_inits,
                                           data_input,
                                           params,
                                           model=None)
    model2 = initialize_model(params['model_params'], [init_gate_tree])

    performance_tracker2 = run_train_model(model2, params['train_params'],
                                           data_input)

    model2_save_path = os.path.join(params['save_dir'], 'model2.pkl')
    torch.save(model2.state_dict(), model2_save_path)

    tracker2_save_path = os.path.join(params['save_dir'], 'tracker2.pkl')
    with open(tracker2_save_path, 'wb') as f:
        pickle.dump(performance_tracker2, f)

    results_plotter = DataAndGatesPlotterDepthOne(
        model2, np.concatenate(data_input.x_tr))
    #fig, axes = plt.subplots(params['gate_init_params']['n_clusters'], figsize=(1 * params['gate_init_params']['n_clusters'], 3 * params['gate_init_params']['n_clusters']))
    results_plotter.plot_data_with_gates(
        np.array(
            np.concatenate([
                data_input.y_tr[i] *
                torch.ones([data_input.x_tr[i].shape[0], 1])
                for i in range(len(data_input.x_tr))
            ])))

    plt.savefig(os.path.join(params['save_dir'], 'final_gates.png'))

    with open(os.path.join(params['save_dir'], 'configs.pkl'), 'wb') as f:
        pickle.dump(params, f)

    print('Complete main loop took %.4f seconds' % (time.time() - start_time))
def main(path_to_params):
    start_time = time.time()

    params = TransformParameterParser(path_to_params).parse_params()
    print(params)
    check_consistency_of_params(params)

    set_random_seeds(params)

    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])

    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)

    data_input = DataInput(params['data_params'])
    data_input.split_data()

    data_transformer = DataTransformerFactory(
        params['transform_params'],
        params['random_seed']).manufacture_transformer()
    data_input.embed_data_and_fit_transformer(\
        data_transformer,
        params['transform_params']['cells_to_subsample'],
        params['transform_params']['num_cells_for_transformer']
    )
    data_input.save_transformer(params['save_dir'])
    data_input.normalize_data()
    #everything below differs from the other main_UMAP

    multi_gate_initializer = MultipleGateInitializerHeuristic(
        data_input, params['model_params']['node_type'],
        params['gate_init_multi_heuristic_params'])
    init_gate_tree = [multi_gate_initializer.init_next_gate()]

    model = initialize_model(params['model_params'], init_gate_tree)
    data_input.prepare_data_for_training()
    trackers_per_step = []
    num_gates = params['gate_init_multi_heuristic_params']['num_gates']
    for i in range(num_gates):
        performance_tracker = run_train_model(model, params['train_params'],
                                              data_input)
        multi_gate_initializer.gates = model.get_gates()
        if not (i == num_gates - 1):
            print(model.get_gates())
            next_gate = multi_gate_initializer.init_next_gate()
            if next_gate is None:
                print(
                    'There are no non-overlapping initializations left to try!'
                )
                break
            model.add_node(next_gate)

    model_save_path = os.path.join(params['save_dir'], 'model.pkl')
    torch.save(model.state_dict(), model_save_path)

    tracker_save_path = os.path.join(params['save_dir'], 'tracker.pkl')
    with open(tracker_save_path, 'wb') as f:
        pickle.dump(performance_tracker, f)
    results_plotter = DataAndGatesPlotterDepthOne(
        model, np.concatenate(data_input.x_tr))
    #fig, axes = plt.subplots(params['gate_init_params']['n_clusters'], figsize=(1 * params['gate_init_params']['n_clusters'], 3 * params['gate_init_params']['n_clusters']))
    results_plotter.plot_data_with_gates(
        np.array(
            np.concatenate([
                data_input.y_tr[i] *
                torch.ones([data_input.x_tr[i].shape[0], 1])
                for i in range(len(data_input.x_tr))
            ])))

    plt.savefig(os.path.join(params['save_dir'], 'final_gates.png'))
    print('Complete main loop took %.4f seconds' % (time.time() - start_time))
Example #16
def main(path_to_params):
    params = TransformParameterParser(path_to_params).parse_params()
    start_time = time.time()
    print(params)
    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])
    with open(os.path.join(params['save_dir'], 'params.pkl'), 'wb') as f:
        pickle.dump(params, f)
    data_input = DataInput(params['data_params'])
    data_input.split_data()

    data_transformer = DataTransformerFactory(
        params['transform_params']).manufacture_transformer()
    data_input.embed_data(
        data_transformer, params['transform_params']['cells_to_subsample'],
        params['transform_params']['num_cells_for_transformer']
    )  # cells_to_subsample should become a transformer param instead
    data_input.save_transformer(params['save_dir'])

    data_input.normalize_data()
    init_gate_tree = init_plot_and_save_gates(data_input, params)

    model = initialize_model(params['model_params'], init_gate_tree)
    data_input.prepare_data_for_training()
    performance_tracker = run_train_model(model, params['train_params'],
                                          data_input)

    model_save_path = os.path.join(params['save_dir'], 'model.pkl')
    torch.save(model.state_dict(), model_save_path)

    tracker_save_path = os.path.join(params['save_dir'], 'tracker.pkl')
    with open(tracker_save_path, 'wb') as f:
        pickle.dump(performance_tracker, f)
    results_plotter = DataAndGatesPlotterDepthOne(
        model, np.concatenate(data_input.x_tr))
    #fig, axes = plt.subplots(params['gate_init_params']['n_clusters'], figsize=(1 * params['gate_init_params']['n_clusters'], 3 * params['gate_init_params']['n_clusters']))
    results_plotter.plot_data_with_gates(
        np.array(
            np.concatenate([
                data_input.y_tr[i] *
                torch.ones([data_input.x_tr[i].shape[0], 1])
                for i in range(len(data_input.x_tr))
            ])))

    plt.savefig(os.path.join(params['save_dir'], 'final_gates.png'))
    print('Complete main loop took %.4f seconds' % (time.time() - start_time))
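These entry points are usually driven from the command line; a minimal hedged wrapper sketch (the flag name is illustrative, not from the source):

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--path_to_params', required=True)
    args = parser.parse_args()
    main(args.path_to_params)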