Beispiel #1
0
def experiment_2b(W_init, H_init, results_folder, ranks, no_iterations):
    """Run NMF for each rank, starting from truncated shared initial factors.

    For every rank ``r`` the first ``r`` columns of ``W_init`` and the first
    ``r`` rows of ``H_init`` are used as the starting point for
    ``NMF_divergence``; the resulting factors and divergence record are
    written with ``save_results`` under the name ``r=<r>_final``.

    NOTE: besides its parameters, this function reads the module-level names
    ``data_matrix``, ``n``, ``m``, ``genes`` and ``patients``.

    Args:
        W_init: 2-D array whose leading columns provide the initial W.
        H_init: 2-D array whose leading rows provide the initial H.
        results_folder: Passed to ``NMF_divergence`` as ``save_progress_to``
            and to ``save_results`` as the output location.
        ranks: Iterable of factorisation ranks to run.
        no_iterations: Iteration count forwarded to ``NMF_divergence``.

    Raises:
        ValueError: If any requested rank is larger than the number of
            components available in ``W_init`` / ``H_init``.
    """
    # Materialise once so a generator can be validated and then iterated.
    ranks = list(ranks)

    # Slicing past the end of an array silently truncates, which would hand
    # NMF_divergence factors whose shape disagrees with the rank it is told.
    # Check every rank before starting any (potentially long) run.
    max_rank = min(W_init.shape[1], H_init.shape[0])
    oversized = [r for r in ranks if r > max_rank]
    if oversized:
        raise ValueError(
            'ranks {} exceed the {} components available in W_init/H_init'
            .format(oversized, max_rank))

    for r in ranks:

        W_init_r = W_init[:, :r]
        H_init_r = H_init[:r, :]

        # The positional ``1`` sits where other callers in this file pass a
        # "divergence_calc_frequency" -- presumably the divergence is
        # recorded every iteration; confirm against NMF_divergence.
        W, H, divergence_by_it = NMF_divergence(data_matrix, W_init_r, H_init_r, n, m, r, no_iterations, 1,
                                                report_progress=True,
                                                save_progress_to=results_folder)

        save_results(results_folder, W, H, unique_name='r={}_final'.format(r), additional_data_name_to_array_dict={
            'divergence_record' : divergence_by_it}, row_names_list=genes, column_names_list=patients)
Beispiel #2
0
def experiment_2c(ranks, no_iterations, rpkm_matrix_path, ensg_to_go_path,
                  results_folder):
    """Run NNDSVDar-initialised NMF on the GO-annotated genes for each rank.

    The gene list is taken from the ``ENSEMBL`` column of the CSV at
    ``ensg_to_go_path``; the RPKM matrix CSV is restricted to those rows.
    A single initialisation is computed for ``max(ranks)`` and, for every
    rank ``r``, its first ``r`` components are used as the starting point.
    Results are written with ``save_results`` under ``r=<r>_final``.

    Args:
        ranks: Iterable of factorisation ranks to run (must be non-empty,
            since ``max(ranks)`` is taken).
        no_iterations: Iteration count forwarded to ``NMF_divergence``.
        rpkm_matrix_path: CSV path; first column is used as the row index,
            the remaining column labels are used as patient names.
        ensg_to_go_path: CSV path containing an ``ENSEMBL`` column.
        results_folder: Output location forwarded to ``save_results``.
    """
    # ``max`` below would exhaust a generator before the loop ever ran.
    ranks = list(ranks)

    ensembl_ids = pd.read_csv(ensg_to_go_path, index_col=0)['ENSEMBL']

    # De-duplicate while keeping first-appearance order. Going through a
    # ``set`` would make the row order of the matrix (and of the saved
    # factors) vary between interpreter runs because of hash randomisation.
    genes_with_GO_annotations = list(dict.fromkeys(ensembl_ids))

    rpkm_matrix = pd.read_csv(rpkm_matrix_path,
                              index_col=0).loc[genes_with_GO_annotations]

    patients = list(rpkm_matrix.columns)

    n = len(genes_with_GO_annotations)
    m = len(patients)

    rpkm_matrix = np.array(rpkm_matrix)

    # One initialisation at the largest rank; smaller ranks reuse a prefix.
    W_init, H_init = NNDSVDar_initialization(rpkm_matrix, n, m, max(ranks))

    for r in ranks:

        W_init_r = W_init[:, :r]
        H_init_r = H_init[:r, :]

        # The positional ``100`` sits where other callers in this file pass
        # a "divergence_calc_frequency" -- confirm against NMF_divergence.
        W, H, divergence_by_it = NMF_divergence(rpkm_matrix,
                                                W_init_r,
                                                H_init_r,
                                                n,
                                                m,
                                                r,
                                                no_iterations,
                                                100,
                                                report_progress=True)

        save_results(results_folder,
                     W,
                     H,
                     unique_name='r={}_final'.format(r),
                     additional_data_name_to_array_dict={
                         'divergence_record': divergence_by_it
                     },
                     row_names_list=genes_with_GO_annotations,
                     column_names_list=patients)
def experiment_2d(ranks, no_iterations, rpkm_matrix_path,
                  genes_with_GO_annotations, results_folder, repeat_no):
    """Run one randomly initialised NMF repeat for every rank in ``ranks``.

    The RPKM CSV is read with its first column as the row index and is
    restricted to ``genes_with_GO_annotations``. A single
    ``numpy.random.RandomState`` seeded with ``repeat_no`` supplies the
    uniform initial factors for all ranks in turn, so the draws for a given
    rank depend on the ranks processed before it. Each result is written
    with ``save_results`` under ``r=<r>_repeat=<repeat_no>``.

    Args:
        ranks: Iterable of factorisation ranks to run.
        no_iterations: Iteration count forwarded to ``NMF_divergence``.
        rpkm_matrix_path: Path of the RPKM matrix CSV.
        genes_with_GO_annotations: Row labels to select; also used as the
            saved row names.
        results_folder: Output location forwarded to ``save_results``.
        repeat_no: Repeat index; doubles as the random seed.
    """
    expression_frame = pd.read_csv(rpkm_matrix_path, index_col=0)
    expression_frame = expression_frame.loc[genes_with_GO_annotations]

    patients = list(expression_frame.columns)
    n, m = len(genes_with_GO_annotations), len(patients)

    expression = np.array(expression_frame)

    rng = np.random.RandomState(repeat_no)

    for r in ranks:
        print('\n\nRANK {}, REPEAT {}\n\n'.format(r, repeat_no))

        # Draw W before H: the order fixes which random numbers each gets.
        W_start = rng.uniform(size=(n, r))
        H_start = rng.uniform(size=(r, m))

        # The positional ``100`` sits where other callers in this file pass
        # a "divergence_calc_frequency" -- confirm against NMF_divergence.
        W, H, divergence_by_it = NMF_divergence(
            expression, W_start, H_start, n, m, r, no_iterations, 100,
            report_progress=True)

        extra_arrays = {'divergence_record': divergence_by_it}
        save_results(results_folder, W, H,
                     unique_name='r={}_repeat={}'.format(r, repeat_no),
                     additional_data_name_to_array_dict=extra_arrays,
                     row_names_list=genes_with_GO_annotations,
                     column_names_list=patients)
        np.save(initializations_path + 'W_init_' + code, W_init)
        np.save(initializations_path + 'H_init_' + code, H_init)

        # REAL NMF
        W, H, divergence_by_it = NMF_divergence(V_15000,
                                                W_init.copy(),
                                                H_init.copy(),
                                                n,
                                                m,
                                                r,
                                                iterations,
                                                divergence_calc_frequency,
                                                report_progress=True)
        save_results(W,
                     H, {'divergence_record': divergence_by_it},
                     row_names_list=genes,
                     column_names_list=patients,
                     main_results_folder=results_path_main)

        # PERMUTATION NMF
        W, H, divergence_by_it = NMF_divergence(V_15000_columns_permuted,
                                                W_init,
                                                H_init,
                                                n,
                                                m,
                                                r,
                                                iterations,
                                                divergence_calc_frequency,
                                                report_progress=True)
        save_results(W,
                     H, {'divergence_record': divergence_by_it},
Beispiel #5
0

# Experiment 2a: NMF from uniform random initial factors, one run per rank.
# NOTE(review): input and output locations are hard-coded absolute paths.
data_path = "C:/Users/hanne/Documents/PROJECT/Project Data/CM_experiment_2_data.csv"

main_results_folder = "C:/Users/hanne/Documents/PROJECT/Project Data/Experiment_2a_results/"

# First CSV column becomes the row index (gene labels); the remaining
# column labels are the patient names.
data_matrix = pd.read_csv(data_path, index_col=0)

genes = list(data_matrix.index)
patients = list(data_matrix.columns)

n = len(genes)
m = len(patients)
ranks = [2, 5, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 100, 150]


# Labels have been captured above; continue with a plain array.
data_matrix = np.array(data_matrix)


for r in ranks:

    # A fresh generator with the same seed on every pass: the initial
    # factors for a rank do not depend on which other ranks are run.
    # Named ``rng`` rather than ``random`` so the stdlib ``random`` module
    # name is not shadowed at module scope.
    rng = np.random.RandomState(42)

    W_init = rng.uniform(0, 1, (n, r))
    H_init = rng.uniform(0, 1, (r, m))

    # 5000 iterations; the positional ``1`` sits where other callers in this
    # file pass a "divergence_calc_frequency" -- confirm against
    # NMF_divergence.
    W, H, divergence_by_it = NMF_divergence(data_matrix, W_init, H_init, n, m, r, 5000, 1, report_progress=True,
                                            save_progress_to=main_results_folder)

    save_results(main_results_folder, W, H, unique_name='r={}_final'.format(r), additional_data_name_to_array_dict={
        'divergence_record' : divergence_by_it}, row_names_list=genes, column_names_list=patients)
    # NOTE(review): this is the body of a loop whose header (the statement
    # binding ``idx``) is not visible here; ``best_repeats``,
    # ``initializations_path``, ``V_15000``, ``iterations``,
    # ``divergence_calc_frequency`` and ``results_path_main`` are likewise
    # defined elsewhere -- confirm against the full file.
    r = ranks[idx]
    repeat = best_repeats[idx]

    # Load the best initialisations (by divergence after 100 iterations) to
    # use as the starting point for the longer run.

    # ``code`` identifies the saved initialisation files for this
    # repeat/rank pair.
    code = 'rep={}_rank={}'.format(repeat, r)
    print(code)

    W_init = np.load(initializations_path + 'W_init_' + code + '.npy')
    H_init = np.load(initializations_path + 'H_init_' + code + '.npy')

    # REAL NMF
    # Copies are passed rather than the loaded arrays -- presumably because
    # NMF_divergence may modify its initial factors in place; confirm.
    W, H, divergence_by_it = NMF_divergence(V_15000,
                                            W_init.copy(),
                                            H_init.copy(),
                                            n,
                                            m,
                                            r,
                                            iterations,
                                            divergence_calc_frequency,
                                            report_progress=True)
    # Store the factors and the divergence record under 'rank=<r>'.
    save_results(results_path_main,
                 W,
                 H,
                 unique_name='rank={}'.format(r),
                 additional_data_name_to_array_dict={
                     'divergence_record': divergence_by_it
                 },
                 row_names_list=genes,
                 column_names_list=patients)