Example #1
def run_correlation(run_parameters):
    """ perform gene prioritization

    Args:
        run_parameters: parameter set dictionary.
    """

    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')

    results_tmp_directory      = run_parameters["results_tmp_directory"     ]
    phenotype_name_full_path   = run_parameters["phenotype_name_full_path"  ]
    spreadsheet_name_full_path = run_parameters["spreadsheet_name_full_path"]

    spreadsheet_df             = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    phenotype_df               = kn.get_spreadsheet_df(phenotype_name_full_path  )
    phenotype_df               = phenotype_df.T

    number_of_jobs             = len(phenotype_df.index)
    jobs_id                    = range(0, number_of_jobs)
    zipped_arguments           = dstutil.zip_parameters( run_parameters
                                                       , spreadsheet_df
                                                       , phenotype_df
                                                       , jobs_id
                                                       )

    dstutil.parallelize_processes_locally( run_correlation_worker
                                         , zipped_arguments
                                         , number_of_jobs
                                         )

    write_phenotype_data_all(run_parameters       )
    kn.remove_dir           (results_tmp_directory)
Example #2
def run_bootstrap_net_correlation(run_parameters):
    """ perform gene prioritization using bootstrap sampling and network smoothing

    Args:
        run_parameters: parameter set dictionary.
    """
    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')
    gg_network_name_full_path = run_parameters['gg_network_name_full_path']
    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name_full_path)

    network_mat = normalize(network_mat, norm="l1", axis=0)

    phenotype_df = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    spreadsheet_genes_as_input = spreadsheet_df.index.values
    phenotype_df = phenotype_df.T

    spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df, unique_gene_names)
    spreadsheet_df = zscore_dataframe(spreadsheet_df)
    sample_smooth, iterations = kn.smooth_matrix_with_rwr(spreadsheet_df.values, network_mat.T, run_parameters)
    spreadsheet_df = pd.DataFrame(sample_smooth, index=spreadsheet_df.index, columns=spreadsheet_df.columns)

    baseline_array = np.ones(network_mat.shape[0]) / network_mat.shape[0]
    baseline_array = kn.smooth_matrix_with_rwr(baseline_array, network_mat, run_parameters)[0]

    number_of_jobs = len(phenotype_df.index)
    jobs_id = range(0, number_of_jobs)
    zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df, network_mat,
                                              spreadsheet_genes_as_input, baseline_array, jobs_id)
    dstutil.parallelize_processes_locally(run_bootstrap_net_correlation_worker, zipped_arguments, number_of_jobs)

    write_phenotype_data_all(run_parameters)
    kn.remove_dir(run_parameters["results_tmp_directory"])
Example #3
def run_similarity(run_parameters):
    """ Performs similarity analysis and saves the similarity matrix.

    Args:
        run_parameters: parameter set dictionary.
    """

    expression_name = run_parameters["spreadsheet_name_full_path"]
    signature_name = run_parameters["signature_name_full_path"]
    similarity_measure = run_parameters["similarity_measure"]

    expression_df = kn.get_spreadsheet_df(expression_name)
    signature_df = kn.get_spreadsheet_df(signature_name)

    samples_names = expression_df.columns
    signatures_names = signature_df.columns
    signatures_names = [i.split('.')[0] for i in signatures_names]
    signature_df.columns = signatures_names

    similarity_mat = generate_similarity_mat(expression_df, signature_df,
                                             similarity_measure)
    # similarity_mat = map_similarity_range(similarity_mat, 0)
    similarity_df = pd.DataFrame(similarity_mat,
                                 index=samples_names,
                                 columns=signatures_names)
    save_final_samples_signature(similarity_df, run_parameters)
    save_best_match_signature(similarity_df, run_parameters)
Example #4
def run_select_subtype_df(run_parameters):
    """ Subset samples based on some row value, e.g., patients with longer survival.
        Output can be a smaller spreadsheet with fewer columns.
        From a genes x samples spreadsheet and a samples x phenotypes spreadsheet,
        return both spreadsheets with only the samples corresponding to a category in a phenotype.

    Args:           run_parameters with keys:
                    "results_directory", "spreadsheet_file_name", "phenotype_file_name",
                    "phenotype_id", "select_category"
    """
    results_directory = run_parameters['results_directory']
    spreadsheet_file_name = run_parameters['spreadsheet_file_name']
    phenotype_file_name = run_parameters['phenotype_file_name']
    phenotype_id = run_parameters['phenotype_id']
    select_category = run_parameters['select_category']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_file_name)
    phenotype_df = kn.get_spreadsheet_df(phenotype_file_name)

    spreadsheet_df, phenotype_df = select_subtype_df(spreadsheet_df,
                                                     phenotype_df,
                                                     phenotype_id,
                                                     select_category)

    transform_name = "phenotype_category"
    write_transform_df(spreadsheet_df, spreadsheet_file_name, transform_name,
                       results_directory)
    write_transform_df(phenotype_df, phenotype_file_name, transform_name,
                       results_directory)
Example #5
def run_correlation(run_parameters):
    """ perform feature prioritization

    Args:
        run_parameters: parameter set dictionary.
    """
    max_cpu = run_parameters["max_cpu"]
    run_parameters["results_tmp_directory"] = kn.create_dir(
        run_parameters["results_directory"], 'tmp')

    phenotype_df = kn.get_spreadsheet_df(
        run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(
        run_parameters["spreadsheet_name_full_path"])
    phenotype_df = phenotype_df.T

    len_phenotype = len(phenotype_df.index)
    array_of_jobs = range(0, len_phenotype)

    for i in range(0, len_phenotype, max_cpu):
        jobs_id = array_of_jobs[i:i + max_cpu]
        number_of_jobs = len(jobs_id)

        zipped_arguments = dstutil.zip_parameters(run_parameters,
                                                  spreadsheet_df, phenotype_df,
                                                  jobs_id)
        dstutil.parallelize_processes_locally(run_correlation_worker,
                                              zipped_arguments, number_of_jobs)
    write_phenotype_data_all(run_parameters)
    kn.remove_dir(run_parameters["results_tmp_directory"])
Example #6
def run_lasso_predict(run_parameters):
    """ Use a cross-validated lasso model to predict response data from feature data.

    Args:
        run_parameters: dictionary of run parameters
    """

    gene_file = run_parameters['spreadsheet_name_full_path'     ]
    sign_file = run_parameters['response_name_full_path'        ]
    test_file = run_parameters['test_spreadsheet_name_full_path']

    gene_df   = kn.get_spreadsheet_df(gene_file)
    sign_df   = kn.get_spreadsheet_df(sign_file)
    test_df   = kn.get_spreadsheet_df(test_file)

    row_names = test_df.columns

    gene_mat  = gene_df.values
    sign_mat  = sign_df.values[0]
    test_mat  = test_df.values

    min_alpha     = run_parameters['min_alpha']
    max_alpha     = run_parameters['max_alpha']
    n_alpha       = run_parameters['n_alpha']
    intercept     = run_parameters['fit_intercept']
    normalization = run_parameters['normalize']
    max_iter      = run_parameters['max_iter']
    tolerance     = run_parameters['tolerance']

    alpha_grid    = np.linspace(min_alpha, max_alpha, num=n_alpha)
    reg_model     = linear_model.LassoCV(
        alphas=alpha_grid, fit_intercept=intercept, \
        normalize=normalization, max_iter=max_iter, tol=tolerance, cv=5)

    reg_model.fit( gene_mat.T, sign_mat)
    filename      = os.path.join(run_parameters['results_directory'], 'lasso_model.pkl') 
    pickle.dump(reg_model, open(filename, 'wb'))
    response_predict   = reg_model.predict(test_mat.T)
    predict_df         = pd.DataFrame(response_predict.T, index=row_names, columns=['predict'])
    write_predict_data(predict_df, run_parameters)
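
The alpha grid here is simply np.linspace(min_alpha, max_alpha, n_alpha) handed to LassoCV, which cross-validates over it. A self-contained sketch of the same pattern on synthetic data (omitting the normalize argument, which recent scikit-learn versions no longer accept):

import numpy as np
from sklearn import linear_model

rng = np.random.default_rng(0)
X = rng.normal(size=(60, 8))                  # 60 samples x 8 features
true_w = np.array([1.5, -2.0, 0, 0, 0.7, 0, 0, 0])
y = X @ true_w + 0.1 * rng.normal(size=60)    # sparse linear response

alpha_grid = np.linspace(0.001, 1.0, num=50)  # analogous to min_alpha..max_alpha
model = linear_model.LassoCV(alphas=alpha_grid, fit_intercept=True,
                             max_iter=2000, tol=1e-4, cv=5)
model.fit(X, y)

print("chosen alpha:", model.alpha_)
print("prediction for first sample:", model.predict(X[:1]))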
Example #7
def run_cc_net_similarity(run_parameters):
    """ wrapper: call sequence to perform signature analysis with
        random walk smoothing and bootstrapped similarity and save results.

    Args:
        run_parameters: parameter set dictionary.
    """
    tmp_dir = 'tmp_cc_similarity_'
    run_parameters = update_tmp_directory(run_parameters, tmp_dir)

    expression_name      = run_parameters["spreadsheet_name_full_path"]
    signature_name       = run_parameters["signature_name_full_path"  ]
    gg_network_name      = run_parameters['gg_network_name_full_path' ]
    similarity_measure   = run_parameters["similarity_measure"        ]
    number_of_bootstraps = run_parameters['number_of_bootstraps'      ]
    processing_method    = run_parameters['processing_method'         ]

    expression_df        = kn.get_spreadsheet_df(expression_name)
    signature_df         = kn.get_spreadsheet_df(signature_name )

    samples_names        = expression_df.columns
    signatures_names     =  signature_df.columns
    signatures_names     = [i.split('.')[0] for i in signatures_names]
    signature_df.columns = signatures_names

    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name)
    # network_mat                    = kn.normalize_sparse_mat_by_diagonal(network_mat)
    
    expression_df                  = kn.update_spreadsheet_df(expression_df, unique_gene_names)
    signature_df                   = kn.update_spreadsheet_df(signature_df, unique_gene_names)

    expression_mat                 = expression_df.values
    signature_mat                  = signature_df.values

    expression_mat, iterations = kn.smooth_matrix_with_rwr(expression_mat, network_mat, run_parameters)
    signature_mat,  iterations = kn.smooth_matrix_with_rwr(signature_mat,  network_mat, run_parameters)

    expression_df.iloc[:] = expression_mat
    signature_df.iloc[:]  = signature_mat

    if   processing_method == 'serial':
        for sample in range(0, number_of_bootstraps):
            run_cc_similarity_signature_worker(expression_df, signature_df, run_parameters, sample)

    elif processing_method == 'parallel':
        find_and_save_cc_similarity_parallel(expression_df, signature_df, run_parameters, number_of_bootstraps)

    else:
        raise ValueError('processing_method contains bad value.')

    # consensus_df = form_consensus_df(run_parameters, expression_df, signature_df)
    similarity_df = assemble_similarity_df(expression_df, signature_df, run_parameters)
    similarity_df  = pd.DataFrame(similarity_df.values, index=samples_names, columns=signatures_names)
    save_final_samples_signature(similarity_df, run_parameters)
    save_best_match_signature(similarity_df, run_parameters)

    kn.remove_dir(run_parameters["tmp_directory"])
def run_bootstrap_correlation(run_parameters):
    """ perform gene prioritization using bootstrap sampling

    Args:
        run_parameters: parameter set dictionary.
    """

    max_cpu = run_parameters["max_cpu"]
    run_parameters["results_tmp_directory"] = kn.create_dir(
        run_parameters["results_directory"], 'tmp')

    results_tmp_directory = run_parameters["results_tmp_directory"]
    n_bootstraps = run_parameters["number_of_bootstraps"]
    phenotype_df = kn.get_spreadsheet_df(
        run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(
        run_parameters["spreadsheet_name_full_path"])

    phenotype_df = phenotype_df.T

    #-----------------------------------------------------------------------------------------
    #   Partition the phenotype dataframe (partition size = MaxCPU)
    #-----------------------------------------------------------------------------------------
    len_phenotype = len(phenotype_df.index)
    array_of_jobs = range(0, len_phenotype)

    if (len_phenotype <= max_cpu):
        jobs_id = array_of_jobs
        number_of_jobs = len(jobs_id)
        #-----------------------------------------------------------------------------------------
        zipped_arguments = dstutil.zip_parameters(run_parameters,
                                                  spreadsheet_df, phenotype_df,
                                                  n_bootstraps, jobs_id)

        dstutil.parallelize_processes_locally(run_bootstrap_correlation_worker,
                                              zipped_arguments, number_of_jobs)

        write_phenotype_data_all(run_parameters)
        #-----------------------------------------------------------------------------------------

    else:
        for i in range(0, len_phenotype, max_cpu):
            jobs_id = array_of_jobs[i:i + max_cpu]
            number_of_jobs = len(jobs_id)
            #-----------------------------------------------------------------------------------------
            zipped_arguments = dstutil.zip_parameters(run_parameters,
                                                      spreadsheet_df, phenotype_df,
                                                      n_bootstraps, jobs_id)

            dstutil.parallelize_processes_locally(run_bootstrap_correlation_worker,
                                                  zipped_arguments, number_of_jobs)

        write_phenotype_data_all(run_parameters)
        #-----------------------------------------------------------------------------------------

    kn.remove_dir(results_tmp_directory)
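
The else branch partitions the phenotype columns into chunks of at most max_cpu jobs and launches one batch of workers per chunk. A small standalone sketch of that chunking pattern, with plain prints in place of the dstutil workers:

# Sketch of the max_cpu partitioning used above; the worker call is replaced by a print.
len_phenotype = 11          # e.g. eleven phenotype columns
max_cpu = 4                 # at most four parallel jobs per batch
array_of_jobs = range(0, len_phenotype)

for i in range(0, len_phenotype, max_cpu):
    jobs_id = array_of_jobs[i:i + max_cpu]
    number_of_jobs = len(jobs_id)
    print("batch:", list(jobs_id), "->", number_of_jobs, "jobs")
# batch: [0, 1, 2, 3] -> 4 jobs
# batch: [4, 5, 6, 7] -> 4 jobs
# batch: [8, 9, 10] -> 3 jobs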
def run_elastic_predict(run_parameters):
    ''' Use an elastic net model to predict response data from feature data

    Args:
        run_parameters: dictionary of run parameters
    '''
    gene_file = run_parameters['spreadsheet_name_full_path']
    sign_file = run_parameters['response_name_full_path']
    test_file = run_parameters['test_spreadsheet_name_full_path']

    gene_df = kn.get_spreadsheet_df(gene_file)
    sign_df = kn.get_spreadsheet_df(sign_file)
    test_df = kn.get_spreadsheet_df(test_file)

    row_names = test_df.columns

    gene_mat = gene_df.values
    sign_mat = sign_df.values[0]
    test_mat = test_df.values

    eps = run_parameters['eps']
    min_alpha = run_parameters['min_alpha']
    max_alpha = run_parameters['max_alpha']
    n_alpha = run_parameters['n_alpha']
    min_l1 = run_parameters['min_l1']
    max_l1 = run_parameters['max_l1']
    n_l1 = run_parameters['n_l1']
    intercept = run_parameters['fit_intercept']
    normalize = run_parameters['normalize']
    max_iter = run_parameters['max_iter']
    tolerance = run_parameters['tolerance']

    alpha_grid = np.linspace(min_alpha, max_alpha, num=n_alpha)
    l1_grid = np.linspace(min_l1, max_l1, num=n_l1)


    reg_model = linear_model.ElasticNetCV(
        l1_ratio=l1_grid, alphas=alpha_grid, fit_intercept=intercept, eps = eps,\
        normalize=normalize, max_iter=max_iter, tol=tolerance, cv=5)

    reg_model.fit(gene_mat.T, sign_mat)

    filename = os.path.join(run_parameters['results_directory'],
                            'elastic_net_model.pkl')
    pickle.dump(reg_model, open(filename, 'wb'))
    response_predict = reg_model.predict(test_mat.T)
    predict_df = pd.DataFrame(response_predict.T,
                              index=row_names,
                              columns=['predict'])
    write_predict_data(predict_df, run_parameters)
Example #10
def run_cc_similarity(run_parameters):
    """ Performs similarity analysis with bootstraps and saves the similarity matrix.

    Args:
        run_parameters: parameter set dictionary.
    """
    tmp_dir = 'tmp_cc_similarity'
    run_parameters = update_tmp_directory(run_parameters, tmp_dir)

    expression_name = run_parameters["spreadsheet_name_full_path"]
    signature_name = run_parameters["signature_name_full_path"]
    similarity_measure = run_parameters["similarity_measure"]
    number_of_bootstraps = run_parameters['number_of_bootstraps']
    processing_method = run_parameters['processing_method']

    expression_df = kn.get_spreadsheet_df(expression_name)
    signature_df = kn.get_spreadsheet_df(signature_name)

    samples_names = expression_df.columns
    signatures_names = signature_df.columns
    signatures_names = [i.split('.')[0] for i in signatures_names]
    signature_df.columns = signatures_names

    expression_mat = expression_df.values
    signature_mat = signature_df.values
    if processing_method == 'serial':
        for sample in range(0, number_of_bootstraps):
            run_cc_similarity_signature_worker(expression_df, signature_df,
                                               run_parameters, sample)

    elif processing_method == 'parallel':
        find_and_save_cc_similarity_parallel(expression_df, signature_df,
                                             run_parameters,
                                             number_of_bootstraps)

    else:
        raise ValueError('processing_method contains bad value.')

    # consensus_df = form_consensus_df(run_parameters, expression_df, signature_df)
    similarity_df = assemble_similarity_df(expression_df, signature_df,
                                           run_parameters)

    similarity_df = pd.DataFrame(similarity_df.values,
                                 index=samples_names,
                                 columns=signatures_names)
    save_final_samples_signature(similarity_df, run_parameters)
    save_best_match_signature(similarity_df, run_parameters)

    kn.remove_dir(run_parameters["tmp_directory"])
Example #11
def phenotype_expander(run_parameters):
    """ Run phenotype expander on the whole dataframe of phenotype data.
    Save the results to a tsv file.
    """
    phenotype_df = kn.get_spreadsheet_df(
        run_parameters['phenotype_name_full_path'])
    output_dict = run_pre_processing_phenotype_expander(
        phenotype_df, run_parameters['threshold'])

    result_df = pd.DataFrame(index=phenotype_df.index)

    for key, df_list in output_dict.items():
        if key == ColumnType.CATEGORICAL:
            for item in df_list:
                col_df = phenotype_df.loc[:, item.columns[0]].dropna()
                uniq_array = np.unique(col_df.values)
                col_names = [
                    item.columns[0] + '_' + str(i) for i in uniq_array
                ]
                cur_df = pd.DataFrame(columns=col_names, index=col_df.index)
                cur_append_df = pd.DataFrame(columns=col_names,
                                             index=phenotype_df.index)

                for i, val in enumerate(uniq_array):
                    cur_df.loc[col_df == val, col_names[i]] = 1
                    cur_df.loc[col_df != val, col_names[i]] = 0
                cur_append_df.loc[cur_df.index, :] = cur_df
                result_df = pd.concat([result_df, cur_append_df], axis=1)

    file_name = kn.create_timestamped_filename("phenotype_expander_result",
                                               "tsv")
    file_path = os.path.join(run_parameters["results_directory"], file_name)
    result_df.index.name = "sample_id"
    result_df.to_csv(file_path, header=True, index=True, sep='\t', na_rep='NA')
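
The categorical branch above builds one 0/1 indicator column per unique value of each categorical phenotype, re-aligned to the full sample index. A hedged sketch of the same expansion on a toy dataframe, using pandas directly; pd.get_dummies gives an equivalent result for the non-missing samples:

import numpy as np
import pandas as pd

phenotype_df = pd.DataFrame({"stage": ["I", "II", np.nan, "II"]},
                            index=["s1", "s2", "s3", "s4"])

col_df = phenotype_df["stage"].dropna()
expanded = pd.get_dummies(col_df, prefix="stage", prefix_sep="_", dtype=int)

# re-align to the full sample index so samples with a missing phenotype stay NaN
expanded = expanded.reindex(phenotype_df.index)
print(expanded)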
Example #12
def run_kaplan_meier(button):
    """ callback for kaplan_meier_execute_button """
    if get_km_file_button.file_selector.value == LIST_BOX_UPDATE_MESSAGE:
        if get_km_file_button.description == 'Clear':
            get_km_file_button.view_box.value = ''
            get_km_file_button.view_box.description = ''
            get_km_file_button.description = 'View'
        refresh_files_list(get_km_file_button)

        return

    if button.description == 'Clear':
        button.description = button.original_description
        button.im_view_box.value = BLAK_IMAGE
        button.view_box.value = ''
        return
    else:
        button.description = 'Clear'

    phenotype_df = kn.get_spreadsheet_df(
        os.path.join(input_data_dir, get_km_file_button.file_selector.value))
    cluster_id_name = button.cluster_id_listbox.value
    event_name = button.event_id_listbox.value
    time_name = button.time_id_listbox.value

    disp_kaplan_meier(phenotype_df, cluster_id_name, event_name, time_name,
                      button)
Example #13
def run_merge_df(run_parameters):
    """ Merge two phenotype matrices that correspond to same columns  (Union)
    Args:           run_parameters with keys:
                    "results_directory", "spreadsheet_1_file_name", "spreadsheet_2_file_name"
    """
    results_directory = run_parameters['results_directory']
    spreadsheet_1_file_name = run_parameters['spreadsheet_1_file_name']
    spreadsheet_2_file_name = run_parameters['spreadsheet_2_file_name']

    spreadsheet_1_df = kn.get_spreadsheet_df(spreadsheet_1_file_name)
    spreadsheet_2_df = kn.get_spreadsheet_df(spreadsheet_2_file_name)

    result_df = merge_df(spreadsheet_1_df, spreadsheet_2_df)
    transform_name = "merge"
    write_transform_df(result_df, spreadsheet_1_file_name, transform_name,
                       results_directory)
Example #14
def run_link_hclust(run_parameters):
    #-----------------------------------------------------
    """ wrapper: call sequence to perform hierchical clustering using linkage and save the results.

    Args:
        run_parameters: parameter set dictionary.
    """

    np.random.seed()

    nearest_neighbors = run_parameters['nearest_neighbors']
    number_of_clusters = run_parameters['number_of_clusters']
    affinity_metric = run_parameters['affinity_metric']
    linkage_criterion = run_parameters['linkage_criterion']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_mat = spreadsheet_df.values
    number_of_samples = spreadsheet_mat.shape[1]

    labels, distance_matrix = perform_link_hclust(spreadsheet_mat,
                                                  number_of_clusters,
                                                  nearest_neighbors,
                                                  affinity_metric,
                                                  linkage_criterion)

    sample_names = spreadsheet_df.columns

    save_clustering_scores(distance_matrix, sample_names, labels,
                           run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels,
                                          run_parameters)

    return labels
Example #15
def run_kmeans(run_parameters):
    #-----------------------------------------------------
    """ wrapper: call sequence to perform kmeans clustering and save the results.

    Args:
        run_parameters: parameter set dictionary.
    """

    number_of_clusters = run_parameters['number_of_clusters']

    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_mat_T = spreadsheet_df.values.T
    number_of_samples = spreadsheet_mat_T.shape[0]

    distance_matrix = pairwise_distances(spreadsheet_mat_T)
    labels = kn.perform_kmeans(spreadsheet_mat_T, number_of_clusters)
    sample_names = spreadsheet_df.columns

    save_clustering_scores(distance_matrix, sample_names, labels,
                           run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels,
                                          run_parameters)

    return labels
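
run_kmeans clusters samples (columns) after transposing the spreadsheet, and also computes the pairwise distance matrix used for the clustering scores. A minimal sketch of the same two steps with scikit-learn, assuming kn.perform_kmeans is essentially a KMeans fit_predict:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances

rng = np.random.default_rng(0)
spreadsheet_mat = rng.normal(size=(50, 12))      # 50 genes x 12 samples
spreadsheet_mat_T = spreadsheet_mat.T            # samples become rows

distance_matrix = pairwise_distances(spreadsheet_mat_T)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(spreadsheet_mat_T)

print(distance_matrix.shape)   # (12, 12)
print(labels)                  # one cluster id per sample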
def run_nmf(run_parameters):
    """ wrapper: call sequence to perform non-negative matrix factorization and write results.

    Args:
        run_parameters: parameter set dictionary.
    """
    number_of_clusters = run_parameters['number_of_clusters']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_mat = spreadsheet_df.values
    spreadsheet_mat = kn.get_quantile_norm_matrix(spreadsheet_mat)

    h_mat = kn.perform_nmf(spreadsheet_mat, run_parameters)

    linkage_matrix = np.zeros(
        (spreadsheet_mat.shape[1], spreadsheet_mat.shape[1]))
    sample_perm = np.arange(0, spreadsheet_mat.shape[1])
    linkage_matrix = kn.update_linkage_matrix(h_mat, sample_perm,
                                              linkage_matrix)
    labels = kn.perform_kmeans(linkage_matrix, number_of_clusters)

    sample_names = spreadsheet_df.columns
    save_consensus_clustering(linkage_matrix, sample_names, labels,
                              run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels,
                                          run_parameters)
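
kn.perform_nmf factorizes the quantile-normalized spreadsheet into non-negative factors; the H matrix is then turned into a linkage (co-cluster) matrix and clustered with k-means. A rough sketch of the factorization step alone with scikit-learn's NMF, offered only as an assumption about what perform_nmf computes:

import numpy as np
from sklearn.decomposition import NMF

rng = np.random.default_rng(0)
spreadsheet_mat = rng.random((100, 20))          # non-negative genes x samples

nmf = NMF(n_components=4, init="nndsvda", max_iter=500, random_state=0)
w_mat = nmf.fit_transform(spreadsheet_mat)       # genes x components
h_mat = nmf.components_                          # components x samples

# each sample's strongest component is a crude cluster assignment
print(np.argmax(h_mat, axis=0))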
Example #17
def reset_phenotype_cols_list(change):
    """ Reset the three parameters dropdown listboxes to a new file selection.
    Args:
        change:      IPywidgets widget control change event
    """
    if get_km_file_button.file_selector.value == LIST_BOX_UPDATE_MESSAGE:
        if get_km_file_button.description == 'Clear':
            get_km_file_button.view_box.value = ''
            get_km_file_button.view_box.description = ''
            get_km_file_button.description = 'View'
        refresh_files_list(get_km_file_button)

        return
    options_df = kn.get_spreadsheet_df(
        os.path.join(input_data_dir, get_km_file_button.file_selector.value))
    sorted_options_list = sorted(list(options_df.columns.values))
    if len(sorted_options_list) > 0:
        def_val = sorted_options_list[0]
    else:
        def_val = ''
    cluster_id_listbox.options = sorted_options_list
    cluster_id_listbox.value = def_val
    event_id_listbox.options = sorted_options_list
    event_id_listbox.value = def_val
    time_id_listbox.options = sorted_options_list
    time_id_listbox.value = def_val
Example #18
def combine_phenotype_data_and_clustering(run_parameters):
    """This is to insert the sample clusters column into the phenotype dataframe.

    Returns:
        phenotype_df: phenotype dataframe with the first column as sample clusters.
    """
    phenotype_df = kn.get_spreadsheet_df(
        run_parameters['phenotype_name_full_path'])

    phenotype_df.insert(0, 'Cluster_ID', np.nan)  # pylint: disable=no-member

    cluster_labels_df = pd.read_csv(
        run_parameters['cluster_mapping_full_path'],
        index_col=0,
        header=None,
        sep='\t')

    cluster_labels_df.columns = ['Cluster_ID']

    common_samples = kn.find_common_node_names(phenotype_df.index,
                                               cluster_labels_df.index)

    phenotype_df.loc[common_samples,
                     'Cluster_ID'] = cluster_labels_df.loc[common_samples,
                                                           'Cluster_ID']  # pylint: disable=no-member

    return phenotype_df
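
The alignment above relies on the sample names that the phenotype spreadsheet and the cluster-mapping file have in common. A small self-contained sketch of the same index-intersection assignment with pandas:

import numpy as np
import pandas as pd

phenotype_df = pd.DataFrame({"age": [61, 47, 55]}, index=["s1", "s2", "s3"])
cluster_labels_df = pd.DataFrame({"Cluster_ID": [2, 1]}, index=["s3", "s1"])

phenotype_df.insert(0, "Cluster_ID", np.nan)
common_samples = phenotype_df.index.intersection(cluster_labels_df.index)
phenotype_df.loc[common_samples, "Cluster_ID"] = cluster_labels_df.loc[common_samples, "Cluster_ID"]

print(phenotype_df)   # s2 keeps NaN because it has no cluster label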
Example #19
def run_GRN_lasso(run_parameters):
    """ Infer gene regulatory network coefficients: regress each gene's expression
        on a transcription-factor subset of the spreadsheet with lasso, min-max scale
        the coefficients, and write the result to a tsv file.

    Args:
        run_parameters: parameter set dictionary.
    """
    spreadsheet = kn.get_spreadsheet_df(
        run_parameters['spreadsheet_name_full_path'])
    gene_list = spreadsheet.index
    tf_idx = range(int(spreadsheet.shape[0] * 0.2))
    tf_spreadsheet = spreadsheet.iloc[tf_idx, :]

    result_df = pd.DataFrame(index=gene_list, columns=tf_spreadsheet.index)
    param_dict = {
        'n_alphas': 1000,
        'fit_intercept': run_parameters['fit_intercept'],
        'normalize': run_parameters['normalize'],
        'max_iter': 2000,
        'cv': 5
    }

    for i in range(spreadsheet.shape[0]):
        # curr_response = spreadsheet.values[i, :].reshape(-1,1)
        curr_response = spreadsheet.values[i, :].ravel()
        curr_model = algo_lasso(tf_spreadsheet.values.T, curr_response,
                                param_dict)
        coef = curr_model.coef_.ravel()
        # (x-min(x))/(max(x)-min(x))
        result_df.loc[gene_list[i], :] = (coef - min(coef)) / (max(coef) -
                                                               min(coef))

    file_path = os.path.join(run_parameters['results_directory'],
                             'GRN_coefficient_result.tsv')
    result_df.to_csv(file_path, header=True, index=True, sep='\t')
Example #20
def run_net_similarity(run_parameters):
    """ Run random walk first to smooth expression and signature 
    then perform similarity analysis and save the similarity matrix.

    Args:
        run_parameters: parameter set dictionary.
    """
    expression_name = run_parameters["spreadsheet_name_full_path"]
    signature_name = run_parameters["signature_name_full_path"]
    gg_network_name = run_parameters['gg_network_name_full_path']
    similarity_measure = run_parameters["similarity_measure"]

    expression_df = kn.get_spreadsheet_df(expression_name)
    signature_df = kn.get_spreadsheet_df(signature_name)

    samples_names = expression_df.columns
    signatures_names = signature_df.columns
    signatures_names = [i.split('.')[0] for i in signatures_names]
    signature_df.columns = signatures_names

    network_mat, unique_gene_names = kn.get_sparse_network_matrix(
        gg_network_name)
    # network_mat                    = kn.normalize_sparse_mat_by_diagonal(network_mat)

    expression_df = kn.update_spreadsheet_df(expression_df, unique_gene_names)
    signature_df = kn.update_spreadsheet_df(signature_df, unique_gene_names)

    expression_mat = expression_df.values
    signature_mat = signature_df.values

    expression_mat, iterations = kn.smooth_matrix_with_rwr(
        expression_mat, network_mat, run_parameters)
    signature_mat, iterations = kn.smooth_matrix_with_rwr(
        signature_mat, network_mat, run_parameters)

    expression_df.iloc[:] = expression_mat
    signature_df.iloc[:] = signature_mat

    similarity_mat = generate_similarity_mat(expression_df, signature_df,
                                             similarity_measure)
    # similarity_mat = map_similarity_range(similarity_mat, 0)
    similarity_df = pd.DataFrame(similarity_mat,
                                 index=samples_names,
                                 columns=signatures_names)

    save_final_samples_signature(similarity_df, run_parameters)
    save_best_match_signature(similarity_df, run_parameters)
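
generate_similarity_mat is not shown in these examples; when similarity_measure is a cosine-style metric, the result is a samples-by-signatures matrix of similarities between the columns of the two smoothed spreadsheets. A hedged stand-in using scikit-learn's cosine_similarity on toy data:

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.default_rng(0)
expression_df = pd.DataFrame(rng.normal(size=(30, 5)),
                             columns=["sample_%d" % i for i in range(5)])
signature_df = pd.DataFrame(rng.normal(size=(30, 3)),
                            columns=["sigA", "sigB", "sigC"])

# columns are samples / signatures, so transpose before comparing them row-wise
similarity_mat = cosine_similarity(expression_df.T, signature_df.T)
similarity_df = pd.DataFrame(similarity_mat,
                             index=expression_df.columns,
                             columns=signature_df.columns)
print(similarity_df.round(2))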
Example #21
def run_fisher(run_parameters):
    ''' wrapper: call sequence to perform fisher gene-set characterization
    Args:
        run_parameters: dictionary of run parameters
    '''
    # -----------------------------------
    # - Data read and extraction Section -
    # -----------------------------------
    spreadsheet_df = kn.get_spreadsheet_df(
        run_parameters['spreadsheet_name_full_path'])
    prop_gene_network_df = kn.get_network_df(
        run_parameters['pg_network_name_full_path'])

    spreadsheet_gene_names = kn.extract_spreadsheet_gene_names(spreadsheet_df)

    prop_gene_network_n1_names, \
    prop_gene_network_n2_names = kn.extract_network_node_names(prop_gene_network_df)
    # -----------------------------------------------------------------------
    # - limit the gene set to the intersection of network and user gene set -
    # -----------------------------------------------------------------------
    common_gene_names = kn.find_common_node_names(prop_gene_network_n2_names,
                                                  spreadsheet_gene_names)
    common_gene_names_dict = kn.create_node_names_dict(common_gene_names)
    prop_gene_network_n1_names_dict = kn.create_node_names_dict(
        prop_gene_network_n1_names)
    reverse_prop_dict = kn.create_reverse_node_names_dict(
        prop_gene_network_n1_names_dict)
    # ----------------------------------------------------------------------------
    # - restrict spreadsheet and network to common genes and drop everything else -
    # ----------------------------------------------------------------------------
    new_spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df,
                                                  common_gene_names)
    prop_gene_network_df = kn.update_network_df(prop_gene_network_df,
                                                common_gene_names, "node_2")
    prop_gene_network_df['wt'] = 1
    # ----------------------------------------------------------------------------
    # - map every gene name to an integer index in sequential order starting at 0 -
    # ----------------------------------------------------------------------------
    prop_gene_network_df = kn.map_node_names_to_index(
        prop_gene_network_df, prop_gene_network_n1_names_dict, "node_1")
    prop_gene_network_df = kn.map_node_names_to_index(prop_gene_network_df,
                                                      common_gene_names_dict,
                                                      "node_2")
    # --------------------------------------------
    # - store the network in a csr sparse format -
    # --------------------------------------------
    universe_count = len(common_gene_names)
    prop_gene_network_sparse = kn.convert_network_df_to_sparse(
        prop_gene_network_df, universe_count, len(prop_gene_network_n1_names))
    fisher_contingency_pval = get_fisher_exact_test(prop_gene_network_sparse,
                                                    reverse_prop_dict,
                                                    new_spreadsheet_df)
    fisher_final_result = save_fisher_test_result(
        fisher_contingency_pval, run_parameters['results_directory'],
        spreadsheet_df.columns.values, 2)
    map_and_save_droplist(spreadsheet_df, common_gene_names, 'fisher_droplist',
                          run_parameters)

    return fisher_final_result
def run_lasso_predict(run_parameters):
    """ Fit a lasso model with default settings and predict response data for a test spreadsheet.

    Args:
        run_parameters: dictionary of run parameters
    """
    gene_samples_train_df = kn.get_spreadsheet_df(
        run_parameters['spreadsheet_name_full_path'])
    response_train_df = kn.get_spreadsheet_df(
        run_parameters['response_name_full_path'])
    gene_samples_test_df = kn.get_spreadsheet_df(
        run_parameters['test_spreadsheet_name_full_path'])
    response_test_sample_names = list(gene_samples_test_df.columns)
    reg_moE = linear_model.Lasso()
    response_predict = reg_moE.fit(gene_samples_train_df.transpose().values,
                                   response_train_df.values[0]).predict(
                                       gene_samples_test_df.transpose().values)

    predict_df = pd.DataFrame(response_predict.T,
                              index=response_test_sample_names,
                              columns=['predict'])
    write_predict_data(predict_df, run_parameters)
def run_net_similarity(run_parameters):
    """ Run random walk first to smooth expression and signature 
    then perform similarity analysis and save the similarity matrix.

    Args:
        run_parameters: parameter set dictionary.
    """
    expression_name       = run_parameters["spreadsheet_name_full_path"]
    signature_name        = run_parameters["signature_name_full_path"  ]
    gg_network_name       = run_parameters['gg_network_name_full_path' ]
    similarity_measure    = run_parameters["similarity_measure"        ]

    expression_df         = kn.get_spreadsheet_df(expression_name)
    signature_df          = kn.get_spreadsheet_df( signature_name)

    expression_col_names  = expression_df.columns
    signature_col_names   =  signature_df.columns

    #---------------------
    network_mat,          \
    unique_gene_names     = kn.get_sparse_network_matrix(gg_network_name)
    expression_df         = kn.update_spreadsheet_df(expression_df, unique_gene_names)
    signature_df          = kn.update_spreadsheet_df( signature_df, unique_gene_names)
    #---------------------

    expression_mat        = expression_df.values
    signature_mat         =  signature_df.values

    expression_mat,       \
    iterations            = kn.smooth_matrix_with_rwr(expression_mat, network_mat, run_parameters)

    signature_mat,        \
    iterations            = kn.smooth_matrix_with_rwr( signature_mat, network_mat, run_parameters)

    expression_df.iloc[:] = expression_mat
    signature_df.iloc [:] = signature_mat

    # ---------------------------------------------
    similarity_mat        = generate_similarity_mat(expression_df, signature_df,similarity_measure)
    # ---------------------------------------------


    similarity_df  = pd.DataFrame( similarity_mat, index = expression_col_names, columns = signature_col_names )
    save_final_expression_signature( similarity_df,  run_parameters                                            )
    save_best_match_signature      ( similarity_df,  run_parameters                                            )
def view_spreadsheet_file_head(full_file_name):
    """ notebook convenience """
    if os.path.isfile(full_file_name):
        sp_df = kn.get_spreadsheet_df(full_file_name)
        deNada, f_name = os.path.split(full_file_name)
        print(f_name, ' size:', sp_df.shape)
        display(sp_df.head(10))
    else:
        print('file not found on local path')
def run_svr_predict(run_parameters):
    ''' Use an SVR model to predict response data from feature data

    Args:
        run_parameters: dictionary of run parameters
    '''
    gene_file = run_parameters['spreadsheet_name_full_path']
    sign_file = run_parameters['response_name_full_path']
    test_file = run_parameters['test_spreadsheet_name_full_path']

    gene_df = kn.get_spreadsheet_df(gene_file)
    sign_df = kn.get_spreadsheet_df(sign_file)
    test_df = kn.get_spreadsheet_df(test_file)

    row_names = test_df.columns

    gene_mat = gene_df.values
    sign_mat = sign_df.values[0]
    test_mat = test_df.values

    svr_kernel = run_parameters['svr_kernel']
    p_grid    = {'svr_degree': 3, 'svr_gamma': 'auto', 'svr_coef0': 0.0, \
                'svr_tol': 0.001, 'svr_C': 1.0, 'svr_epsilon': 0.1, \
                'svr_shrinking': True, 'svr_cache_size':200, \
                'svr_verbose': False, 'svr_max_iter': -1}

    for k in p_grid:
        if k in run_parameters:
            p_grid[k] = run_parameters[k]

    reg_model = SVR(kernel=svr_kernel, degree=p_grid['svr_degree'], gamma=p_grid['svr_gamma'], \
        coef0=p_grid['svr_coef0'], tol=p_grid['svr_tol'], C=p_grid['svr_C'], epsilon=p_grid['svr_epsilon'], \
        shrinking=p_grid['svr_shrinking'], cache_size=p_grid['svr_cache_size'], verbose=p_grid['svr_verbose'], \
        max_iter=p_grid['svr_max_iter'])

    reg_model.fit(gene_mat.T, sign_mat)
    filename = os.path.join(run_parameters['results_directory'],
                            'svr_model.pkl')
    pickle.dump(reg_model, open(filename, 'wb'))
    response_predict = reg_model.predict(test_mat.T)
    predict_df = pd.DataFrame(response_predict.T,
                              index=row_names,
                              columns=['predict'])
    write_predict_data(predict_df, run_parameters)
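
The p_grid dictionary holds SVR defaults and is meant to be overridden by any matching keys supplied in run_parameters. A short sketch of that override-then-fit pattern on synthetic data:

import numpy as np
from sklearn.svm import SVR

rng = np.random.default_rng(0)
X = rng.normal(size=(40, 6))
y = X[:, 0] - 0.5 * X[:, 3] + 0.05 * rng.normal(size=40)

p_grid = {"svr_C": 1.0, "svr_epsilon": 0.1, "svr_tol": 0.001}
run_parameters = {"svr_C": 10.0}            # hypothetical user override

for k in p_grid:
    if k in run_parameters:
        p_grid[k] = run_parameters[k]       # user value wins over the default

reg_model = SVR(kernel="rbf", C=p_grid["svr_C"],
                epsilon=p_grid["svr_epsilon"], tol=p_grid["svr_tol"])
reg_model.fit(X, y)
print(reg_model.predict(X[:3]))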
Example #26
def run_DRaWR(run_parameters):
    ''' wrapper: call sequence to perform random walk with restart
    Args:
        run_parameters: dictionary of run parameters
    '''

    network_sparse, unique_gene_names, \
    pg_network_n1_names = build_hybrid_sparse_matrix(run_parameters, True, True)

    unique_all_node_names = unique_gene_names + pg_network_n1_names
    spreadsheet_df = kn.get_spreadsheet_df(
        run_parameters['spreadsheet_name_full_path'])
    new_spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df,
                                                  unique_all_node_names)

    unique_genes_length = len(unique_gene_names)
    property_length = len(set(pg_network_n1_names))
    base_col = np.append(np.ones(unique_genes_length, dtype=int),
                         np.zeros(property_length, dtype=int))

    new_spreadsheet_df = kn.append_column_to_spreadsheet(
        new_spreadsheet_df, base_col, 'base')
    hetero_network = normalize(network_sparse, norm='l1', axis=0)

    final_spreadsheet_matrix, step = kn.smooth_matrix_with_rwr(
        normalize(new_spreadsheet_df, norm='l1', axis=0), hetero_network,
        run_parameters)

    final_spreadsheet_df = pd.DataFrame(final_spreadsheet_matrix)
    final_spreadsheet_df.index = new_spreadsheet_df.index.values
    final_spreadsheet_df.columns = new_spreadsheet_df.columns.values
    prop_spreadsheet_df = rank_drawr_property(final_spreadsheet_df,
                                              pg_network_n1_names)

    spreadsheet_df_mask = final_spreadsheet_df.loc[
        final_spreadsheet_df.index.isin(spreadsheet_df.index)]
    gene_result_df = construct_drawr_result_df(spreadsheet_df_mask, 0,
                                               spreadsheet_df_mask.shape[0],
                                               True, run_parameters)
    prop_result_df = construct_drawr_result_df(final_spreadsheet_df,
                                               unique_genes_length,
                                               final_spreadsheet_df.shape[0],
                                               False, run_parameters)

    save_timestamped_df(prop_spreadsheet_df,
                        run_parameters['results_directory'],
                        'DRaWR_ranked_by_property')
    save_timestamped_df(gene_result_df, run_parameters['results_directory'],
                        'DRaWR_sorted_by_gene_score')
    save_timestamped_df(prop_result_df, run_parameters['results_directory'],
                        'DRaWR_sorted_by_property_score')

    map_and_save_droplist(spreadsheet_df, unique_gene_names, 'DRaWR_droplist',
                          run_parameters)

    return prop_spreadsheet_df
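
kn.smooth_matrix_with_rwr applies random walk with restart on the column-normalized hybrid network; the usual update is f_next = (1 - r) * W @ f + r * f0, iterated until convergence. A minimal numpy sketch of that iteration, assuming a restart probability r and a column-normalized adjacency matrix W (the toolbox's exact convergence rule is not shown here):

import numpy as np

def smooth_with_rwr(f0, w, restart_prob=0.5, tol=1e-8, max_iter=100):
    """Random walk with restart: iterate f <- (1 - r) * W f + r * f0 until converged."""
    f = f0.copy()
    for step in range(max_iter):
        f_next = (1.0 - restart_prob) * (w @ f) + restart_prob * f0
        if np.linalg.norm(f_next - f, ord=1) < tol:
            return f_next, step
        f = f_next
    return f, max_iter

# toy 3-node network, column-normalized, with the restart vector seeded on node 0
w = np.array([[0.0, 0.5, 0.5],
              [0.5, 0.0, 0.5],
              [0.5, 0.5, 0.0]])
f0 = np.array([1.0, 0.0, 0.0])
smoothed, steps = smooth_with_rwr(f0, w, restart_prob=0.5)
print(smoothed, steps)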
Example #27
def run_bootstrap_correlation(run_parameters):
    """ perform feature prioritization using bootstrap sampling

    Args:
        run_parameters: parameter set dictionary.
    """
    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')

    phenotype_df        = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df      = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    phenotype_df        = phenotype_df.T
    n_bootstraps        = run_parameters["number_of_bootstraps"]
    number_of_jobs      = len(phenotype_df.index)
    jobs_id             = range(0, number_of_jobs)
    zipped_arguments    = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df, n_bootstraps, jobs_id)

    dstutil.parallelize_processes_locally(run_bootstrap_correlation_worker, zipped_arguments, number_of_jobs)
    write_phenotype_data_all(run_parameters)
    kn.remove_dir(run_parameters["results_tmp_directory"])
Example #28
def run_spreadsheet_numerical_transform(run_parameters):
    """ numerical transformation of dataframe

    Args:           run_parameters with keys:
                    "results_directory", "spreadsheet_file_name", "numeric_function", (with corresponding options):
                    (z_transform_axis, z_transform_ddof)
                    (log_transform_log_base, log_transform_log_offset_
                    (threshold_cut_off, threshold_substitution_value, threshold_scope)
    """
    results_directory = run_parameters['results_directory']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']
    numeric_function = run_parameters['numeric_function']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)

    if numeric_function == 'abs':
        spreadsheet_df = abs_df(spreadsheet_df)
        transform_name = "absolute_value"

    elif numeric_function == 'z_transform':
        z_transform_axis = run_parameters['z_transform_axis']
        z_transform_ddof = run_parameters['z_transform_ddof']
        spreadsheet_df = z_transform_df(spreadsheet_df,
                                        axis=z_transform_axis,
                                        ddof=z_transform_ddof)
        transform_name = 'z_transform'

    elif numeric_function == 'log_transform':
        log_transform_log_base = run_parameters['log_transform_log_base']
        if log_transform_log_base == "e":
            log_transform_log_base = np.exp(1)
        log_transform_log_offset = run_parameters['log_transform_log_offset']
        spreadsheet_df = log_transform_df(spreadsheet_df,
                                          log_base=log_transform_log_base,
                                          log_offset=log_transform_log_offset)
        transform_name = 'log_transform'

    elif numeric_function == 'threshold':
        threshold_cut_off = run_parameters['threshold_cut_off']
        threshold_substitution_value = run_parameters[
            'threshold_substitution_value']
        threshold_scope = run_parameters['threshold_scope']
        spreadsheet_df = threshold_df(spreadsheet_df,
                                      cut_off=threshold_cut_off,
                                      sub_val=threshold_substitution_value,
                                      scope=threshold_scope)
        transform_name = 'threshold'

    else:
        return

    write_transform_df(spreadsheet_df, spreadsheet_name_full_path,
                       transform_name, results_directory)
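
The z-transform and log-transform branches delegate to helpers not shown here; on a plain dataframe they amount to standard-scoring along an axis and taking logs with an offset. A small hedged sketch of what those helpers presumably compute:

import numpy as np
import pandas as pd

df = pd.DataFrame({"s1": [1.0, 4.0, 9.0], "s2": [2.0, 8.0, 18.0]},
                  index=["g1", "g2", "g3"])

# z-transform along rows (axis=1): subtract the row mean, divide by the row std
z_df = df.sub(df.mean(axis=1), axis=0).div(df.std(axis=1, ddof=1), axis=0)

# log-transform with an offset; base "e" maps to np.exp(1) as in the branch above
log_df = np.log(df + 0.5) / np.log(np.exp(1))

print(z_df)
print(log_df)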
Example #29
def run_common_samples_df(run_parameters):
    """ Make two spreadsheets consistent by samples: two new spreadsheets created
        with samples being the intersection of sample sets of given spreadsheets.

    Args:           run_parameters with keys:
                    "results_directory", "spreadsheet_1_file_name", "spreadsheet_2_file_name"
    """
    results_directory = run_parameters['results_directory']
    spreadsheet_1_file_name = run_parameters['spreadsheet_1_file_name']
    spreadsheet_2_file_name = run_parameters['spreadsheet_2_file_name']

    spreadsheet_1_df = kn.get_spreadsheet_df(spreadsheet_1_file_name)
    spreadsheet_2_df = kn.get_spreadsheet_df(spreadsheet_2_file_name)

    spreadsheet_1_df, spreadsheet_2_df = common_samples_df(
        spreadsheet_1_df, spreadsheet_2_df)

    transform_name = "common_samples"
    write_transform_df(spreadsheet_1_df, spreadsheet_1_file_name,
                       transform_name, results_directory)
    write_transform_df(spreadsheet_2_df, spreadsheet_2_file_name,
                       transform_name, results_directory)
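
common_samples_df is not shown, but making two spreadsheets consistent by samples is just restricting both to the intersection of their column names. A minimal pandas sketch:

import pandas as pd

spreadsheet_1_df = pd.DataFrame([[1, 2, 3]], index=["g1"], columns=["s1", "s2", "s3"])
spreadsheet_2_df = pd.DataFrame([[7, 8]], index=["p1"], columns=["s2", "s4"])

common_samples = spreadsheet_1_df.columns.intersection(spreadsheet_2_df.columns)
spreadsheet_1_df = spreadsheet_1_df[common_samples]
spreadsheet_2_df = spreadsheet_2_df[common_samples]

print(list(common_samples))   # ['s2']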
def run_cc_nmf(run_parameters):
    """ wrapper: call sequence to perform non-negative matrix factorization with
        consensus clustering and write results.

    Args:
        run_parameters: parameter set dictionary.
    """
    tmp_dir = 'tmp_cc_nmf'
    run_parameters = update_tmp_directory(run_parameters, tmp_dir)

    processing_method = run_parameters['processing_method']
    number_of_bootstraps = run_parameters['number_of_bootstraps']
    number_of_clusters = run_parameters['number_of_clusters']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_mat = spreadsheet_df.values
    spreadsheet_mat = kn.get_quantile_norm_matrix(spreadsheet_mat)
    number_of_samples = spreadsheet_mat.shape[1]

    if processing_method == 'serial':
        for sample in range(0, number_of_bootstraps):
            run_cc_nmf_clusters_worker(spreadsheet_mat, run_parameters, sample)

    elif processing_method == 'parallel':
        find_and_save_cc_nmf_clusters_parallel(spreadsheet_mat, run_parameters,
                                               number_of_bootstraps)

    elif processing_method == 'distribute':
        func_args = [spreadsheet_mat, run_parameters]
        dependency_list = [
            run_cc_nmf_clusters_worker, save_a_clustering_to_tmp,
            dstutil.determine_parallelism_locally
        ]
        dstutil.execute_distribute_computing_job(
            run_parameters['cluster_ip_address'], number_of_bootstraps,
            func_args, find_and_save_cc_nmf_clusters_parallel, dependency_list)
    else:
        raise ValueError('processing_method contains bad value.')

    consensus_matrix = form_consensus_matrix(run_parameters, number_of_samples)
    labels = kn.perform_kmeans(consensus_matrix, number_of_clusters)

    sample_names = spreadsheet_df.columns
    save_consensus_clustering(consensus_matrix, sample_names, labels,
                              run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels,
                                          run_parameters)

    kn.remove_dir(run_parameters["tmp_directory"])