Example #1
0
def combine_phenotype_data_and_clustering(run_parameters):
    """Attach the sample cluster assignments to the phenotype spreadsheet.

    The cluster mapping file is read as tab-separated text without a header
    row; its first column is used as the index (sample names) and its
    remaining column is taken as the cluster label.

    Args:
        run_parameters: dictionary of run parameters; the keys
            'phenotype_name_full_path' and 'cluster_mapping_full_path'
            are read.

    Returns:
        phenotype_df: phenotype dataframe whose first column, 'Cluster_ID',
            holds the cluster label of every sample found in both files
            and NaN for the others.
    """
    phenotype_path = run_parameters['phenotype_name_full_path']
    phenotype_df = kn.get_spreadsheet_df(phenotype_path)

    # Begin with an all-NaN column so samples without a label stay NaN.
    phenotype_df.insert(0, 'Cluster_ID', np.nan)  # pylint: disable=no-member

    labels_df = pd.read_csv(run_parameters['cluster_mapping_full_path'],
                            sep='\t',
                            header=None,
                            index_col=0)
    labels_df.columns = ['Cluster_ID']

    shared_samples = kn.find_common_node_names(phenotype_df.index,
                                               labels_df.index)

    # Copy the labels across only for samples present in both tables.
    shared_labels = labels_df.loc[shared_samples, 'Cluster_ID']  # pylint: disable=no-member
    phenotype_df.loc[shared_samples, 'Cluster_ID'] = shared_labels

    return phenotype_df
def generate_similarity_mat(expression_df, signature_df, similarity_measure):
    """Generate the matrix of similarity values between two dataframes.

    Only the genes (row labels) present in both dataframes are used.

    Args:
        expression_df:      genes x samples dataframe.
        signature_df:       genes x samples dataframe.
        similarity_measure: one of "cosine", "spearman" or "pearson".

    Returns:
        similarity_mat: matrix with similarity values. For "cosine" and
            "spearman" it has one row per expression_df column and one
            column per signature_df column; for "pearson" it is whatever
            get_pearsonr returns.

    Raises:
        ValueError: if similarity_measure is not a supported name.
    """
    supported_measures = ("cosine", "spearman", "pearson")
    # Fail fast with a clear message; previously an unknown measure only
    # surfaced as an unbound local variable at the return statement.
    if similarity_measure not in supported_measures:
        raise ValueError(
            "unsupported similarity_measure {!r}; expected one of {}".format(
                similarity_measure, ", ".join(supported_measures)))

    common_genes = kn.find_common_node_names(expression_df.index,
                                             signature_df.index)

    expression_mat = expression_df.loc[common_genes, :].values
    signature_mat = signature_df.loc[common_genes, :].values

    if similarity_measure == "cosine":
        return cosine_similarity(expression_mat.T, signature_mat.T)

    if similarity_measure == "pearson":
        return get_pearsonr(expression_mat, signature_mat)

    # "spearman": spearmanr correlates the columns of both inputs together,
    # so the expression-vs-signature values are the upper-right block of
    # the combined correlation matrix.
    import numpy as np  # function-scope import; only this branch needs it

    nx = expression_mat.shape[1]
    rho = np.asarray(spearmanr(expression_mat, signature_mat)[0])
    if rho.ndim == 0:
        # With only two variables in total spearmanr returns a bare
        # coefficient; rebuild the 2 x 2 matrix so the slice below applies.
        coefficient = float(rho)
        rho = np.array([[1.0, coefficient], [coefficient, 1.0]])

    return rho[0:nx, nx:]
Example #3
0
def run_fisher(run_parameters):
    """Wrapper: call sequence to perform fisher gene-set characterization.

    Args:
        run_parameters: dictionary of run parameters; the keys
            'spreadsheet_name_full_path', 'pg_network_name_full_path' and
            'results_directory' are read here, and the whole dictionary is
            passed on to map_and_save_droplist.

    Returns:
        The value returned by save_fisher_test_result.
    """
    # --- read the user spreadsheet and the property-gene network ---
    user_df = kn.get_spreadsheet_df(
        run_parameters['spreadsheet_name_full_path'])
    network_df = kn.get_network_df(run_parameters['pg_network_name_full_path'])

    user_genes = kn.extract_spreadsheet_gene_names(user_df)
    node_1_names, node_2_names = kn.extract_network_node_names(network_df)

    # --- limit the gene set to the intersection of network and user genes ---
    shared_genes = kn.find_common_node_names(node_2_names, user_genes)
    shared_genes_index = kn.create_node_names_dict(shared_genes)
    node_1_index = kn.create_node_names_dict(node_1_names)
    node_1_reverse_index = kn.create_reverse_node_names_dict(node_1_index)

    # --- restrict spreadsheet and network to the shared genes only ---
    restricted_df = kn.update_spreadsheet_df(user_df, shared_genes)
    network_df = kn.update_network_df(network_df, shared_genes, "node_2")
    network_df['wt'] = 1

    # --- map every gene name to an integer index, in order, starting at 0 ---
    network_df = kn.map_node_names_to_index(network_df, node_1_index,
                                            "node_1")
    network_df = kn.map_node_names_to_index(network_df, shared_genes_index,
                                            "node_2")

    # --- store the network in a sparse format ---
    network_sparse = kn.convert_network_df_to_sparse(network_df,
                                                     len(shared_genes),
                                                     len(node_1_names))

    # --- run the test, then save the results and the droplist ---
    pvalues = get_fisher_exact_test(network_sparse, node_1_reverse_index,
                                    restricted_df)
    final_result = save_fisher_test_result(pvalues,
                                           run_parameters['results_directory'],
                                           user_df.columns.values,
                                           2)
    map_and_save_droplist(user_df, shared_genes, 'fisher_droplist',
                          run_parameters)

    return final_result
Example #4
0
def select_genes_df(spreadsheet_df, gene_select_list):
    """ Keep only the spreadsheet rows whose gene name is in a given gene set.
    Args:
        spreadsheet_df:             genes x samples data frame
        gene_select_list:           list of some gene names in the spreadsheet
    Returns:
        spreadsheet_intersected_df: data frame restricted to the gene names found both in
                                    the spreadsheet and in gene_select_list.
    """
    shared_genes = kn.find_common_node_names(
        kn.extract_spreadsheet_gene_names(spreadsheet_df), gene_select_list)
    spreadsheet_intersected_df = spreadsheet_df.loc[shared_genes]
    return spreadsheet_intersected_df
Example #5
0
def common_samples_df(sxp_1_df, sxp_2_df):
    """ Trim two spreadsheets to the sample names they have in common.
    Args:
        sxp_1_df:      samples x phenotypes dataframe (sxp_1_df = kn.get_spreadsheet_df(sxp_filename_1))
        sxp_2_df:      samples x phenotypes dataframe
    Returns:
        sxp_1_trim_df: rows of sxp_1_df for the sample names present in both input dataframes
        sxp_2_trim_df: rows of sxp_2_df for the sample names present in both input dataframes
    """
    # The row labels are sample names here, although the kn helper that
    # extracts them is named after genes.
    shared_samples = kn.find_common_node_names(
        kn.extract_spreadsheet_gene_names(sxp_1_df),
        kn.extract_spreadsheet_gene_names(sxp_2_df))

    sxp_1_trim_df = sxp_1_df.loc[shared_samples]
    sxp_2_trim_df = sxp_2_df.loc[shared_samples]
    return sxp_1_trim_df, sxp_2_trim_df
 def test_find_common_node_names(self):
     """Check that the names common to self.list_1 and self.list_2 are exactly {1}."""
     common = kn.find_common_node_names(self.list_1, self.list_2)
     # Compare the sets themselves rather than a precomputed boolean so a
     # failure reports the intersection that was actually returned.
     self.assertEqual(set(common), {1}, 'incorrect intersection')