def combine_phenotype_data_and_clustering(run_parameters):
    """Insert the sample cluster labels as the first column of the phenotype dataframe.

    Args:
        run_parameters: dictionary with 'phenotype_name_full_path' and
            'cluster_mapping_full_path' keys.

    Returns:
        phenotype_df: phenotype dataframe whose first column ('Cluster_ID') holds
            the sample cluster labels.
    """
    phenotype_df = kn.get_spreadsheet_df(run_parameters['phenotype_name_full_path'])
    phenotype_df.insert(0, 'Cluster_ID', np.nan)  # pylint: disable=no-member

    cluster_labels_df = pd.read_csv(
        run_parameters['cluster_mapping_full_path'], index_col=0, header=None, sep='\t')
    cluster_labels_df.columns = ['Cluster_ID']

    common_samples = kn.find_common_node_names(phenotype_df.index, cluster_labels_df.index)
    phenotype_df.loc[common_samples, 'Cluster_ID'] = \
        cluster_labels_df.loc[common_samples, 'Cluster_ID']  # pylint: disable=no-member

    return phenotype_df
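# --- Usage sketch (illustrative, not from the original source) ----------------
# Assumes numpy, pandas and `import knpackage.toolbox as kn` are available in the
# surrounding module; the file paths below are hypothetical placeholders.
clustering_run_parameters = {
    'phenotype_name_full_path': 'data/phenotype.tsv',
    'cluster_mapping_full_path': 'results/samples_label_by_cluster.tsv',
}
phenotype_with_clusters_df = combine_phenotype_data_and_clustering(clustering_run_parameters)
# 'Cluster_ID' now holds the cluster label for every sample present in both
# files, and NaN for samples without a cluster assignment.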
def generate_similarity_mat(expression_df, signature_df, similarity_measure):
    """Generate a matrix holding the similarity values between the two input dataframes.

    Args:
        expression_df: genes x samples dataframe.
        signature_df: genes x samples dataframe.
        similarity_measure: one of "cosine", "spearman" or "pearson".

    Returns:
        similarity_mat: matrix of similarity values between the columns of
            expression_df (rows) and the columns of signature_df (columns).
    """
    genes_in_expression = expression_df.index
    genes_in_signature = signature_df.index

    common_genes = kn.find_common_node_names(genes_in_expression, genes_in_signature)
    expression_mat = expression_df.loc[common_genes, :].values
    signature_mat = signature_df.loc[common_genes, :].values
    nx = expression_mat.shape[1]

    if similarity_measure == "cosine":
        similarity_mat = cosine_similarity(expression_mat.T, signature_mat.T)
    elif similarity_measure == "spearman":
        # spearmanr on the stacked columns returns one square correlation matrix;
        # keep only the expression-vs-signature block.
        similarity_mat = spearmanr(expression_mat, signature_mat)[0]
        similarity_mat = similarity_mat[0:nx, nx:]
    elif similarity_measure == "pearson":
        similarity_mat = get_pearsonr(expression_mat, signature_mat)

    return similarity_mat
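# --- Usage sketch (illustrative, not from the original source) ----------------
# Assumes pandas, sklearn.metrics.pairwise.cosine_similarity and
# `import knpackage.toolbox as kn` are importable; the toy data are hypothetical.
import pandas as pd

toy_expression_df = pd.DataFrame(
    [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
    index=['gene_a', 'gene_b', 'gene_c'], columns=['sample_1', 'sample_2'])
toy_signature_df = pd.DataFrame(
    [[1.0], [0.0], [2.0]],
    index=['gene_a', 'gene_b', 'gene_c'], columns=['signature_1'])

# 2 x 1 matrix of cosine similarities computed on the shared genes
toy_similarity_mat = generate_similarity_mat(toy_expression_df, toy_signature_df, "cosine")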
def run_fisher(run_parameters):
    ''' wrapper: call sequence to perform fisher gene-set characterization

    Args:
        run_parameters: dictionary of run parameters
    '''
    # -----------------------------------
    # - Data read and extraction Section -
    # -----------------------------------
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters['spreadsheet_name_full_path'])
    prop_gene_network_df = kn.get_network_df(run_parameters['pg_network_name_full_path'])

    spreadsheet_gene_names = kn.extract_spreadsheet_gene_names(spreadsheet_df)
    prop_gene_network_n1_names, \
        prop_gene_network_n2_names = kn.extract_network_node_names(prop_gene_network_df)

    # -----------------------------------------------------------------------
    # - limit the gene set to the intersection of network and user gene set -
    # -----------------------------------------------------------------------
    common_gene_names = kn.find_common_node_names(prop_gene_network_n2_names,
                                                  spreadsheet_gene_names)
    common_gene_names_dict = kn.create_node_names_dict(common_gene_names)
    prop_gene_network_n1_names_dict = kn.create_node_names_dict(prop_gene_network_n1_names)
    reverse_prop_dict = kn.create_reverse_node_names_dict(prop_gene_network_n1_names_dict)

    # ----------------------------------------------------------------------------
    # - restrict spreadsheet and network to common genes and drop everything else -
    # ----------------------------------------------------------------------------
    new_spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df, common_gene_names)
    prop_gene_network_df = kn.update_network_df(prop_gene_network_df, common_gene_names, "node_2")
    prop_gene_network_df['wt'] = 1

    # -----------------------------------------------------------------------------
    # - map every gene name to an integer index in sequential order starting at 0 -
    # -----------------------------------------------------------------------------
    prop_gene_network_df = kn.map_node_names_to_index(
        prop_gene_network_df, prop_gene_network_n1_names_dict, "node_1")
    prop_gene_network_df = kn.map_node_names_to_index(
        prop_gene_network_df, common_gene_names_dict, "node_2")

    # --------------------------------------------
    # - store the network in a csr sparse format -
    # --------------------------------------------
    universe_count = len(common_gene_names)
    prop_gene_network_sparse = kn.convert_network_df_to_sparse(
        prop_gene_network_df, universe_count, len(prop_gene_network_n1_names))

    fisher_contingency_pval = get_fisher_exact_test(
        prop_gene_network_sparse, reverse_prop_dict, new_spreadsheet_df)
    fisher_final_result = save_fisher_test_result(
        fisher_contingency_pval, run_parameters['results_directory'],
        spreadsheet_df.columns.values, 2)
    map_and_save_droplist(spreadsheet_df, common_gene_names, 'fisher_droplist', run_parameters)

    return fisher_final_result
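# --- Usage sketch (illustrative, not from the original source) ----------------
# The dictionary keys mirror the ones read inside run_fisher; the file paths and
# results directory below are hypothetical placeholders.
fisher_run_parameters = {
    'spreadsheet_name_full_path': 'data/user_gene_sets.tsv',
    'pg_network_name_full_path': 'data/property_gene_network.edge',
    'results_directory': 'results',
}
fisher_final_result_df = run_fisher(fisher_run_parameters)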
def select_genes_df(spreadsheet_df, gene_select_list):
    """ Subset the spreadsheet to a given gene set; the output has fewer rows.

    Args:
        spreadsheet_df: genes x samples data frame.
        gene_select_list: list of gene names, some of which appear in the spreadsheet.

    Returns:
        spreadsheet_intersected_df: data frame restricted to the genes in the
            intersection of the two input gene name collections.
    """
    gene_names = kn.extract_spreadsheet_gene_names(spreadsheet_df)
    intersection_names = kn.find_common_node_names(gene_names, gene_select_list)

    return spreadsheet_df.loc[intersection_names]
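# --- Usage sketch (illustrative, not from the original source) ----------------
# Assumes pandas and `import knpackage.toolbox as kn`; the toy spreadsheet and
# gene list are hypothetical.
import pandas as pd

toy_spreadsheet_df = pd.DataFrame(
    [[0, 1], [1, 0], [1, 1]],
    index=['gene_a', 'gene_b', 'gene_c'], columns=['sample_1', 'sample_2'])
subset_df = select_genes_df(toy_spreadsheet_df, ['gene_a', 'gene_c', 'gene_not_present'])
# subset_df keeps only the rows for gene_a and gene_c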
def common_samples_df(sxp_1_df, sxp_2_df):
    """ Make two spreadsheets consistent by samples: two new spreadsheets are created
        whose samples are the intersection of the sample sets of the given spreadsheets.

    Args:
        sxp_1_df: samples x phenotypes dataframe.
                  (sxp_1_df = kn.get_spreadsheet_df(sxp_filename_1))
        sxp_2_df: samples x phenotypes dataframe.

    Returns:
        sxp_1_trim_df: samples x phenotypes with only the sample names found in both inputs.
        sxp_2_trim_df: samples x phenotypes with only the sample names found in both inputs.
    """
    # the rows of these spreadsheets are samples, so the extracted "gene names"
    # are in fact the sample names
    sxp_1_gene_names = kn.extract_spreadsheet_gene_names(sxp_1_df)
    sxp_2_gene_names = kn.extract_spreadsheet_gene_names(sxp_2_df)

    common_samples_list = kn.find_common_node_names(sxp_1_gene_names, sxp_2_gene_names)

    return sxp_1_df.loc[common_samples_list], sxp_2_df.loc[common_samples_list]
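# --- Usage sketch (illustrative, not from the original source) ----------------
# Assumes pandas and `import knpackage.toolbox as kn`; the toy dataframes are
# hypothetical samples x phenotypes tables that share only sample_2 and sample_3.
import pandas as pd

toy_sxp_1_df = pd.DataFrame({'age': [30, 40, 50]},
                            index=['sample_1', 'sample_2', 'sample_3'])
toy_sxp_2_df = pd.DataFrame({'response': [0, 1, 1]},
                            index=['sample_2', 'sample_3', 'sample_4'])
trimmed_1_df, trimmed_2_df = common_samples_df(toy_sxp_1_df, toy_sxp_2_df)
# both trimmed dataframes are indexed by sample_2 and sample_3 only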
def test_find_common_node_names(self):
    ret = kn.find_common_node_names(self.list_1, self.list_2)
    self.assertEqual(True, set(ret) == set([1]), 'incorrect intersection')
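# --- Fixture sketch (illustrative, not from the original test suite) ----------
# The test above relies on self.list_1 and self.list_2 defined elsewhere in the
# TestCase; a hypothetical setUp consistent with the asserted intersection {1}:
def setUp(self):
    self.list_1 = [1, 2, 3]
    self.list_2 = [1, 4, 5]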