import copy
import itertools
import os
import time

import numpy as np
import pandas as pd
import scipy.cluster.hierarchy
import seaborn as sns
from scipy.cluster.hierarchy import fcluster
from scipy.stats import hypergeom

## NOTE: `preprocess`, `wang`, and `core` (AutoVivification, preprocess) are
## assumed to come from this package's own modules, and
## `deepest_common_ancestor` from goatools, as used elsewhere in the codebase.


def semsim_squaredf(self, ids_ls, go_tree, go_dag):
    ids_ls_copy = copy.deepcopy(ids_ls)
    ids_ls_copy_df = pd.DataFrame(ids_ls_copy)
    ## map alternate GO IDs onto primary IDs in column 0 so Wang SemSim can be calculated
    ids_ls_copy_df = ids_ls_copy_df.replace({0: self.G.alt_ids})
    blank_df = pd.DataFrame(columns=list(ids_ls_copy_df[0]),
                            index=list(ids_ls_copy_df[0]))
    id_names_zip_dict = {
        go_id: preprocess(go_dag[go_id].name)
        for go_id in ids_ls
    }
    ## each unordered pair is scored once; write both cells to keep the matrix symmetric
    for x_run, y_run in itertools.combinations_with_replacement(
            list(ids_ls_copy_df[0]), 2):
        score = wang(go_tree, x_run, y_run)
        blank_df.loc[x_run, y_run] = score
        blank_df.loc[y_run, x_run] = score
    ## restore the original GO IDs, then swap in human-readable term names
    blank_df.columns = ids_ls
    blank_df.index = ids_ls
    blank_df = blank_df.rename(columns=id_names_zip_dict,
                               index=id_names_zip_dict)
    blank_df.columns.name = None
    blank_df.index.name = None
    blank_df = blank_df[blank_df.columns].astype(float)
    return (blank_df, id_names_zip_dict)
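## Hedged usage sketch (illustrative, not part of the original module): assumes
## `fa` is an initialized instance with the GO graph `fa.G` loaded and `go_dag`
## a goatools GODag; the GO IDs below are hypothetical examples.
##
##   ids = ['GO:0006281', 'GO:0006260', 'GO:0006310']
##   sim_df, name_map = fa.semsim_squaredf(ids_ls=ids, go_tree=fa.G, go_dag=go_dag)
##   sim_df.shape                                 # (3, 3), indexed by preprocessed GO names
##   (sim_df.values == sim_df.values.T).all()     # Wang SemSim is symmetric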
def queries_df_to_vecs(self, input_df):
    if isinstance(input_df, list):
        input_df = pd.DataFrame(input_df)
    if input_df.shape[1] != 1:
        raise ValueError(
            'Expected input pd.DataFrame to have a single column, with no header, of natural language queries. '
            'A list of queries can also be passed and will be coerced into a pd.DataFrame.')
    ## Preprocess query strings
    input_df[0] = input_df[0].apply(lambda x: preprocess(str(x)))
    ## Drop duplicate queries (duplicates can appear after preprocessing)
    input_df = input_df.drop_duplicates(keep='first')
    ## Get embedding sentence vectors for queries
    vecs_df = pd.DataFrame(list(input_df[0].apply(self.model.get_sentence_vector)),
                           index=input_df[0])
    ## L2-normalize vectors so dot products equal cosine similarities
    vec_norm = np.sqrt(np.square(np.array(vecs_df)).sum(axis=1))
    queries_vec_normalized = pd.DataFrame(
        np.array(vecs_df) / vec_norm.reshape(vecs_df.shape[0], 1),
        index=vecs_df.index)
    return (queries_vec_normalized)
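## Hedged usage sketch (illustrative, not part of the original module): assumes
## `fa.model` is a loaded fastText model exposing get_sentence_vector().
##
##   qvecs = fa.queries_df_to_vecs(['DNA repair', 'dna repair', 'lipid metabolism'])
##   qvecs.shape[0]                                               # duplicates collapse after preprocess()
##   np.allclose(np.linalg.norm(qvecs.to_numpy(), axis=1), 1.0)   # rows are unit length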
def query_tree_get_mqueries_nns(query_vecs_input,
                                trees_dict,
                                tree_type_list,
                                k_nns=10,
                                **kwargs):
    output_dict = {}
    if not isinstance(query_vecs_input, pd.DataFrame):
        raise ValueError(
            'query_vecs_input needs to be a pandas DataFrame. Please convert it and try again.')
    if not isinstance(trees_dict, dict):
        raise ValueError(
            'trees_dict must be a dictionary with keys corresponding to tree name tags (e.g. "MF_GO")'
            ' and values corresponding to the associated trees (e.g. go_mf_tree).')
    if not all(isinstance(x, list) for x in tree_type_list):
        raise ValueError(
            "\nYou've input the wrong type for tree_type_list.\n"
            "Input should be a list of lists corresponding to the trees you want run.\n"
            "Consider first using the multi-query file reader to build the input for this step.")
    allowed_tree_types = {
        'MF_GO': 'go_mf_tree',
        'BP_GO': 'go_bp_tree',
        'CC_GO': 'go_cc_tree',
        'MF_GO_euc': 'go_mf_tree_euc',
        'BP_GO_euc': 'go_bp_tree_euc',
        'CC_GO_euc': 'go_cc_tree_euc',
        'Disease': 'fname_dis_tree',
        'Pathway': 'pathways_tree',
        'PC': 'pc_tree',
        'CORUM_GO': 'go_corum_tree'
    }
    trees_check = [
        set(x).issubset(allowed_tree_types.keys()) for x in tree_type_list
    ]
    if not all(trees_check):
        lines_idxs_wrong_trees = [
            i for i, ok in enumerate(trees_check) if not ok
        ]
        raise ValueError(
            'tree_type must be a string item in %s\n'
            'Problematic line indexes are: %s' %
            (list(allowed_tree_types.keys()), lines_idxs_wrong_trees))
    if 'go_dag' in kwargs:
        ## build a {preprocessed GO name: GO ID} map to annotate neighbour hits
        go_dag = kwargs['go_dag']
        go_map_full = pd.DataFrame(
            [(preprocess(go_dag[go_id].name), go_id, go_dag[go_id].namespace)
             for go_id in go_dag.keys()],
            columns=['GO', 'GO ID', 'class'])
        go_map_full_dict = dict(zip(go_map_full['GO'], go_map_full['GO ID']))
    ## retrieve the queries list from the index of the input DataFrame
    full_queries = list(query_vecs_input.index)
    for tree_type in allowed_tree_types:
        true_idxs = [tree_type in sublist for sublist in tree_type_list]
        tree_type_queries = [
            item for idx, item in enumerate(full_queries) if true_idxs[idx]
        ]
        ## only run a tree type if every query requested it; otherwise skip it
        if len(tree_type_queries) != len(full_queries):
            continue
        ## k_nns == -1 means "return every leaf in the tree"
        if k_nns == -1:
            k_nn = len(trees_dict[allowed_tree_types[tree_type]].word_series)
        else:
            k_nn = k_nns
        tmp_nn_list = trees_dict[allowed_tree_types[tree_type]].kneighbors(
            X=query_vecs_input.loc[tree_type_queries],
            k=k_nn,
            return_similarity=True)
        for i, key in enumerate(tree_type_queries):
            final_df = pd.DataFrame(
                list(zip(tmp_nn_list[0][i], tmp_nn_list[2][i], tmp_nn_list[3][i])),
                columns=['NNs_natlang', 'NNs_distance', 'NNs_simil'])
            if 'go_dag' in kwargs:
                final_df['GO ID'] = final_df['NNs_natlang'].map(go_map_full_dict)
            if key in output_dict:
                output_dict[key][tree_type] = final_df
            else:
                output_dict[key] = {tree_type: final_df}
    return (output_dict)
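## Hedged usage sketch (illustrative, not part of the original module):
## `trees_dict` is assumed to map tag -> fitted tree object exposing
## kneighbors(X, k, return_similarity) and a `word_series` attribute, matching
## what this function calls; `qvecs` comes from queries_df_to_vecs().
##
##   tree_tags = [['BP_GO', 'MF_GO'], ['BP_GO', 'MF_GO']]   # one tag list per query
##   nns = query_tree_get_mqueries_nns(qvecs, trees_dict, tree_tags,
##                                     k_nns=10, go_dag=go_dag)
##   nns['dna repair']['BP_GO'].columns   # NNs_natlang, NNs_distance, NNs_simil, GO ID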
def functional_enrichment(
        predterms_dict,  # e.g. mf_dict_predterms
        kdtree_dict,     # e.g. consol_dict_paralogs
        go_tree,         # e.g. go_dag
        map_go_dict,     # e.g. go_mf_map; {GO_name: GO_ID}
        iloc_cut_dict={
            'BP_GO': 11044,
            'MF_GO': 5213,
            'CC_GO': 1896
        },  # values obtained from the average number of NNs needed to recover 100% of ground truth from the KDTree search
        alpha_val=0.05):
    res_dict = core.AutoVivification()
    for go_class in ['BP_GO', 'MF_GO', 'CC_GO']:
        for n, pc in enumerate(list(kdtree_dict.keys())):
            print(n, ": " + pc)
            print(go_class)
            kdtree_rez = kdtree_dict[pc][go_class]
            # kdtree_rez['GO ID'] = kdtree_rez['NNs_natlang'].map(map_go_dict)  ## map_go_dict --> {GO_name: GO_ID}
            assoc = {}
            hash_gos = {
                i: go
                for i, go in enumerate(
                    predterms_dict[pc][go_class]['combined']['GO ID'])
            }
            hash_gos_pos = {
                go: pos
                for go, pos in zip(
                    predterms_dict[pc][go_class]['combined']['GO ID'],
                    predterms_dict[pc][go_class]['combined']['pos'])
            }
            ## each association set holds a predicted GO term plus all of its children
            pool = set()
            for i, go_id in hash_gos.items():
                assoc[i] = set(go_tree[go_id].get_all_children())
                assoc[i].add(go_id)
                pool.update(assoc[i])
            ## 'dummy' collects KDTree GO IDs not covered by any predicted term's set
            assoc['dummy'] = set(kdtree_rez['GO ID']) - pool
            ## cut KDTree results at iloc_cut to define the 'sample' population
            pop_names = kdtree_rez.iloc[0:iloc_cut_dict[go_class]]
            pop_new = pop_names['GO ID']
            for ii in assoc:
                isSignif = False
                ## M: total number of GO terms in the MF, BP, or CC set
                M = len(set(kdtree_rez['NNs_natlang']))
                ## n: overlap between the children of the predicted GO term of
                ## interest and the full set of GO terms
                n_hyper = len(
                    set.intersection(assoc[ii], set(kdtree_rez['GO ID'])))
                ## N: sample size (must equal iloc_cut_dict[go_class])
                N = len(set(pop_new))
                if not N == iloc_cut_dict[go_class]:
                    raise ValueError(
                        'N should be equal to iloc_cut. Currently N={} and iloc_cut_dict[{}]={}.'
                        '\nPlease check that you used the correct map_go_df and that drop_duplicates has not removed rows.'
                        .format(N, go_class, iloc_cut_dict[go_class]))
                ## x: number of successes (sampled terms falling in the association set)
                successes = set.intersection(assoc[ii], set(pop_new))
                x = len(successes)
                pval = hypergeom.sf(x - 1, M, n_hyper, N)
                ## multiple-testing correction across the len(assoc) tests
                alpha_alt = pval / len(assoc.keys())
                alpha_crit = 1 - (1 - alpha_alt)**(len(assoc.keys()))
                if alpha_crit < alpha_val:
                    alpha_crit_str = str(alpha_crit) + '******'
                    isSignif = True
                else:
                    alpha_crit_str = str(alpha_crit)
                if not ii == 'dummy':
                    if isSignif:
                        print('\t{}: {} {} alpha_crit = {}'.format(
                            ii, hash_gos[ii], go_tree[hash_gos[ii]].name,
                            alpha_crit_str))
                        print('\t\tM = {}; N = {}; n = {}; x = {}'.format(
                            M, N, n_hyper, x))
                    res_dict[pc][go_class][hash_gos[ii]] = {
                        'go_name': core.preprocess(go_tree[hash_gos[ii]].name),
                        'M': M,
                        'N': N,
                        'n_hyper': n_hyper,
                        'x': x,
                        'pval': pval,
                        'alpha_alt': alpha_alt,
                        'alpha_crit': alpha_crit,
                        'isSignif': isSignif,
                        'successes': successes,
                        'mapped': pop_names,
                        'pos': hash_gos_pos[hash_gos[ii]]
                    }
                else:
                    if isSignif:
                        print('\t{} alpha_crit = {}'.format(ii, alpha_crit_str))
                        print('\t\tM = {}; N = {}; n = {}; x = {}'.format(
                            M, N, n_hyper, x))
                    res_dict[pc][go_class][ii] = {
                        'go_name': ii,
                        'M': M,
                        'N': N,
                        'n_hyper': n_hyper,
                        'x': x,
                        'pval': pval,
                        'alpha_alt': alpha_alt,
                        'alpha_crit': alpha_crit,
                        'isSignif': isSignif,
                        'successes': successes,
                        'mapped': pop_names,
                        'pos': np.nan
                    }
    return (res_dict)
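## Hedged sketch (illustrative, not from the original source) of the
## hypergeometric test applied above: scipy.stats.hypergeom parameterizes the
## population size M, the n marked items in it, and a sample of size N, and
## sf(x - 1) gives P(X >= x). The counts below are hypothetical.
##
##   from scipy.stats import hypergeom
##   M, n_hyper, N, x = 11044, 250, 1000, 40
##   pval = hypergeom.sf(x - 1, M, n_hyper, N)   # over-representation p-value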
def runner(self):
    print('start')
    n_clust = self.nclusts
    for query in list(self.queries_vecs.index):
        print(1, time.time())
        print(f'Functional Annotation Clustering for: {query}')
        ## pairwise Wang SemSim matrix over the ML-predicted BP GO terms
        k, id_names_zip_dict = self.semsim_squaredf(
            ids_ls=list(self.queries_rez[query]['BP_GO']['combined']['GO ID']),
            go_tree=self.G,
            go_dag=self.go_dag)
        print(2, time.time())
        ## calculate linkages for clusters
        Z = scipy.cluster.hierarchy.linkage(k, method='weighted',
                                            metric='euclidean')
        print(3, time.time())
        clusters = fcluster(Z, n_clust, criterion='maxclust')
        print(4, time.time())
        clust_lists = []
        dcas = {}
        dcas_goids = {}
        for clust_id in set(clusters):
            names = list(k.index[clusters == clust_id])
            clust_lists.append(names)
            go_ids = []
            for name in names:
                go_ids.append(self.go_map['GO ID'][list(
                    self.go_map['GO'] == name).index(True)])
            ## collapse each cluster to its deepest common ancestor (DCA)
            dca = deepest_common_ancestor(go_ids, self.go_dag)
            dcas[str(clust_id)] = preprocess(self.go_dag[dca].name)
            dcas_goids[preprocess(self.go_dag[dca].name)] = dca
        print(5, time.time())
        ## relabel each GO term's cluster with its DCA name
        go_term_clust_map = dict(zip(k.index, list(map(str, clusters))))
        clusters = pd.DataFrame(
            list((term, dcas.get(v, v))
                 for (term, v) in go_term_clust_map.items())).set_index([0])[1]
        print(6, time.time())
        row_colors = sns.color_palette("cubehelix", len(set(clusters)))
        lut = dict(zip(clusters.unique(), row_colors))
        clusters.name = None
        dca_clustermap = sns.clustermap(
            k,
            cmap='Blues',
            row_colors=clusters.map(lut),
            col_cluster=True,
            linewidths=0,
            xticklabels=False)
        ## zero-size bars exist only to populate the legend with cluster colors
        for label in clusters.unique():
            dca_clustermap.ax_col_dendrogram.bar(0, 0,
                                                 color=lut[label],
                                                 label=label,
                                                 linewidth=0)
        dca_clustermap.fig.suptitle(query, ha='left', va='center').set_size(16)
        dca_clustermap.ax_col_dendrogram.legend(
            loc="lower left",
            ncol=3).set_title('deepest common ancestor clusters', 'large')
        dca_clustermap.ax_col_dendrogram.set_xlim([0, 0])
        os.makedirs(os.path.join(self.out_rez_path, query), exist_ok=True)
        dca_clustermap.savefig(
            os.path.join(self.out_rez_path, query,
                         'FuncAnnotClust_DCA_plot.png'))
        print(7, time.time())
        ## Write out the original ML-predicted terms for each PC
        df_out = self.queries_rez_orig[query]['BP_GO']['combined']
        if not df_out.shape[0] == 0:
            df_out.to_csv(os.path.join(
                self.out_rez_path, query,
                'ML_pred_results_before_DCA_func_clustering.tsv'),
                          sep='\t')
        else:
            print(
                query,
                'has no ML-predicted terms for BP_GO, so there are no original'
                ' ML-predicted terms to write out before functional annotation'
                ' clustering on pairwise semantic similarity.')
        print(8, time.time())
        ## Overwrite the DCA GO terms into self.queries_rez
        self.queries_rez[query]['BP_GO']['combined'] = pd.DataFrame(
            dcas_goids, index=['GO ID']).T.drop_duplicates()
        print(9, time.time())
    return (self.queries_rez, self.queries_rez_orig)
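## Hedged end-to-end sketch (illustrative, not from the original source):
## runner() assumes the instance already holds queries_vecs, queries_rez,
## queries_rez_orig, G, go_dag, go_map, nclusts, and out_rez_path. Per query it
## builds the Wang SemSim matrix, cuts the 'weighted' linkage into nclusts
## clusters, collapses each cluster to its deepest common ancestor, saves a
## clustermap PNG and a TSV of the pre-clustering ML terms, then overwrites
## queries_rez with the DCA terms.
##
##   fa.nclusts = 5                                     # hypothetical setting
##   queries_rez_dca, queries_rez_orig = fa.runner()
##   queries_rez_dca['my query']['BP_GO']['combined']   # one row per unique DCA GO ID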