import copy
import itertools
import os
import time

import numpy as np
import pandas as pd
import scipy.cluster.hierarchy
import seaborn as sns
from scipy.cluster.hierarchy import fcluster
from scipy.stats import hypergeom

## NOTE: `preprocess`, `wang`, and `core` (AutoVivification, preprocess) are
## assumed to come from this package's own modules, and
## `deepest_common_ancestor` from goatools, as used elsewhere in the codebase.


def semsim_squaredf(self, ids_ls, go_tree, go_dag):
    ids_ls_copy = copy.deepcopy(ids_ls)
    ids_ls_copy_df = pd.DataFrame(ids_ls_copy)
    ## map alternate GO IDs onto primary IDs in column 0 so Wang SemSim can be calculated
    ids_ls_copy_df = ids_ls_copy_df.replace({0: self.G.alt_ids})
    blank_df = pd.DataFrame(columns=list(ids_ls_copy_df[0]),
                            index=list(ids_ls_copy_df[0]))
    id_names_zip_dict = {
        go_id: preprocess(go_dag[go_id].name)
        for go_id in ids_ls
    }
    ## each unordered pair is scored once; write both cells to keep the matrix symmetric
    for x_run, y_run in itertools.combinations_with_replacement(
            list(ids_ls_copy_df[0]), 2):
        score = wang(go_tree, x_run, y_run)
        blank_df.loc[x_run, y_run] = score
        blank_df.loc[y_run, x_run] = score
    ## restore the original GO IDs, then swap in human-readable term names
    blank_df.columns = ids_ls
    blank_df.index = ids_ls
    blank_df = blank_df.rename(columns=id_names_zip_dict,
                               index=id_names_zip_dict)
    blank_df.columns.name = None
    blank_df.index.name = None
    blank_df = blank_df[blank_df.columns].astype(float)
    return (blank_df, id_names_zip_dict)
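## Hedged usage sketch (illustrative, not part of the original module): assumes
## `fa` is an initialized instance with the GO graph `fa.G` loaded and `go_dag`
## a goatools GODag; the GO IDs below are hypothetical examples.
##
##   ids = ['GO:0006281', 'GO:0006260', 'GO:0006310']
##   sim_df, name_map = fa.semsim_squaredf(ids_ls=ids, go_tree=fa.G, go_dag=go_dag)
##   sim_df.shape                                 # (3, 3), indexed by preprocessed GO names
##   (sim_df.values == sim_df.values.T).all()     # Wang SemSim is symmetric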
def queries_df_to_vecs(self, input_df):
    if isinstance(input_df, list):
        input_df = pd.DataFrame(input_df)
    if input_df.shape[1] != 1:
        raise ValueError(
            'Expected input pd.DataFrame to have a single column, with no header, of natural language queries. '
            'A list of queries can also be passed and will be coerced into a pd.DataFrame.')
    ## Preprocess query strings
    input_df[0] = input_df[0].apply(lambda x: preprocess(str(x)))
    ## Drop duplicate queries (duplicates can appear after preprocessing)
    input_df = input_df.drop_duplicates(keep='first')
    ## Get embedding sentence vectors for queries
    vecs_df = pd.DataFrame(list(input_df[0].apply(self.model.get_sentence_vector)),
                           index=input_df[0])
    ## L2-normalize vectors so dot products equal cosine similarities
    vec_norm = np.sqrt(np.square(np.array(vecs_df)).sum(axis=1))
    queries_vec_normalized = pd.DataFrame(
        np.array(vecs_df) / vec_norm.reshape(vecs_df.shape[0], 1),
        index=vecs_df.index)
    return (queries_vec_normalized)
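## Hedged usage sketch (illustrative, not part of the original module): assumes
## `fa.model` is a loaded fastText model exposing get_sentence_vector().
##
##   qvecs = fa.queries_df_to_vecs(['DNA repair', 'dna repair', 'lipid metabolism'])
##   qvecs.shape[0]                                               # duplicates collapse after preprocess()
##   np.allclose(np.linalg.norm(qvecs.to_numpy(), axis=1), 1.0)   # rows are unit length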
def query_tree_get_mqueries_nns(query_vecs_input,
                                trees_dict,
                                tree_type_list,
                                k_nns=10,
                                **kwargs):
    output_dict = {}
    if not isinstance(query_vecs_input, pd.DataFrame):
        raise ValueError(
            'query_vecs_input needs to be a pandas DataFrame. Please convert it and try again.')
    if not isinstance(trees_dict, dict):
        raise ValueError(
            'trees_dict must be a dictionary with keys corresponding to tree name tags (e.g. "MF_GO")'
            ' and values corresponding to the associated trees (e.g. go_mf_tree).')
    if not all(isinstance(x, list) for x in tree_type_list):
        raise ValueError(
            "\nYou've input the wrong type for tree_type_list.\n"
            "Input should be a list of lists corresponding to the trees you want run.\n"
            "Consider first using the multi-query file reader to build the input for this step.")
    allowed_tree_types = {
        'MF_GO': 'go_mf_tree',
        'BP_GO': 'go_bp_tree',
        'CC_GO': 'go_cc_tree',
        'MF_GO_euc': 'go_mf_tree_euc',
        'BP_GO_euc': 'go_bp_tree_euc',
        'CC_GO_euc': 'go_cc_tree_euc',
        'Disease': 'fname_dis_tree',
        'Pathway': 'pathways_tree',
        'PC': 'pc_tree',
        'CORUM_GO': 'go_corum_tree'
    }
    trees_check = [
        set(x).issubset(allowed_tree_types.keys()) for x in tree_type_list
    ]
    if not all(trees_check):
        lines_idxs_wrong_trees = [
            i for i, ok in enumerate(trees_check) if not ok
        ]
        raise ValueError(
            'tree_type must be a string item in %s\n'
            'Problematic line indexes are: %s' %
            (list(allowed_tree_types.keys()), lines_idxs_wrong_trees))
    if 'go_dag' in kwargs:
        ## build a {preprocessed GO name: GO ID} map to annotate neighbour hits
        go_dag = kwargs['go_dag']
        go_map_full = pd.DataFrame(
            [(preprocess(go_dag[go_id].name), go_id, go_dag[go_id].namespace)
             for go_id in go_dag.keys()],
            columns=['GO', 'GO ID', 'class'])
        go_map_full_dict = dict(zip(go_map_full['GO'], go_map_full['GO ID']))
    ## retrieve the queries list from the index of the input DataFrame
    full_queries = list(query_vecs_input.index)
    for tree_type in allowed_tree_types:
        true_idxs = [tree_type in sublist for sublist in tree_type_list]
        tree_type_queries = [
            item for idx, item in enumerate(full_queries) if true_idxs[idx]
        ]
        ## only run a tree type if every query requested it; otherwise skip it
        if len(tree_type_queries) != len(full_queries):
            continue
        ## k_nns == -1 means "return every leaf in the tree"
        if k_nns == -1:
            k_nn = len(trees_dict[allowed_tree_types[tree_type]].word_series)
        else:
            k_nn = k_nns
        tmp_nn_list = trees_dict[allowed_tree_types[tree_type]].kneighbors(
            X=query_vecs_input.loc[tree_type_queries],
            k=k_nn,
            return_similarity=True)
        for i, key in enumerate(tree_type_queries):
            final_df = pd.DataFrame(
                list(zip(tmp_nn_list[0][i], tmp_nn_list[2][i], tmp_nn_list[3][i])),
                columns=['NNs_natlang', 'NNs_distance', 'NNs_simil'])
            if 'go_dag' in kwargs:
                final_df['GO ID'] = final_df['NNs_natlang'].map(go_map_full_dict)
            if key in output_dict:
                output_dict[key][tree_type] = final_df
            else:
                output_dict[key] = {tree_type: final_df}
    return (output_dict)
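## Hedged usage sketch (illustrative, not part of the original module):
## `trees_dict` is assumed to map tag -> fitted tree object exposing
## kneighbors(X, k, return_similarity) and a `word_series` attribute, matching
## what this function calls; `qvecs` comes from queries_df_to_vecs().
##
##   tree_tags = [['BP_GO', 'MF_GO'], ['BP_GO', 'MF_GO']]   # one tag list per query
##   nns = query_tree_get_mqueries_nns(qvecs, trees_dict, tree_tags,
##                                     k_nns=10, go_dag=go_dag)
##   nns['dna repair']['BP_GO'].columns   # NNs_natlang, NNs_distance, NNs_simil, GO ID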
def functional_enrichment(
        predterms_dict,  # e.g. mf_dict_predterms
        kdtree_dict,     # e.g. consol_dict_paralogs
        go_tree,         # e.g. go_dag
        map_go_dict,     # e.g. go_mf_map; {GO_name: GO_ID}
        iloc_cut_dict={
            'BP_GO': 11044,
            'MF_GO': 5213,
            'CC_GO': 1896
        },  # values obtained from the average number of NNs needed to recover 100% of ground truth from the KDTree search
        alpha_val=0.05):
    res_dict = core.AutoVivification()
    for go_class in ['BP_GO', 'MF_GO', 'CC_GO']:
        for n, pc in enumerate(list(kdtree_dict.keys())):
            print(n, ": " + pc)
            print(go_class)
            kdtree_rez = kdtree_dict[pc][go_class]
            # kdtree_rez['GO ID'] = kdtree_rez['NNs_natlang'].map(map_go_dict)  ## map_go_dict --> {GO_name: GO_ID}
            assoc = {}
            hash_gos = {
                i: go
                for i, go in enumerate(
                    predterms_dict[pc][go_class]['combined']['GO ID'])
            }
            hash_gos_pos = {
                go: pos
                for go, pos in zip(
                    predterms_dict[pc][go_class]['combined']['GO ID'],
                    predterms_dict[pc][go_class]['combined']['pos'])
            }
            ## each association set holds a predicted GO term plus all of its children
            pool = set()
            for i, go_id in hash_gos.items():
                assoc[i] = set(go_tree[go_id].get_all_children())
                assoc[i].add(go_id)
                pool.update(assoc[i])
            ## 'dummy' collects KDTree GO IDs not covered by any predicted term's set
            assoc['dummy'] = set(kdtree_rez['GO ID']) - pool
            ## cut KDTree results at iloc_cut to define the 'sample' population
            pop_names = kdtree_rez.iloc[0:iloc_cut_dict[go_class]]
            pop_new = pop_names['GO ID']
            for ii in assoc:
                isSignif = False
                ## M: total number of GO terms in the MF, BP, or CC set
                M = len(set(kdtree_rez['NNs_natlang']))
                ## n: overlap between the children of the predicted GO term of
                ## interest and the full set of GO terms
                n_hyper = len(
                    set.intersection(assoc[ii], set(kdtree_rez['GO ID'])))
                ## N: sample size (must equal iloc_cut_dict[go_class])
                N = len(set(pop_new))
                if not N == iloc_cut_dict[go_class]:
                    raise ValueError(
                        'N should be equal to iloc_cut. Currently N={} and iloc_cut_dict[{}]={}.'
                        '\nPlease check that you used the correct map_go_df and that drop_duplicates has not removed rows.'
                        .format(N, go_class, iloc_cut_dict[go_class]))
                ## x: number of successes (sampled terms falling in the association set)
                successes = set.intersection(assoc[ii], set(pop_new))
                x = len(successes)
                pval = hypergeom.sf(x - 1, M, n_hyper, N)
                ## multiple-testing correction across the len(assoc) tests
                alpha_alt = pval / len(assoc.keys())
                alpha_crit = 1 - (1 - alpha_alt)**(len(assoc.keys()))
                if alpha_crit < alpha_val:
                    alpha_crit_str = str(alpha_crit) + '******'
                    isSignif = True
                else:
                    alpha_crit_str = str(alpha_crit)
                if not ii == 'dummy':
                    if isSignif:
                        print('\t{}: {} {} alpha_crit = {}'.format(
                            ii, hash_gos[ii], go_tree[hash_gos[ii]].name,
                            alpha_crit_str))
                        print('\t\tM = {}; N = {}; n = {}; x = {}'.format(
                            M, N, n_hyper, x))
                    res_dict[pc][go_class][hash_gos[ii]] = {
                        'go_name': core.preprocess(go_tree[hash_gos[ii]].name),
                        'M': M,
                        'N': N,
                        'n_hyper': n_hyper,
                        'x': x,
                        'pval': pval,
                        'alpha_alt': alpha_alt,
                        'alpha_crit': alpha_crit,
                        'isSignif': isSignif,
                        'successes': successes,
                        'mapped': pop_names,
                        'pos': hash_gos_pos[hash_gos[ii]]
                    }
                else:
                    if isSignif:
                        print('\t{} alpha_crit = {}'.format(ii, alpha_crit_str))
                        print('\t\tM = {}; N = {}; n = {}; x = {}'.format(
                            M, N, n_hyper, x))
                    res_dict[pc][go_class][ii] = {
                        'go_name': ii,
                        'M': M,
                        'N': N,
                        'n_hyper': n_hyper,
                        'x': x,
                        'pval': pval,
                        'alpha_alt': alpha_alt,
                        'alpha_crit': alpha_crit,
                        'isSignif': isSignif,
                        'successes': successes,
                        'mapped': pop_names,
                        'pos': np.nan
                    }
    return (res_dict)
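## Hedged sketch (illustrative, not from the original source) of the
## hypergeometric test applied above: scipy.stats.hypergeom parameterizes the
## population size M, the n marked items in it, and a sample of size N, and
## sf(x - 1) gives P(X >= x). The counts below are hypothetical.
##
##   from scipy.stats import hypergeom
##   M, n_hyper, N, x = 11044, 250, 1000, 40
##   pval = hypergeom.sf(x - 1, M, n_hyper, N)   # over-representation p-value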
def runner(self):
    print('start')
    n_clust = self.nclusts
    for query in list(self.queries_vecs.index):
        print(1, time.time())
        print(f'Functional Annotation Clustering for: {query}')
        ## pairwise Wang SemSim matrix over the ML-predicted BP GO terms
        k, id_names_zip_dict = self.semsim_squaredf(
            ids_ls=list(self.queries_rez[query]['BP_GO']['combined']['GO ID']),
            go_tree=self.G,
            go_dag=self.go_dag)
        print(2, time.time())
        ## calculate linkages for clusters
        Z = scipy.cluster.hierarchy.linkage(k, method='weighted',
                                            metric='euclidean')
        print(3, time.time())
        clusters = fcluster(Z, n_clust, criterion='maxclust')
        print(4, time.time())
        clust_lists = []
        dcas = {}
        dcas_goids = {}
        for clust_id in set(clusters):
            names = list(k.index[clusters == clust_id])
            clust_lists.append(names)
            go_ids = []
            for name in names:
                go_ids.append(self.go_map['GO ID'][list(
                    self.go_map['GO'] == name).index(True)])
            ## collapse each cluster to its deepest common ancestor (DCA)
            dca = deepest_common_ancestor(go_ids, self.go_dag)
            dcas[str(clust_id)] = preprocess(self.go_dag[dca].name)
            dcas_goids[preprocess(self.go_dag[dca].name)] = dca
        print(5, time.time())
        ## relabel each GO term's cluster with its DCA name
        go_term_clust_map = dict(zip(k.index, list(map(str, clusters))))
        clusters = pd.DataFrame(
            list((term, dcas.get(v, v))
                 for (term, v) in go_term_clust_map.items())).set_index([0])[1]
        print(6, time.time())
        row_colors = sns.color_palette("cubehelix", len(set(clusters)))
        lut = dict(zip(clusters.unique(), row_colors))
        clusters.name = None
        dca_clustermap = sns.clustermap(
            k,
            cmap='Blues',
            row_colors=clusters.map(lut),
            col_cluster=True,
            linewidths=0,
            xticklabels=False)
        ## zero-size bars exist only to populate the legend with cluster colors
        for label in clusters.unique():
            dca_clustermap.ax_col_dendrogram.bar(0, 0,
                                                 color=lut[label],
                                                 label=label,
                                                 linewidth=0)
        dca_clustermap.fig.suptitle(query, ha='left', va='center').set_size(16)
        dca_clustermap.ax_col_dendrogram.legend(
            loc="lower left",
            ncol=3).set_title('deepest common ancestor clusters', 'large')
        dca_clustermap.ax_col_dendrogram.set_xlim([0, 0])
        os.makedirs(os.path.join(self.out_rez_path, query), exist_ok=True)
        dca_clustermap.savefig(
            os.path.join(self.out_rez_path, query,
                         'FuncAnnotClust_DCA_plot.png'))
        print(7, time.time())
        ## Write out the original ML-predicted terms for each PC
        df_out = self.queries_rez_orig[query]['BP_GO']['combined']
        if not df_out.shape[0] == 0:
            df_out.to_csv(os.path.join(
                self.out_rez_path, query,
                'ML_pred_results_before_DCA_func_clustering.tsv'),
                          sep='\t')
        else:
            print(
                query,
                'has no ML-predicted terms for BP_GO, so there are no original'
                ' ML-predicted terms to write out before functional annotation'
                ' clustering on pairwise semantic similarity.')
        print(8, time.time())
        ## Overwrite the DCA GO terms into self.queries_rez
        self.queries_rez[query]['BP_GO']['combined'] = pd.DataFrame(
            dcas_goids, index=['GO ID']).T.drop_duplicates()
        print(9, time.time())
    return (self.queries_rez, self.queries_rez_orig)
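## Hedged end-to-end sketch (illustrative, not from the original source):
## runner() assumes the instance already holds queries_vecs, queries_rez,
## queries_rez_orig, G, go_dag, go_map, nclusts, and out_rez_path. Per query it
## builds the Wang SemSim matrix, cuts the 'weighted' linkage into nclusts
## clusters, collapses each cluster to its deepest common ancestor, saves a
## clustermap PNG and a TSV of the pre-clustering ML terms, then overwrites
## queries_rez with the DCA terms.
##
##   fa.nclusts = 5                                     # hypothetical setting
##   queries_rez_dca, queries_rez_orig = fa.runner()
##   queries_rez_dca['my query']['BP_GO']['combined']   # one row per unique DCA GO ID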