Example #1
    def __GO_enrich__(self):
        go_file = "go-basic.obo"
        if not os.path.exists(go_file):
            download_go_basic_obo()

        # Load gene ontologies
        obodag = GODag("go-basic.obo")

        # Read NCBI's gene2go. Store annotations in a list of namedtuples
        fin_gene2go = download_ncbi_associations()
        objanno = Gene2GoReader(fin_gene2go, taxids=[9606])
        # Get namespace2association where:
        #    namespace is:
        #        BP: biological_process
        #        MF: molecular_function
        #        CC: cellular_component
        #    association is a dict:
        #        key: NCBI GeneID
        #        value: A set of GO IDs associated with that gene
        ns2assoc = objanno.get_ns2assc()

        self.goeaobj = GOEnrichmentStudyNS(
            GeneID2nt_hum.keys(),  # List of human protein-coding genes
            ns2assoc,  # geneID/GO associations
            obodag,  # Ontologies
            propagate_counts=False,
            alpha=0.05,  # default significance cut-off
            methods=['fdr_bh'])  # default multipletest correction method
Example #2
def run_go_enrichment(strain,
                      genes_of_interest,
                      significant=True,
                      cutoff=0.05,
                      use_parent_terms=True):
    # Load GO term association dictionary
    with open(os.path.join('data', 'go_association.pickle'), 'rb') as handle:
        go_association = pickle.load(handle)

    background_genes = get_genes(
        os.path.join('data', strain + '_all_genes.csv'))
    obo_go_fname = download_go_basic_obo()
    obo_dag = GODag('go-basic.obo')

    if strain == 'PA14':
        genes_of_interest = map_pa14_genes(genes_of_interest)
        background_genes = map_pa14_genes(background_genes)

    goea_obj = GOEnrichmentStudyNS(background_genes,
                                   go_association,
                                   obo_dag,
                                   propagate_counts=use_parent_terms,
                                   alpha=cutoff,
                                   methods=['fdr_bh'])
    goea_results = goea_obj.run_study(genes_of_interest)

    if significant is True:
        goea_results = [
            result for result in goea_results if result.p_fdr_bh < cutoff
        ]

    enrichment_results = get_enrichment_results(goea_results)
    return [enrichment_results, goea_results]
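A minimal usage sketch for run_go_enrichment above, assuming the data/go_association.pickle and <strain>_all_genes.csv files it reads are in place; the strain name and locus tags are placeholders, not part of the original snippet.

# Hypothetical call; 'PA14' and the locus tags below are placeholders.
enrichment_table, raw_results = run_go_enrichment(
    strain='PA14',
    genes_of_interest=['PA14_00010', 'PA14_00020'],
    significant=True,
    cutoff=0.05)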
Example #3
    def __init__(
        self,
        work_dir: str = '.',
        clean_work_dir: bool = False,
        organism: str = 'human',
        study_parameters: Dict[str, Union[int, float, str, List, Dict]] = {
            'propagate_counts': False,
            'alpha': 0.05,
            'methods': ['fdr_bh']
        }
    ) -> GOEngine:
        """A GOEngine that can be used for performing analysis using GOATOOLS

        Args:
            work_dir (str, optional): The path to a temp directory where intermediate results and raw data will be downloaded/written. Defaults to the current working directory.
            clean_work_dir (bool, optional): Whether or not to remove data written to the work directory at class termination. Defaults to False.
            organism (str, optional): The organism. Defaults to 'human'.
            study_parameters (Dict[str, Union[int, float, str, List, Dict]], optional): A dict of parameters passed to the underlying GOEnrichmentStudyNS. Defaults to {'propagate_counts': False, 'alpha': 0.05, 'methods': ['fdr_bh']}.
        Returns:
            GOEngine: a GO engine wrapping a GOEnrichmentStudyNS object for GO enrichment analysis.
        """
        print("Creating a GO Engine ...")
        if not os.path.exists(work_dir):
            raise ValueError(
                f"The provided work path: {work_dir} does not exist!!!")
        self.work_dir = work_dir
        if organism != 'human' and organism != 'mouse':
            raise ValueError(
                f"The provided organism: {organism} is not support, current engine mainly work with human and moues only"
            )
        print(f"\t --> Downloading data ...")
        obo_fname = download_go_basic_obo(
            os.path.join(work_dir, 'go-basic.obo'))
        gene2go_fname = download_ncbi_associations(
            os.path.join(work_dir, 'gene2go'))
        ## parse the GO terms
        print(
            f"\t --> parsing the data and initializing the base GOEA object...")
        obo_dag = GODag(obo_fname)
        if organism == 'human':
            self._goea_obj = GOEnrichmentStudyNS(
                gene2iden_human.keys(),
                Gene2GoReader(gene2go_fname, taxids=[9606]).get_ns2assc(),
                obo_dag, **study_parameters)
        else:
            self._goea_obj = GOEnrichmentStudyNS(
                gene2iden_human.keys(),
                Gene2GoReader(gene2go_fname, taxids=[10090]).get_ns2assc(),
                obo_dag, **study_parameters)
        self._clean_work_dir = clean_work_dir
        self._gene_ids = None
        return
Example #4
def _get_objgoeans(pop, ns2assoc, godag):
    """Run gene ontology enrichment analysis (GOEA)."""
    return GOEnrichmentStudyNS(pop,
                               ns2assoc,
                               godag,
                               propagate_counts=True,
                               relationships=False,
                               alpha=0.05,
                               methods={'fdr_bh'})
Example #5
 def _init_objgoeans(self, pop, ns2assoc):
     """Run gene ontology enrichment analysis (GOEA)."""
     propagate_counts = not self.args.no_propagate_counts
     return GOEnrichmentStudyNS(pop,
                                ns2assoc,
                                self.godag,
                                propagate_counts=propagate_counts,
                                relationships=False,
                                alpha=self.args.alpha,
                                pvalcalc=self.args.pvalcalc,
                                methods=self.methods)
Example #6
def _get_goeaobj(methods=None):
    """Test GOEA with method, fdr."""
    # Read GODag
    obo_fin = os.path.join(REPO, "go-basic.obo")
    obo_dag = get_godag(obo_fin, loading_bar=None)
    # Read association
    fin_assc = "{REPO}/tests/data/small_association".format(REPO=REPO)
    objanno = IdToGosReader(fin_assc, godag=obo_dag)
    ns2assc = objanno.get_ns2assc()
    popul_fin = "{REPO}/tests/data/small_population".format(REPO=REPO)
    popul_ids = [line.rstrip() for line in open(popul_fin)]
    goeaobj = GOEnrichmentStudyNS(popul_ids, ns2assc, obo_dag, methods=methods)
    return goeaobj
Example #7
def gene_set_query(genes,
                   fdr_threshold=0.10,
                   return_header=False,
                   species='mouse'):
    """
    Runs a GO enrichment analysis query using goatools.
    The GO dataset here is for mouse, but it might apply to human as well.
    """
    ns2assoc, ids_to_symbols, symbols_to_ids, genes_list = get_species_genes(
        species)
    goeaobj = GOEnrichmentStudyNS(
        genes_list,  # List of mouse protein-coding genes
        ns2assoc,  # geneid/GO associations
        obodag,  # Ontologies
        propagate_counts=False,
        alpha=fdr_threshold,  # default significance cut-off
        methods=['fdr_bh'])  # default multipletest correction method
    if species == 'mouse' or species == 'mus_musculus':
        genes = [x.capitalize() for x in genes]
    else:
        genes = [x.upper() for x in genes]
    gene_ids = [symbols_to_ids[x] for x in genes if x in symbols_to_ids]
    print('gene_ids:', gene_ids)

    results = goeaobj.run_study(gene_ids)
    results_sig = [r for r in results if r.p_fdr_bh < fdr_threshold]
    results_table = []
    for r in results_sig:
        results_table.append([
            r.goterm.id, r.goterm.name, r.p_fdr_bh,
            [ids_to_symbols[gene_id] for gene_id in r.study_items]
        ])
    print(results_table)
    results_table.sort(key=lambda x: x[2])
    if return_header:
        results_table = [['GO ID', 'Name', 'FDR', 'Overlapping Genes']
                         ] + results_table
    print('GO results_table:', results_table)
    return results_table
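A hedged usage sketch for gene_set_query; it assumes the module-level obodag and the get_species_genes helper used above are available, and the gene symbols below are placeholders.

# Hypothetical call with placeholder mouse gene symbols.
go_table = gene_set_query(['Trp53', 'Cdkn1a', 'Mdm2'],
                          fdr_threshold=0.10,
                          return_header=True,
                          species='mouse')
for row in go_table:
    print(row)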
Example #8
def GOEA(genes, objanno):
    """ returns go term enrichment

    Keyword arguments:
    genes -- list of genes
    objanno -- background dict
    performs GO term enrichment
    """
    goeaobj = GOEnrichmentStudyNS(
        objanno.get_id2gos().keys(),  # List of mouse protein-coding genes
        objanno.get_ns2assc(),  # geneid/GO associations
        godag,  # Ontologies
        propagate_counts=True,
        alpha=0.05,  # default significance cut-off
        methods=['fdr_bh'])  # default multipletest correction method
    goea_quiet_all = goeaobj.run_study(genes, prt=None)
    goea_results = dict((el, []) for el in ontologies)
    for r in goea_quiet_all:
        goea_results[r.NS].append([r.GO, r.p_fdr_bh])
    for ont in goea_results:
        goea_results[ont] = np.array(goea_results[ont])
        goea_results[ont] = goea_results[ont][goea_results[ont][:, 0].argsort()]
    return goea_results
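A sketch of how GOEA might be driven, assuming the module-level godag and ontologies globals the function relies on are already defined; the gene2go path and NCBI GeneIDs are placeholders.

# Hypothetical setup: a local gene2go file (e.g. from download_ncbi_associations())
# and a few placeholder mouse NCBI GeneIDs.
objanno = Gene2GoReader('gene2go', taxids=[10090])
study_genes = [22059, 12575, 17246]
per_namespace = GOEA(study_genes, objanno)
print(per_namespace['BP'][:5])  # assumes 'BP' is one of the module-level ontologies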
Example #9
 def _init_objgoeans(self, pop):
     """Run gene ontology enrichment analysis (GOEA)."""
     ns2assoc = self.objanno.get_ns2assc(**self._get_anno_kws())
     ## BROAD rm_goids = self._get_remove_goids()
     rm_goids = False  # BROAD
     return GOEnrichmentStudyNS(
         pop,
         ns2assoc,
         self.godag,
         propagate_counts=not self.args.no_propagate_counts,
         relationships=self.args.relationships,
         alpha=self.args.alpha,
         pvalcalc=self.args.pvalcalc,
         methods=self.methods,
         remove_goids=rm_goids)
Example #10
def prep_goea(taxid=9606,
              prop_counts=True,
              alpha=0.05,
              method='fdr_bh',
              ref_list=None):
    ### DOWNLOAD AND LOAD ALL THE GENE STUFF for GOEA
    # download ontology
    from goatools.base import download_go_basic_obo
    obo_fname = download_go_basic_obo()

    # download associations
    from goatools.base import download_ncbi_associations
    fin_gene2go = download_ncbi_associations()

    # load ontology
    from goatools.obo_parser import GODag
    obodag = GODag("go-basic.obo")

    # load human gene ontology
    from goatools.anno.genetogo_reader import Gene2GoReader
    objanno = Gene2GoReader(fin_gene2go,
                            taxids=[taxid])  # 9606 is the taxonomy ID for Homo sapiens
    ns2assoc = objanno.get_ns2assc()
    for nspc, id2gos in ns2assoc.items():
        print("{NS} {N:,} annotated human genes".format(NS=nspc,
                                                        N=len(id2gos)))

    from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
    #pop_ids = pd.read_csv('../data/df_human_geneinfo.csv',index_col=0)['GeneID'].to_list()
    df_genehumans = pd.read_csv('../data/df_human_geneinfo.csv', index_col=0)

    # if no reference list is given, default to all genes in ABHA
    if ref_list is None:
        ref_list = df_genehumans['GeneID'].to_list()

    goeaobj = GOEnrichmentStudyNS(ref_list,
                                  ns2assoc,
                                  obodag,
                                  propagate_counts=prop_counts,
                                  alpha=alpha,
                                  methods=[method])

    # get symbol to ID translation dictionary to get overexpressed IDs
    symbol2id = dict(
        zip(df_genehumans['Symbol'].str.upper(), df_genehumans['GeneID']))

    return goeaobj, symbol2id
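A short usage sketch for prep_goea; the gene symbols are placeholders and the ../data/df_human_geneinfo.csv file the function reads is assumed to exist.

# Hypothetical study set given as gene symbols (placeholders).
goeaobj, symbol2id = prep_goea(taxid=9606, prop_counts=True, alpha=0.05, method='fdr_bh')
study_ids = [symbol2id[s] for s in ['TP53', 'EGFR', 'MYC'] if s in symbol2id]
results = goeaobj.run_study(study_ids)
significant = [r for r in results if r.p_fdr_bh < 0.05]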
Example #11
def load_files(obo_fname, fin_gene2go):
    """function to load ontologies, associations and background gene set and then initialise a GOEA object"""

    # import the python module created in generate_background()
    # specify the current folder location as the location of the module
    import sys

    sys.path.insert(1, ".")
    # import the module
    from genes_ncbi_3702_proteincoding import GENEID2NT as GeneID2nt_ara

    # load ontologies
    obodag = GODag(obo_fname)

    # load associations
    # Read NCBI's gene2go. Store Arabidopsis thaliana annotations in a list of named tuples
    objanno = Gene2GoReader(fin_gene2go, taxids=[3702])
    # Get namespace2association where:
    #    namespace is:
    #        BP: biological_process
    #        MF: molecular_function
    #        CC: cellular_component
    #    association is a dict:
    #        key: NCBI GeneID
    #        value: A set of GO IDs associated with that gene
    ns2assoc = objanno.get_ns2assc()

    for nspc, id2gos in ns2assoc.items():
        print("{NS} {N:,} annotated Arabidopsis genes".format(NS=nspc,
                                                              N=len(id2gos)))

    goeaobj = GOEnrichmentStudyNS(
        GeneID2nt_ara.keys(),  # List of filtered Arabidopsis protein-coding genes
        ns2assoc,  # geneid/GO associations
        obodag,  # Ontologies
        propagate_counts=False,
        alpha=0.05,  # default significance cut-off
        methods=["fdr_bh"],
    )  # default multipletest correction method

    return goeaobj, obodag, ns2assoc
Example #12
def plot_go_enrichment(coef_df, auc_vals, pheno_dict, args, mode='abs'):
    obo_fl = os.path.join(args.go_dir, "go-basic.obo")
    download_go_basic_obo(obo_fl)
    obodag = GODag(obo_fl)

    assoc_fl = os.path.join(args.go_dir, "gene2go")
    download_ncbi_associations(assoc_fl)
    objanno = Gene2GoReader(assoc_fl, taxids=[9606])
    ns2assoc = objanno.get_ns2assc()

    ncbi_map = {info.Symbol: ncbi_id for ncbi_id, info in GENEID2NT.items()}
    use_genes = set(coef_df.columns) & set(ncbi_map)
    bgrd_ids = [ncbi_map[gn] for gn in use_genes]

    goeaobj = GOEnrichmentStudyNS(bgrd_ids,
                                  ns2assoc,
                                  obodag,
                                  propagate_counts=False,
                                  alpha=0.05,
                                  methods=['fdr_bh'])

    plot_dict = dict()
    use_gos = set()
    coef_mat = coef_df.loc[:, [gene in use_genes for gene in coef_df.columns]]

    if mode == 'bayes':
        coef_means = coef_mat.groupby(level=0, axis=1).mean()
        coef_stds = coef_mat.groupby(level=0, axis=1).std()
    else:
        coef_mat = coef_mat.groupby(level=0, axis=1).mean()

    for mtype, coefs in coef_mat.iterrows():
        if not isinstance(mtype, RandomType):
            if mode == 'abs':
                fgrd_ctf = coefs.abs().quantile(0.95)
                fgrd_genes = coefs.index[coefs.abs() > fgrd_ctf]
                use_clr = 3.17

            elif mode == 'high':
                fgrd_ctf = coefs.quantile(0.95)
                fgrd_genes = coefs.index[coefs > fgrd_ctf]
                use_clr = 2.03
            elif mode == 'low':
                fgrd_ctf = coefs.quantile(0.05)
                fgrd_genes = coefs.index[coefs < fgrd_ctf]
                use_clr = 1.03

            elif mode == 'bayes':
                gene_scrs = coef_means.loc[mtype].abs() - coef_stds.loc[mtype]
                fgrd_genes = gene_scrs.index[gene_scrs > 0]
                use_clr = 3.17

            else:
                raise ValueError(
                    "Unrecognized `mode` argument <{}>!".format(mode))

            fgrd_ids = [ncbi_map[gn] for gn in fgrd_genes]
            goea_out = goeaobj.run_study(fgrd_ids, prt=None)

            plot_dict[mtype] = {
                rs.name: np.log10(rs.p_fdr_bh)
                for rs in goea_out
                if rs.enrichment == 'e' and rs.p_fdr_bh < 0.05
            }

    plot_df = pd.DataFrame(plot_dict, columns=plot_dict.keys())
    if plot_df.shape[0] == 0:
        print("Could not find any enriched GO terms across {} "
              "subgroupings!".format(plot_df.shape[1]))
        return None

    fig, ax = plt.subplots(figsize=(4.7 + plot_df.shape[0] / 2.3,
                                    2 + plot_df.shape[1] / 5.3))

    if plot_df.shape[0] > 2:
        plot_df = plot_df.iloc[dendrogram(linkage(distance.pdist(
            plot_df.fillna(0.0), metric='cityblock'),
                                                  method='centroid'),
                                          no_plot=True)['leaves']].transpose()
    else:
        plot_df = plot_df.transpose()

    xlabs = [rs_nm for rs_nm in plot_df.columns]
    ylabs = [
        get_fancy_label(tuple(mtype.subtype_iter())[0][1])
        for mtype in plot_df.index
    ]

    pval_cmap = sns.cubehelix_palette(start=use_clr,
                                      rot=0,
                                      dark=0,
                                      light=1,
                                      reverse=True,
                                      as_cmap=True)

    sns.heatmap(plot_df,
                cmap=pval_cmap,
                vmin=-5,
                vmax=0,
                linewidths=0.23,
                linecolor='0.73',
                xticklabels=xlabs,
                yticklabels=ylabs)

    ax.set_xticklabels(xlabs, size=15, ha='right', rotation=31)
    ax.set_yticklabels(ylabs, size=9, ha='right', rotation=0)
    ax.set_xlim((plot_df.shape[1] / -83, plot_df.shape[1] * 1.009))
    ax.set_ylim((plot_df.shape[0] * 1.009, plot_df.shape[0] / -83))

    plt.savefig(os.path.join(
        plot_dir, '__'.join([args.expr_source, args.cohort]),
        "{}_go-{}-enrichment_{}.svg".format(args.gene, mode, args.classif)),
                bbox_inches='tight',
                format='svg')

    plt.close()
Example #13
if download:
    obo_fname = download_go_basic_obo()
    fin_gene2go = download_ncbi_associations()

if not download:
    obo_fname = '/d1/studies/singleCellTools/go-basic.obo'
    fin_gene2go = '/d1/studies/singleCellTools/gene2go'

obodag = GODag("go-basic.obo")
objanno = Gene2GoReader(fin_gene2go, taxids=[10090])
ns2assoc = objanno.get_ns2assc()
for nspc, id2gos in ns2assoc.items():
    print("{NS} {N:,} annotated mouse genes".format(NS=nspc, N=len(id2gos)))

goeaobj = GOEnrichmentStudyNS(
    GeneID2nt_mus.keys(), # List of mouse protein-coding genes
    ns2assoc, # geneid/GO associations
    obodag, # Ontologies
    propagate_counts = False,
    alpha = 0.05, # default significance cut-off
    methods = ['fdr_bh']) # default multipletest correction method


def getEntrezIDs(genes):
    ID = []
    IDs = []
    nullGenes = []
    for gene in genes:
        try:
            gene_info = geneIndex.loc[geneIndex.symbol==str(gene)]
            entrez_id = gene_info['entrez_id'].astype(int)
            eid = entrez_id.unique()
            if len(eid) == 0:
Example #14
class Pose(object):
    def __init__(self, data_dir: str, device='cpu'):
        # load pretrained model
        self.model, self.name = self.__pretrained_model_construction__()
        self.model.load_state_dict(
            torch.load(data_dir + self.name + '-model.pt'))

        self.device = device
        self.__GO_enrich__()

    def __GO_enrich__(self):
        go_file = "go-basic.obo"
        if not os.path.exists(go_file):
            download_go_basic_obo()

        # Load gene ontologies
        obodag = GODag("go-basic.obo")

        # Read NCBI's gene2go. Store annotations in a list of namedtuples
        fin_gene2go = download_ncbi_associations()
        objanno = Gene2GoReader(fin_gene2go, taxids=[9606])
        # Get namespace2association where:
        #    namespace is:
        #        BP: biological_process
        #        MF: molecular_function
        #        CC: cellular_component
        #    association is a dict:
        #        key: NCBI GeneID
        #        value: A set of GO IDs associated with that gene
        ns2assoc = objanno.get_ns2assc()

        self.goeaobj = GOEnrichmentStudyNS(
            GeneID2nt_hum.keys(),  # List of human protein-coding genes
            ns2assoc,  # geneID/GO associations
            obodag,  # Ontologies
            propagate_counts=False,
            alpha=0.05,  # default significance cut-off
            methods=['fdr_bh'])  # default multipletest correction method

    def __pretrained_model_construction__(self):
        nhids_gcn = [64, 32, 32]
        prot_out_dim = sum(nhids_gcn)
        drug_dim = 128
        pp = PP(gdata.n_prot, nhids_gcn)
        pd = PD(prot_out_dim, drug_dim, gdata.n_drug)
        mip = MultiInnerProductDecoder(drug_dim + pd.d_dim_feat, gdata.n_et)
        name = 'poly-' + str(nhids_gcn) + '-' + str(drug_dim)

        return Model(pp, pd, mip).to('cpu'), name

    def get_prediction_train(self, threshold=0.5):
        train_idx, train_et = remove_bidirection(gdata.train_idx,
                                                 gdata.train_et)

        return self.predict(train_idx[0].tolist(),
                            train_idx[1].tolist(),
                            train_et.tolist(),
                            threshold=threshold)

    def get_prediction_test(self, threshold=0.5):
        test_idx, test_et = remove_bidirection(gdata.test_idx, gdata.test_et)

        return self.predict(test_idx[0].tolist(),
                            test_idx[1].tolist(),
                            test_et.tolist(),
                            threshold=threshold)

    def predict(self, drug1, drug2, side_effect, threshold=0.5):
        device = self.device
        data = gdata.to(device)
        model = self.model.to(device)
        model.eval()

        pp_static_edge_weights = torch.ones(
            (data.pp_index.shape[1])).to(device)
        pd_static_edge_weights = torch.ones(
            (data.pd_index.shape[1])).to(device)
        z = model.pp(data.p_feat, data.pp_index, pp_static_edge_weights)
        z0 = z.clone()
        z1 = z.clone()

        # prediction based on all info
        z = model.pd(z, data.pd_index, pd_static_edge_weights)
        P = torch.sigmoid((z[drug1] * z[drug2] *
                           model.mip.weight[side_effect]).sum(dim=1)).to('cpu')

        index_filter = P > threshold
        drug1 = torch.Tensor(drug1)[index_filter].numpy().astype(int).tolist()
        if not drug1:
            raise ValueError(
                "No Satisfied Edges." +
                "\n - Suggestion: reduce the threshold probability." +
                "Current probability threshold is {}. ".format(threshold) +
                "\n - Please use -h for help")

        drug2 = torch.Tensor(drug2)[index_filter].numpy().astype(int).tolist()
        side_effect = torch.Tensor(side_effect)[index_filter].numpy().astype(
            int).tolist()

        # prediction based on protein info and their interactions
        z0.data[:, 64:] *= 0
        z0 = model.pd(z0, data.pd_index, pd_static_edge_weights)
        P0 = torch.sigmoid(
            (z0[drug1] * z0[drug2] *
             model.mip.weight[side_effect]).sum(dim=1)).to("cpu")
        ppiu_score = (P[index_filter] - P0) / P[index_filter]

        # prediction based on drug info only
        z1.data *= 0
        z1 = model.pd(z1, data.pd_index, pd_static_edge_weights)
        P1 = torch.sigmoid(
            (z1[drug1] * z1[drug2] *
             model.mip.weight[side_effect]).sum(dim=1)).to("cpu")
        piu_score = (P[index_filter] - P1) / P[index_filter]

        # return a query object
        query = PoseQuery(drug1, drug2, side_effect)
        query.set_pred_result(P[index_filter].tolist(), piu_score.tolist(),
                              ppiu_score.tolist())

        return query

    def explain_list(self,
                     drug_list_1,
                     drug_list_2,
                     side_effect_list,
                     regulization=2,
                     if_auto_tuning=True,
                     if_pred=True):
        if if_pred:
            query = self.predict(drug_list_1, drug_list_2, side_effect_list)
        else:
            query = PoseQuery(drug_list_1, drug_list_2, side_effect_list,
                              regulization)
        return self.explain_query(query,
                                  if_auto_tuning=if_auto_tuning,
                                  regulization=query.regulization)

    def explain_query(self, query, if_auto_tuning=True, regulization=2):
        query.regulization = regulization

        pp_left_index, pp_left_weight, pd_left_index, pd_left_weight = self.__explain(
            query)

        if if_auto_tuning:
            while pp_left_index.shape[1] == 0:
                if query.regulization < 0.0001:
                    print("Warning: auto tuning forced to stop.")
                    break
                query.regulization /= 2
                pp_left_index, pp_left_weight, pd_left_index, pd_left_weight = self.__explain(
                    query)

        query.set_exp_result(pp_left_index, pp_left_weight, pd_left_index,
                             pd_left_weight)

        goea_results_sig = self.enrich_go(pp_left_index)
        query.set_enrich_result(goea_results_sig)

        return query

    def enrich_go(self, pp_left_index):
        # -------------- Go Enrichment --------------
        geneids_study = pp_left_index.flatten()  # geneid2symbol.keys()
        geneids_study = [
            int(gdata.prot_idx_to_id[idx].replace('GeneID', ''))
            for idx in geneids_study
        ]

        goea_results_all = self.goeaobj.run_study(geneids_study)
        goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]

        return goea_results_sig

    def __explain(self, query):

        data = gdata
        model = self.model
        device = self.device

        drug_list_1, drug_list_2, side_effect_list, regulization = query.get_query(
        )

        pre_mask = Pre_mask(data.pp_index.shape[1] // 2,
                            data.pd_index.shape[1]).to(device)
        data = data.to(device)
        model = model.to(device)

        for gcn in self.model.pp.conv_list:
            gcn.cached = False
        self.model.pd.conv.cached = False
        self.model.eval()

        # pp_static_edge_weights = torch.ones((data.pp_index.shape[1])).to(device)
        # pd_static_edge_weights = torch.ones((data.pd_index.shape[1])).to(device)

        optimizer = torch.optim.Adam(pre_mask.parameters(), lr=0.01)
        fake_optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

        # z = model.pp(data.p_feat, data.pp_index, pp_static_edge_weights)
        # z = model.pd(z, data.pd_index, pd_static_edge_weights)

        # # P = torch.sigmoid((z[drug1, :] * z[drug2, :] * model.mip.weight[side_effect, :]).sum())
        # P = torch.sigmoid((z[drug_list_1] * z[drug_list_2] * model.mip.weight[side_effect_list]).sum(dim=1))

        # if len(drug_list_1) < 5:
        #     print(P.tolist())

        tmp = 0.0
        pre_mask.reset_parameters()
        for i in range(9999):
            model.train()
            pre_mask.desaturate()
            optimizer.zero_grad()
            fake_optimizer.zero_grad()

            # half_mask = torch.sigmoid(pre_mask.pp_weight)
            half_mask = torch.nn.Hardtanh(0, 1)(pre_mask.pp_weight)
            pp_mask = torch.cat([half_mask, half_mask])

            pd_mask = torch.nn.Hardtanh(0, 1)(pre_mask.pd_weight)

            z = model.pp(data.p_feat, data.pp_index, pp_mask)

            # TODO:
            # z = model.pd(z, data.pd_index, pd_static_edge_weights)
            z = model.pd(z, data.pd_index, pd_mask)
            # TODO:

            # P = torch.sigmoid((z[drug1, :] * z[drug2, :] * model.mip.weight[side_effect, :]).sum())
            P = torch.sigmoid((z[drug_list_1] * z[drug_list_2] *
                               model.mip.weight[side_effect_list]).sum(dim=1))
            EPS = 1e-7

            # TODO:
            loss = torch.log(1 - P + EPS).sum() / regulization \
                   + 0.5 * (pp_mask * (2 - pp_mask)).sum() \
                   + (pd_mask * (2 - pd_mask)).sum()
            # loss = -  torch.log(P) + 0.5 * (pp_mask * (2 - pp_mask)).sum() + (pd_mask * (2 - pd_mask)).sum()
            # TODO:

            loss.backward()
            optimizer.step()
            # print("Epoch:{}, loss:{}, prob:{}, pp_link_sum:{}, pd_link_sum:{}".format(i, loss.tolist(), P.tolist(), pp_mask.sum().tolist(), pd_mask.sum().tolist()))
            if i % 100 == 0:
                print(
                    "Epoch:{:3d}, loss:{:0.2f}, prob:{:0.2f}, pp_link_sum:{:0.2f}, pd_link_sum:{:0.2f}"
                    .format(i, loss.tolist(),
                            P.mean().tolist(),
                            pp_mask.sum().tolist(),
                            pd_mask.sum().tolist()))

            # until no weight need to be updated --> no sum of weights changes
            if tmp == (pp_mask.sum().tolist(), pd_mask.sum().tolist()):
                break
            else:
                tmp = (pp_mask.sum().tolist(), pd_mask.sum().tolist())

        pre_mask.saturate()

        pp_left_mask = (pp_mask > 0.2).detach().cpu().numpy()
        tmp = (data.pp_index[0, :] >
               data.pp_index[1, :]).detach().cpu().numpy()
        pp_left_mask = np.logical_and(pp_left_mask, tmp)

        pd_left_mask = (pd_mask > 0.2).detach().cpu().numpy()

        pp_left_index = data.pp_index[:, pp_left_mask].cpu().numpy()
        pd_left_index = data.pd_index[:, pd_left_mask].cpu().numpy()

        pp_left_weight = pp_mask[pp_left_mask].detach().cpu().numpy()
        pd_left_weight = pd_mask[pd_left_mask].detach().cpu().numpy()

        return pp_left_index, pp_left_weight, pd_left_index, pd_left_weight
Example #15
def go_enrichment(gene_list,
                  taxid=9606,
                  background_chrom=None,
                  background_genes=None,
                  terms=None,
                  list_study_genes=False,
                  alpha=0.05):

    if type(gene_list) is pd.core.series.Series:
        gene_list = gene_list.tolist()
    if type(terms) is pd.core.series.Series:
        terms = terms.tolist()

    _assert_entrez_email()

    gene_list = list(gene_list)

    taxid = _tidy_taxid(taxid)

    ncbi_tsv = f'geneinfo_cache/{taxid}_protein_genes.txt'
    if not os.path.exists(ncbi_tsv):
        fetch_background_genes(taxid)

    with open(os.devnull, 'w') as null, redirect_stdout(null):

        obo_fname = download_and_move_go_basic_obo(prt=null)

        file_gene2go = download_ncbi_associations(prt=null)

        obodag = GODag("geneinfo_cache/go-basic.obo",
                       optional_attrs=['relationship', 'def'],
                       prt=null)

        # read NCBI's gene2go. Store annotations in a list of namedtuples
        objanno = Gene2GoReader(file_gene2go, taxids=[taxid])

        # get associations for each branch of the GO DAG (BP, MF, CC)
        ns2assoc = objanno.get_ns2assc()

        # limit go dag to a sub graph including only specified terms and their children
        if terms is not None:
            sub_obo_name = 'geneinfo_cache/' + str(
                hash(''.join(sorted(terms)).encode())) + '.obo'
            wrsobo = WrSubObo(obo_fname,
                              optional_attrs=['relationship', 'def'])
            wrsobo.wrobo(sub_obo_name, terms)
            obodag = GODag(sub_obo_name,
                           optional_attrs=['relationship', 'def'],
                           prt=null)

        # load background gene set of all genes
        background_genes_file = f'geneinfo_cache/{taxid}_protein_genes.txt'
        if not os.path.exists(background_genes_file):
            fetch_background_genes(taxid)

        # load any custom subset
        if background_genes:
            if not all(type(x) is int for x in background_genes):
                if all(x.isnumeric() for x in background_genes):
                    background_genes = list(map(int, background_genes))
                else:
                    background_genes = _cached_symbol2ncbi(background_genes,
                                                           taxid=taxid)
            df = pd.read_csv(background_genes_file, sep='\t')
            no_suffix = os.path.splitext(background_genes_file)[0]
            background_genes_file = f'{no_suffix}_{hash("".join(map(str, sorted(background_genes))))}.txt'
            df.loc[df.GeneID.isin(background_genes)].to_csv(
                background_genes_file, sep='\t', index=False)

        # limit background gene set
        if background_chrom is not None:
            df = pd.read_csv(background_genes_file, sep='\t')
            background_genes_file = f'{os.path.splitext(background_genes_file)[0]}_{background_chrom}.txt'
            df.loc[df.chromosome == background_chrom].to_csv(
                background_genes_file, sep='\t', index=False)

        output_py = f'geneinfo_cache/{taxid}_background.py'
        ncbi_tsv_to_py(background_genes_file, output_py, prt=null)

        background_genes_name = output_py.replace('.py', '').replace('/', '.')
        background_genes = importlib.import_module(background_genes_name)
        importlib.reload(background_genes)
        GeneID2nt = background_genes.GENEID2NT

        if not all(type(x) is int for x in gene_list):
            gene_list = _cached_symbol2ncbi(gene_list, taxid=taxid)

        goeaobj = GOEnrichmentStudyNS(
            GeneID2nt,  # Background (population) protein-coding genes for the taxid
            ns2assoc,  # geneid/GO associations
            obodag,  # Ontologies
            propagate_counts=False,
            alpha=0.05,  # default significance cut-off
            methods=['fdr_bh'],
            pvalcalc='fisher_scipy_stats')

        goea_results_all = goeaobj.run_study(gene_list)

        rows = []
        columns = [
            'namespace', 'term_id', 'e/p', 'pval_uncorr', 'p_fdr_bh', 'ratio',
            'bg_ratio', 'obj'
        ]
        if list_study_genes:
            columns.append('study_genes')
        for ntd in goea_results_all:

            ntd.__class__ = My_GOEnrichemntRecord  # Hack. Changes __class__ of all instances...

            row = [
                ntd.NS, ntd.GO, ntd.enrichment, ntd.p_uncorrected,
                ntd.p_fdr_bh, ntd.ratio_in_study[0] / ntd.ratio_in_study[1],
                ntd.ratio_in_pop[0] / ntd.ratio_in_pop[1], ntd
            ]

            if list_study_genes:
                row.append(_cached_ncbi2symbol(sorted(ntd.study_items)))
            rows.append(row)
        df = (pd.DataFrame().from_records(rows, columns=columns).sort_values(
            by=['p_fdr_bh', 'ratio']).reset_index(drop=True))
        return df.loc[df.p_fdr_bh < alpha]
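A hedged example of calling go_enrichment above; the gene symbols are placeholders, and the geneinfo_cache files are fetched by the helper functions the snippet already depends on.

# Hypothetical call with placeholder human gene symbols, restricted to DNA repair (GO:0006281).
df = go_enrichment(['TP53', 'BRCA1', 'ATM'],
                   taxid=9606,
                   terms=['GO:0006281'],
                   alpha=0.05)
print(df[['namespace', 'term_id', 'p_fdr_bh']].head())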
Example #16
### to download a set of background population genes from NCBI.

from genes_ncbi_10090_proteincoding import GENEID2NT as GeneID2nt_mus

###       3. INITIALIZE A GOEA object

### The GOEA object holds the Ontologies, Associations, and background.
###  Numerous studies can then be run without needing to re-load the above items.
### In this case, we only run one GOEA.

from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS

goeaobj = GOEnrichmentStudyNS(
        GeneID2nt_mus,                             # List of mouse protein-coding genes
        ns2assoc,                                  # geneid/GO associations
        obodag,                                    # Ontologies
        propagate_counts = False,
        alpha = 0.05,                              # default significance cut-off
        methods = ['fdr_bh'])                      # default multipletest correction method

###         4. READ STUDY GENES


# Data will be stored in this variable
import os
geneid2symbol = {}
# Get xlsx filename where data is stored
din_xlsx = r"C:\Users\krishna\Downloads\padj_converted.xlsx" ###excel file containing 3 columns:
                                                        ### gene_symbols (our test data), their respective ESENMBL gene ids, and their p adj values (test_data)

# Read data
Example #17
def pullGOenrichment(inputFile, project):
    GeneID2nt_hum = genes_NCBI_9606_ProteinCoding.GENEID2NT

    obo_fname = download_go_basic_obo()

    fin_gene2go = download_ncbi_associations()

    obodag = GODag("go-basic.obo")

    # Read NCBI's gene2go. Store annotations in a list of namedtuples
    objanno = Gene2GoReader(fin_gene2go, taxids=[9606])

    # Get namespace2association where:
    #    namespace is:
    #        BP: biological_process
    #        MF: molecular_function
    #        CC: cellular_component
    #    association is a dict:
    #        key: NCBI GeneID
    #        value: A set of GO IDs associated with that gene
    ns2assoc = objanno.get_ns2assc()

    for nspc, id2gos in ns2assoc.items():
        print("{NS} {N:,} annotated human genes".format(NS=nspc,
                                                        N=len(id2gos)))

    print(len(GeneID2nt_hum))

    goeaobj = GOEnrichmentStudyNS(
        GeneID2nt_hum.keys(),  # List of human protein-coding genes
        ns2assoc,  # geneid/GO associations
        obodag,  # Ontologies
        propagate_counts=False,
        alpha=0.05,  # default significance cut-off
        methods=['fdr_bh'])  # default multipletest correction method

    geneid2symbol = {}
    with open(inputFile, 'r') as infile:
        input_genes = csv.reader(infile)
        for line in input_genes:
            geneid = line[0]
            symbol = line[1]
            if geneid:
                geneid2symbol[int(geneid)] = symbol

    infile.close()

    geneids_study = geneid2symbol.keys()
    goea_results_all = goeaobj.run_study(geneids_study)
    goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]

    import collections as cx
    ctr = cx.Counter([r.NS for r in goea_results_sig])
    print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
        TOTAL=len(goea_results_sig),
        BP=ctr['BP'],  # biological_process
        MF=ctr['MF'],  # molecular_function
        CC=ctr['CC']))  # cellular_component

    goeaobj.wr_xlsx("Data/go_enrichment" + project + ".xlsx", goea_results_sig)
    goeaobj.wr_txt("Data/go_enrichment" + project + ".txt", goea_results_sig)
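A minimal sketch of invoking pullGOenrichment; the CSV path and project label are placeholders, and the input file is expected to contain geneid,symbol rows as parsed above.

# Hypothetical input file and project label.
pullGOenrichment('Data/study_genes.csv', 'demo_project')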
Example #18
    def keep_only_go(x):
        golist = x.split(';')
        cleanlist = {i.split('#')[-1] for i in golist}
        return cleanlist

    dfA['BP'] = dfA['BP'].apply(keep_only_go)

    dfA['UniProtKB/Swiss-Prot ID'] = dfA.index.map(dfQ.reset_index().set_index('PANTHER ID')['UniProtKB/Swiss-Prot ID'].dropna().to_dict())
    dfA['Gene stable ID'] = dfA.index.map(dfQ.reset_index().set_index('PANTHER ID')['Gene stable ID'].dropna().to_dict())

    ns2assoc_combined = dfA.set_index('UniProtKB/Swiss-Prot ID').loc[:, ['BP']].to_dict()
    """

    # Gene Ontology Enrichment Analysis (GOEA)
    goea = GOEnrichmentStudyNS(pop=pop, ns2assoc=ns2assoc_combined, godag=godag, propagate_counts=True, alpha=0.05, methods=['fdr_bh'])

    # Load PCA
    rPCAFile = '../../04-network/results/pca/{celltype:s}/{layer:s}/pca-{celltype:s}-{network:s}-{threshold:s}-{layer:s}-dim.csv.gz'.format(celltype=celltype, network=network, threshold=threshold_str, layer=layer)
    rDiAnFile = '../../04-network/results/pca/{celltype:s}/{layer:s}/pca-{celltype:s}-{network:s}-{threshold:s}-{layer:s}-dian.csv.gz'.format(celltype=celltype, network=network, threshold=threshold_str, layer=layer)
    rEntFile = '../../04-network/results/pca/{celltype:s}/{layer:s}/pca-{celltype:s}-{network:s}-{threshold:s}-{layer:s}-entropy.csv.gz'.format(celltype=celltype, network=network, threshold=threshold_str, layer=layer)
    #
    wCSVFile = 'results/goea/{celltype:s}/goea-{celltype:s}-{network:s}-{threshold:s}-{layer:s}.csv.gz'.format(celltype=celltype, network=network, threshold=threshold_str, layer=layer)

    df_pca = pd.read_csv(rPCAFile, index_col=0)
    df_dian = pd.read_csv(rDiAnFile, index_col=0)
    df_ent = pd.read_csv(rEntFile, index_col=0)

    ldfS = []
    for module in modules:
Example #19
#    association is a dict:
#        key: NCBI GeneID
#        value: A set of GO IDs associated with that gene
ns2assoc = objanno.get_ns2assc()

for nspc, id2gos in ns2assoc.items():
    print("{NS} {N:,} annotated human  genes".format(NS=nspc, N=len(id2gos)))

print()
print(len(GeneID2nt_homo))
print()

goeaobj = GOEnrichmentStudyNS(
    GeneID2nt_homo.keys(),  # List of human protein-coding genes
    ns2assoc,  # geneid/GO associations
    obodag,  # Ontologies
    propagate_counts=False,
    alpha=0.05,  # default significance cut-off
    methods=['fdr_bh'])  # default multipletest correction method

symbols = np.zeros(len(GeneID2nt_homo.keys()), dtype='U100')
geneids = np.zeros(len(GeneID2nt_homo.keys()), dtype=int)

#Creating a lookup table to convert the gene symbols to the gene ids needed for the gene enrichment analysis
for idx, key in enumerate(GeneID2nt_homo.keys()):
    symbols[idx] = GeneID2nt_homo[key].Symbol
    geneids[idx] = GeneID2nt_homo[key].GeneID

boolean_symbol = np.isin(symbols, CDK1_gene_list)

matches_idx = np.where(boolean_symbol)[0]
Example #20
# select genes significantly differentially expressed according to BH FDR of sleuth
fdr_level_gene = float(snakemake.params.gene_fdr)
sig_genes = all_genes[all_genes['qval']<fdr_level_gene]


# initialize GOEA object
fdr_level_go_term = float(snakemake.params.go_term_fdr)

goeaobj = GOEnrichmentStudyNS(
    # list of 'population' genes looked at in total
    pop = all_genes['ens_gene'].tolist(),
    # geneid -> GO ID mapping
    ns2assoc = ns2assoc,
    # ontology DAG
    godag = obodag,
    propagate_counts = False,
    # multiple testing correction method (fdr_bh is false discovery rate control with Benjamini-Hochberg)
    methods = ['fdr_bh'],
    # significance cutoff for method named above
    alpha = fdr_level_go_term
    )

goea_results_all = goeaobj.run_study(sig_genes['ens_gene'].tolist())


# write results to text file
goeaobj.wr_tsv(snakemake.output.enrichment, goea_results_all)


# plot results
Example #21
#    namespace is:
#        BP: biological_process
#        MF: molecular_function
#        CC: cellular_component
#    association is a dict:
#        key: NCBI GeneID
#        value: A set of GO IDs associated with that gene
ns2assoc = objanno.get_ns2assc()

for nspc, id2gos in ns2assoc.items():
    print("{NS} {N:,} annotated human genes".format(NS=nspc, N=len(id2gos)))

goeaobj = GOEnrichmentStudyNS(
        GeneID2nt_hum.keys(),       # List of human protein-coding genes
        ns2assoc,                   # geneid/GO associations
        obodag,                     # Ontologies
        propagate_counts=False,
        alpha=0.05,                 # default significance cut-off
        methods=['fdr_bh'])         # default multipletest correction method

# 'p_' means "pvalue". 'fdr_bh' is the multipletest method we are currently using.
geneids_study = pp_idx.flatten()    # geneid2symbol.keys()
geneids_study = [int(data.prot_idx_to_id[idx].replace('GeneID', ''))
                 for idx in geneids_study]

# ##############################################################################
goea_results_all = goeaobj.run_study(geneids_study)
goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]
# ##############################################################################

n_sig = len(goea_results_sig)
Example #22
class GOEngine:
    def __init__(
        self,
        work_dir: str = '.',
        clean_work_dir: bool = False,
        organism: str = 'human',
        study_parameters: Dict[str, Union[int, float, str, List, Dict]] = {
            'propagate_counts': False,
            'alpha': 0.05,
            'methods': ['fdr_bh']
        }
    ) -> GOEngine:
        """A GOEngine that can be used for performing analysis using GOATOOLS

        Args:
            work_dir (str, optional): The path to a temp directory where intermediate results and raw data will be downloaded/written. Defaults to the current working directory.
            clean_work_dir (bool, optional): Whether or not to remove data written to the work directory at class termination. Defaults to False.
            organism (str, optional): The organism. Defaults to 'human'.
            study_parameters (Dict[str, Union[int, float, str, List, Dict]], optional): A dict of parameters passed to the underlying GOEnrichmentStudyNS. Defaults to {'propagate_counts': False, 'alpha': 0.05, 'methods': ['fdr_bh']}.
        Returns:
            GOEngine: a GO engine wrapping a GOEnrichmentStudyNS object for GO enrichment analysis.
        """
        print("Creating a GO Engine ...")
        if not os.path.exists(work_dir):
            raise ValueError(
                f"The provided work path: {work_dir} does not exist!!!")
        self.work_dir = work_dir
        if organism != 'human' and organism != 'mouse':
            raise ValueError(
                f"The provided organism: {organism} is not support, current engine mainly work with human and moues only"
            )
        print(f"\t --> Downloading data ...")
        obo_fname = download_go_basic_obo(
            os.path.join(work_dir, 'go-basic.obo'))
        gene2go_fname = download_ncbi_associations(
            os.path.join(work_dir, 'gene2go'))
        ## parse the GO terms
        print(
            f"\t --> parsing the data and initializing the base GOEA object...")
        obo_dag = GODag(obo_fname)
        if organism == 'human':
            self._goea_obj = GOEnrichmentStudyNS(
                gene2iden_human.keys(),
                Gene2GoReader(gene2go_fname, taxids=[9606]).get_ns2assc(),
                obo_dag, **study_parameters)
        else:
            self._goea_obj = GOEnrichmentStudyNS(
                gene2iden_human.keys(),
                Gene2GoReader(gene2go_fname, taxids=[10090]).get_ns2assc(),
                obo_dag, **study_parameters)
        self._clean_work_dir = clean_work_dir
        self._gene_ids = None
        return

    def load_data(self, exp: Experiment, num_proteins: int = -1) -> None:
        """Load the data to the Engine, so GOEA can be conducted 

        Args:
            exp (Experiment): An Experimental object to extract uniprot ids 
            num_proteins (int, optional): The number of proteins to be included in the analysis. Defaults -1 to which mean use all proteins,\
                 otherwise it uses the number of proteins provided by the user. note that the function is sorted by number of peptides per protein,\
                      that is the first 10 protein means, getting the top 10 protein with most peptides. 
        Raises:
            ValueError: if the function called while data being already associated with the engine from a previous call
        """
        if self._gene_ids is not None:
            raise ValueError(
                f"There is still data in the engine; the first 10 gene IDs are: {','.join(map(str, self._gene_ids[:10]))}. \
                Clean the engine of previous data using the clean_engine function and try again."
            )
        print(
            f"Getting the number of peptide per protein ..., started at: {time.ctime()}"
        )
        num_protein_per_peptides = exp.get_peptides_per_protein()
        if num_proteins == -1:
            list_proteins = num_protein_per_peptides.iloc[:, 0].to_list()
        else:
            list_proteins = num_protein_per_peptides.iloc[:, 0].to_list(
            )[:num_proteins]
        print(
            f"Map uniprot to Entrez gene ids ..., starting at: {time.ctime()}")
        self._gene_ids = [
            int(gene_id) for gene_id in map_from_uniprot_to_Entrez_Gene(
                list_proteins).iloc[:, 1].to_list()
        ]
        print(f"{len(self._gene_ids)} Genes have been correctly loaded")
        return

    def run_analysis(
        self,
        quite: bool = False,
        only_signifcant: bool = True,
        significance_level: float = 0.05,
        get_list_term: bool = False
    ) -> Union[pd.DataFrame, List[GOEnrichmentRecord]]:
        if quite:
            goea_results = self._goea_obj.run_study(self._gene_ids, prt=None)
        else:
            goea_results = self._goea_obj.run_study(self._gene_ids)
        if only_signifcant:
            goea_results = [
                res for res in goea_results
                if res.p_fdr_bh < significance_level
            ]
        if get_list_term:
            return goea_results
        else:
            self._goea_obj.wr_tsv(os.path.join(self.work_dir, 'temp_file.tsv'),
                                  goea_results)
            results_df = pd.read_csv(os.path.join(self.work_dir,
                                                  'temp_file.tsv'),
                                     sep='\t')
            os.system(f"rm -f {os.path.join(self.work_dir,'temp_file.tsv')}")
        return results_df

    def clean_engine(self) -> None:
        """Remove Current list of gene ids associated with the engine 
        """
        self._gene_ids = None
        return

    def __del__(self) -> None:
        """class destructor, clean work directory if  clean_work_dir is set to True 
        """
        if self.clean_work_dir: os.system(f"rm -f {self.work_dir}/*")
        return
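A usage sketch for the GOEngine class above, assuming an Experiment object from the surrounding project is available; the work directory and protein count are illustrative only.

# Hypothetical workflow: exp is an Experiment instance from the surrounding project.
engine = GOEngine(work_dir='.', organism='human')
engine.load_data(exp, num_proteins=100)  # top 100 proteins by peptide count
results_df = engine.run_analysis(quite=True, only_signifcant=True)
engine.clean_engine()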