Beispiel #1
0
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''import the KEGG annotations from the R KEGG.db annotations
    package.

    Note that since KEGG is no longer publicly available, this is
    not up-to-date and may be removed from bioconductor in future
    releases.

    The table written to `outfile` is tab-separated and has the
    columns ``ontology``, ``gene_id``, ``kegg_ID``, ``kegg_name`` and
    ``evidence``. ``ontology`` is always ``kegg`` and ``evidence`` is
    always ``NA``. Entries of ``KEGGEXTID2PATHID`` whose name cannot
    be converted to an integer, or that are missing from the entrez
    to ensembl mapping, are skipped.

    Arguments
    ---------
    outfile
        Name of the output file, opened with ``IOTools.openFile``.
    mart
        Passed as ``biomart`` to ``PipelineBiomart.biomart_iterator``.
    host
        Passed as ``host`` to ``PipelineBiomart.biomart_iterator``.
    biomart_dataset
        Passed as ``dataset`` to ``PipelineBiomart.biomart_iterator``.

    '''

    R.library("KEGG.db")

    E.info("getting entrez to ensembl mapping ...")
    biomart_rows = PipelineBiomart.biomart_iterator(
        ("ensembl_gene_id", "entrezgene"),
        biomart=mart,
        dataset=biomart_dataset,
        host=host,
        path="/biomart/martservice")

    # NOTE(review): the lookup below uses the integer form of the
    # entrez id, so this assumes biomart_iterator yields integer
    # ``entrezgene`` values - confirm against PipelineBiomart.
    entrez2ensembl = dict(
        (row['entrezgene'], row['ensembl_gene_id']) for row in biomart_rows)

    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # compiled once: extracts the numeric part of a pathway
    # identifier, which is the key used in pathid2name
    pathid_rx = re.compile("[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    # try/finally guarantees the handle is closed (and buffered output
    # flushed) even if a lookup inside the loop raises.
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        # rx2 did not work in rpy2 2.4.2 - workaround uses
        # absolute indices
        for gene_column, gene in enumerate(entrez2path.names):

            try:
                gene = int(gene)
            except ValueError:
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path[gene_column]:
                pathid = pathid_rx.match(pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write("\t".join(["kegg", ensid,
                                      str(pathway), pathname, "NA"]) + "\n")
    finally:
        outf.close()
Beispiel #2
0
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''import the KEGG annotations from the R KEGG.db annotations
    package.

    Note that since KEGG is no longer publicly available, this is
    not up-to-date and may be removed from bioconductor in future
    releases.

    The entrez to ensembl mapping is fetched through the R
    ``biomaRt`` package (``useMart`` followed by ``getBM``).

    The table written to `outfile` is tab-separated and has the
    columns ``ontology``, ``gene_id``, ``kegg_ID``, ``kegg_name`` and
    ``evidence``. ``ontology`` is always ``kegg`` and ``evidence`` is
    always ``NA``. Entries of ``KEGGEXTID2PATHID`` whose name cannot
    be converted to an integer, or that are missing from the entrez
    to ensembl mapping, are skipped.

    Arguments
    ---------
    outfile
        Name of the output file, opened with ``IOTools.openFile``.
    mart
        Passed as ``biomart`` to R's ``useMart``.
    host
        Passed as ``host`` to R's ``useMart``.
    biomart_dataset
        Passed as ``dataset`` to R's ``useMart``.

    '''

    R.library("KEGG.db")
    R.library("biomaRt")

    E.info("getting entrez to ensembl mapping ...")
    # keep the R mart object under its own name rather than
    # rebinding the ``mart`` parameter
    mart_object = R.useMart(biomart=mart,
                            host=host,
                            path="/biomart/martservice",
                            dataset=biomart_dataset)

    mapping_frame = R.getBM(
        attributes=ro.StrVector(["ensembl_gene_id", "entrezgene"]),
        mart=mart_object)

    entrez = mapping_frame.rx2("entrezgene")
    ensembl = mapping_frame.rx2("ensembl_gene_id")
    entrez2ensembl = dict(zip(entrez, ensembl))

    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # compiled once: extracts the numeric part of a pathway
    # identifier, which is the key used in pathid2name
    pathid_rx = re.compile("[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    # try/finally guarantees the handle is closed (and buffered output
    # flushed) even if a lookup inside the loop raises.
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        for gene in entrez2path.names:

            try:
                gene = int(gene)
            except ValueError:
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path.rx2(str(gene)):
                pathid = pathid_rx.match(pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write("\t".join(["kegg", ensid,
                                      str(pathway), pathname, "NA"]) + "\n")
    finally:
        outf.close()
Beispiel #3
0
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''import the KEGG annotations from the R KEGG.db annotations
    package.

    Note that since KEGG is no longer publicly available, this is
    not up-to-date and may be removed from bioconductor in future
    releases.

    The entrez to ensembl mapping is fetched through the R
    ``biomaRt`` package (``useMart`` followed by ``getBM``).

    The table written to `outfile` is tab-separated and has the
    columns ``ontology``, ``gene_id``, ``kegg_ID``, ``kegg_name`` and
    ``evidence``. ``ontology`` is always ``kegg`` and ``evidence`` is
    always ``NA``. Entries of ``KEGGEXTID2PATHID`` whose name cannot
    be converted to an integer, or that are missing from the entrez
    to ensembl mapping, are skipped.

    Arguments
    ---------
    outfile
        Name of the output file, opened with ``IOTools.openFile``.
    mart
        Passed as ``biomart`` to R's ``useMart``.
    host
        Passed as ``host`` to R's ``useMart``.
    biomart_dataset
        Passed as ``dataset`` to R's ``useMart``.

    '''

    R.library("KEGG.db")
    R.library("biomaRt")

    E.info("getting entrez to ensembl mapping ...")
    # keep the R mart object under its own name rather than
    # rebinding the ``mart`` parameter
    r_mart = R.useMart(biomart=mart,
                       host=host,
                       path="/biomart/martservice",
                       dataset=biomart_dataset)

    attributes = ro.StrVector(["ensembl_gene_id", "entrezgene"])
    bm_result = R.getBM(attributes=attributes, mart=r_mart)

    entrez = bm_result.rx2("entrezgene")
    ensembl = bm_result.rx2("ensembl_gene_id")
    entrez2ensembl = dict(zip(entrez, ensembl))

    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # compiled once: extracts the numeric part of a pathway
    # identifier, which is the key used in pathid2name
    pathid_rx = re.compile("[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    # try/finally guarantees the handle is closed (and buffered output
    # flushed) even if a lookup inside the loop raises.
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        for gene in entrez2path.names:

            try:
                gene = int(gene)
            except ValueError:
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path.rx2(str(gene)):
                pathid = pathid_rx.match(pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(
                        ["kegg", ensid, str(pathway), pathname, "NA"]) + "\n")
    finally:
        outf.close()
Beispiel #4
0
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''import the KEGG annotations from the R KEGG.db annotations
    package.

    Note that since KEGG is no longer publicly available, this is
    not up-to-date and may be removed from bioconductor in future
    releases.

    A warning is logged when `biomart_dataset` does not start with
    one of ``rnorvegicus``, ``scerevisiae``, ``hsapiens`` or
    ``mmusculus``, as the output is then likely to be empty.

    The table written to `outfile` is tab-separated and has the
    columns ``ontology``, ``gene_id``, ``kegg_ID``, ``kegg_name`` and
    ``evidence``. ``ontology`` is always ``kegg`` and ``evidence`` is
    always ``NA``. Entries of ``KEGGEXTID2PATHID`` whose name cannot
    be converted to an integer, or that are missing from the entrez
    to ensembl mapping, are skipped.

    Arguments
    ---------
    outfile
        Name of the output file, opened with ``IOTools.openFile``.
    mart
        Passed as ``biomart`` to ``PipelineBiomart.biomart_iterator``.
    host
        Passed as ``host`` to ``PipelineBiomart.biomart_iterator``.
    biomart_dataset : string
        Passed as ``dataset`` to ``PipelineBiomart.biomart_iterator``.

    '''

    # re.match anchors at the start, so the species must be the
    # prefix of the dataset name
    if not re.match("rnorvegicus|scerevisiae|hsapiens|mmusculus",
                    biomart_dataset):
        E.warn("KEGG.db doesn't map Entrez ids for %s, %s will"
               " likely be empty" % (biomart_dataset, outfile))

    R.library("KEGG.db")

    E.info("getting entrez to ensembl mapping ...")
    biomart_rows = PipelineBiomart.biomart_iterator(
        ("ensembl_gene_id", "entrezgene"),
        biomart=mart,
        dataset=biomart_dataset,
        host=host,
        path="/biomart/martservice")

    # NOTE(review): the lookup below uses the integer form of the
    # entrez id, so this assumes biomart_iterator yields integer
    # ``entrezgene`` values - confirm against PipelineBiomart.
    entrez2ensembl = dict((row['entrezgene'],
                           row['ensembl_gene_id'])
                          for row in biomart_rows)

    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # compiled once: extracts the numeric part of a pathway
    # identifier, which is the key used in pathid2name
    pathid_rx = re.compile("[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    # try/finally guarantees the handle is closed (and buffered output
    # flushed) even if a lookup inside the loop raises.
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        # rx2 did not work in rpy2 2.4.2 - workaround uses
        # absolute indices
        for gene_column, gene in enumerate(entrez2path.names):

            try:
                gene = int(gene)
            except ValueError:
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path[gene_column]:
                pathid = pathid_rx.match(pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(["kegg", ensid, str(pathway),
                               pathname, "NA"]) + "\n")
    finally:
        outf.close()
Beispiel #5
0
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''import the KEGG annotations from the R KEGG.db annotations
    package.

    .. note::

        Since KEGG is no longer publicly available, this is not
        up-to-date and may be removed from bioconductor in future
        releases

    A warning is logged when `biomart_dataset` does not start with
    one of ``rnorvegicus``, ``scerevisiae``, ``hsapiens`` or
    ``mmusculus``, as the output is then likely to be empty.

    The table written to outfile has the following columns:
    ``ontology``, ``gene_id``, ``kegg_ID``, ``kegg_name``,
    ``evidence``. ``ontology`` is always ``kegg`` and ``evidence`` is
    always ``NA``. Entries of ``KEGGEXTID2PATHID`` whose name cannot
    be converted to an integer, or that are missing from the entrez
    to ensembl mapping, are skipped.

    Arguments
    ---------
    outfile : string
        Output filename in :term:`tsv` format.
    mart : string
        Name of the biomart
    host : string
        Host name of the biomart server
    biomart_dataset : string
        Biomart dataset

    '''

    # re.match anchors at the start, so the species must be the
    # prefix of the dataset name
    if not re.match("rnorvegicus|scerevisiae|hsapiens|mmusculus",
                    biomart_dataset):
        E.warn("KEGG.db doesn't map Entrez ids for %s, %s will"
               " likely be empty" % (biomart_dataset, outfile))

    R.library("KEGG.db")

    E.info("getting entrez to ensembl mapping ...")

    # Generates an iterator containing the data from biomart
    biomart_rows = Biomart.biomart_iterator(
        ("ensembl_gene_id", "entrezgene"),
        biomart=mart,
        dataset=biomart_dataset,
        host=host,
        path="/biomart/martservice")

    # NOTE(review): the lookup below uses the integer form of the
    # entrez id, so this assumes biomart_iterator yields integer
    # ``entrezgene`` values - confirm against Biomart.
    entrez2ensembl = dict((row['entrezgene'],
                           row['ensembl_gene_id'])
                          for row in biomart_rows)

    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    # dict() consumes the zip directly; no intermediate list needed
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # compiled once: extracts the numeric part of a pathway
    # identifier, which is the key used in pathid2name
    pathid_rx = re.compile("[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    # try/finally guarantees the handle is closed (and buffered output
    # flushed) even if a lookup inside the loop raises.
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        # rx2 did not work in rpy2 2.4.2 - workaround uses
        # absolute indices
        for gene_column, gene in enumerate(entrez2path.names):

            try:
                gene = int(gene)
            except ValueError:
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path[gene_column]:
                pathid = pathid_rx.match(pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(["kegg", ensid, str(pathway),
                               pathname, "NA"]) + "\n")
    finally:
        outf.close()
Beispiel #6
0
    Differentially_significant_genes = nrDEG[nrDEG["P.Value"] < 0.05].query(
        '''logFC>0.5849625007211562 or logFC<-0.5849625007211562''')
    Differentially_significant_genes.to_csv(
        f"{path}Differentially_significant_genes.txt", sep="\t")

    ################# 基因功能富集分析 #################
    # 富集分析 https://www.jianshu.com/p/988d90484f77
    # BiocManager::install("org.Hs.eg.db")
    r("suppressMessages(library(clusterProfiler))")
    r("suppressMessages(library(org.Hs.eg.db))")
    symbol_id = robjects.StrVector(list(
        Differentially_significant_genes.index))
    # symbol_id 转 entre_id
    entre_id = r.unlist(
        r.bitr(symbol_id,
               fromType="SYMBOL",
               toType="ENTREZID",
               OrgDb="org.Hs.eg.db").rx2('ENTREZID'))
    # GO富集分析
    # ALL_R = r.enrichGO(entre_id, "org.Hs.eg.db", keyType="ENTREZID", ont='ALL', pvalueCutoff=0.05, pAdjustMethod="BH",
    #                   qvalueCutoff=0.1, readable=True)  # 一步到位
    BP_R = r.enrichGO(entre_id,
                      "org.Hs.eg.db",
                      keyType="ENTREZID",
                      ont="BP",
                      pvalueCutoff=0.05,
                      pAdjustMethod="BH",
                      qvalueCutoff=0.1,
                      readable=True)  # 3种分开进行富集
    MF_R = r.enrichGO(entre_id,
                      "org.Hs.eg.db",