def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''Import the KEGG annotations from the R KEGG.db annotations package.

    Note that since KEGG is no longer publicly available, this is not
    up-to-date and may be removed from bioconductor in future releases.

    One tab-separated row is written per (gene, pathway) pair, with the
    columns ``ontology``, ``gene_id``, ``kegg_ID``, ``kegg_name`` and
    ``evidence``. Only genes whose Entrez id has an Ensembl id in the
    biomart query result are written.

    Arguments
    ---------
    outfile
        Name of the file the table is written to (opened for writing
        through ``IOTools.openFile``).
    mart
        Passed as ``biomart`` to the biomart query.
    host
        Passed as ``host`` to the biomart query.
    biomart_dataset
        Passed as ``dataset`` to the biomart query.
    '''
    R.library("KEGG.db")

    E.info("getting entrez to ensembl mapping ...")
    biomart_rows = PipelineBiomart.biomart_iterator(
        ("ensembl_gene_id", "entrezgene"),
        biomart=mart,
        dataset=biomart_dataset,
        host=host,
        path="/biomart/martservice")
    entrez2ensembl = dict(
        (row['entrezgene'], row['ensembl_gene_id']) for row in biomart_rows)
    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # Compiled once: the pattern drops the leading lower-case prefix of a
    # pathway identifier and captures the digits used as key in pathid2name.
    pathway_rx = re.compile("[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    # try/finally guarantees the output is flushed and closed even if a
    # lookup below raises.
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        # rx2 did not work in rpy2 2.4.2 - workaround uses
        # absolute indices
        for gene_column, gene in enumerate(entrez2path.names):
            # names that are not integers cannot be looked up in the
            # entrez -> ensembl map and are skipped
            try:
                gene = int(gene)
            except ValueError:
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path[gene_column]:
                pathid = pathway_rx.match(pathway).group(1)
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(["kegg", ensid, str(pathway), pathname, "NA"]) +
                    "\n")
    finally:
        outf.close()
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''Import the KEGG annotations from the R KEGG.db annotations package.

    Note that since KEGG is no longer publicly available, this is not
    up-to-date and may be removed from bioconductor in future releases.

    The Entrez to Ensembl mapping is fetched with the R ``biomaRt``
    package (``useMart`` followed by ``getBM``). One tab-separated row
    is written per (gene, pathway) pair, with the columns ``ontology``,
    ``gene_id``, ``kegg_ID``, ``kegg_name`` and ``evidence``.

    Arguments
    ---------
    outfile
        Name of the file the table is written to (opened for writing
        through ``IOTools.openFile``).
    mart
        Passed as ``biomart`` to ``useMart``.
    host
        Passed as ``host`` to ``useMart``.
    biomart_dataset
        Passed as ``dataset`` to ``useMart``.
    '''
    R.library("KEGG.db")
    R.library("biomaRt")

    E.info("getting entrez to ensembl mapping ...")
    # Keep the R mart object under its own name instead of rebinding the
    # ``mart`` argument.
    mart_object = R.useMart(biomart=mart,
                            host=host,
                            path="/biomart/martservice",
                            dataset=biomart_dataset)
    id_table = R.getBM(
        attributes=ro.StrVector(["ensembl_gene_id", "entrezgene"]),
        mart=mart_object)
    entrez_ids = id_table.rx2("entrezgene")
    ensembl_ids = id_table.rx2("ensembl_gene_id")
    entrez2ensembl = dict(zip(entrez_ids, ensembl_ids))
    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # Compiled once: captures the numeric part of a pathway identifier,
    # which is the key used in pathid2name.
    pathway_rx = re.compile("[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    # Close the output on every exit path so buffered rows are not lost.
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        for gene in entrez2path.names:
            # skip names that are not integer identifiers
            try:
                gene = int(gene)
            except ValueError:
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            # the list is keyed by the identifier as a string
            for pathway in entrez2path.rx2(str(gene)):
                pathid = pathway_rx.match(pathway).group(1)
                pathname = pathid2name[pathid]
                fields = ["kegg", ensid, str(pathway), pathname, "NA"]
                outf.write("\t".join(fields) + "\n")
    finally:
        outf.close()
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''Import the KEGG annotations from the R KEGG.db annotations package.

    Note that since KEGG is no longer publicly available, this is not
    up-to-date and may be removed from bioconductor in future releases.

    Entrez gene ids are translated to Ensembl gene ids through a
    ``biomaRt`` query and joined with the KEGG.db pathway assignments.
    The resulting table has the tab-separated columns ``ontology``,
    ``gene_id``, ``kegg_ID``, ``kegg_name`` and ``evidence``.

    Arguments
    ---------
    outfile
        Name of the file the table is written to (opened for writing
        through ``IOTools.openFile``).
    mart
        Passed as ``biomart`` to ``useMart``.
    host
        Passed as ``host`` to ``useMart``.
    biomart_dataset
        Passed as ``dataset`` to ``useMart``.
    '''
    R.library("KEGG.db")
    R.library("biomaRt")

    E.info("getting entrez to ensembl mapping ...")
    # A separate local keeps the ``mart`` argument intact.
    r_mart = R.useMart(
        biomart=mart,
        host=host,
        path="/biomart/martservice",
        dataset=biomart_dataset)
    attributes = ro.StrVector(["ensembl_gene_id", "entrezgene"])
    bm_result = R.getBM(attributes=attributes, mart=r_mart)
    entrez2ensembl = dict(
        zip(bm_result.rx2("entrezgene"), bm_result.rx2("ensembl_gene_id")))
    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # Loop-invariant pattern: lower-case prefix followed by the digits
    # that key pathid2name.
    pathway_rx = re.compile("[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    # The handle is released in ``finally`` so a failing lookup cannot
    # leave the file open or partially flushed.
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        for gene in entrez2path.names:
            try:
                gene = int(gene)
            except ValueError:
                # not an integer identifier - nothing to map
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path.rx2(str(gene)):
                pathid = pathway_rx.match(pathway).group(1)
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(["kegg", ensid, str(pathway), pathname, "NA"]) +
                    "\n")
    finally:
        outf.close()
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''Import the KEGG annotations from the R KEGG.db annotations package.

    Note that since KEGG is no longer publicly available, this is not
    up-to-date and may be removed from bioconductor in future releases.

    A warning is logged when `biomart_dataset` does not start with one
    of ``rnorvegicus``, ``scerevisiae``, ``hsapiens`` or ``mmusculus``.
    The function still runs in that case.

    One tab-separated row is written per (gene, pathway) pair, with the
    columns ``ontology``, ``gene_id``, ``kegg_ID``, ``kegg_name`` and
    ``evidence``.

    Arguments
    ---------
    outfile
        Name of the file the table is written to (opened for writing
        through ``IOTools.openFile``).
    mart
        Passed as ``biomart`` to the biomart query.
    host
        Passed as ``host`` to the biomart query.
    biomart_dataset : string
        Passed as ``dataset`` to the biomart query.
    '''
    if not re.match("rnorvegicus|scerevisiae|hsapiens|mmusculus",
                    biomart_dataset):
        E.warn("KEGG.db doesn't map Entrez ids for %s, %s will"
               " likely be empty" % (biomart_dataset, outfile))

    R.library("KEGG.db")

    E.info("getting entrez to ensembl mapping ...")
    biomart_rows = PipelineBiomart.biomart_iterator(
        ("ensembl_gene_id", "entrezgene"),
        biomart=mart,
        dataset=biomart_dataset,
        host=host,
        path="/biomart/martservice")
    entrez2ensembl = dict(
        (row['entrezgene'], row['ensembl_gene_id']) for row in biomart_rows)
    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # Compiled once outside the loop; the captured digits are the key
    # into pathid2name.
    pathway_rx = re.compile("[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    # Always close the output, including when a lookup below raises.
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        # rx2 did not work in rpy2 2.4.2 - workaround uses
        # absolute indices
        for gene_column, gene in enumerate(entrez2path.names):
            try:
                gene = int(gene)
            except ValueError:
                # names that are not integers are skipped
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path[gene_column]:
                pathid = pathway_rx.match(pathway).group(1)
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(["kegg", ensid, str(pathway), pathname, "NA"]) +
                    "\n")
    finally:
        outf.close()
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''Import the KEGG annotations from the R KEGG.db annotations package.

    .. note::

        Since KEGG is no longer publicly available, this is not
        up-to-date and may be removed from bioconductor in future
        releases.

    The table written to outfile has the following columns:
    ``ontology``, ``gene_id``, ``kegg_ID``, ``kegg_name``,
    ``evidence``.

    A warning is logged when `biomart_dataset` does not start with one
    of ``rnorvegicus``, ``scerevisiae``, ``hsapiens`` or ``mmusculus``.
    The function still runs in that case.

    Arguments
    ---------
    outfile : string
        Output filename in :term:`tsv` format.
    mart : string
        Name of the biomart
    host : string
        Host name of the biomart server
    biomart_dataset : string
        Biomart dataset
    '''
    if not re.match("rnorvegicus|scerevisiae|hsapiens|mmusculus",
                    biomart_dataset):
        E.warn("KEGG.db doesn't map Entrez ids for %s, %s will"
               " likely be empty" % (biomart_dataset, outfile))

    R.library("KEGG.db")

    E.info("getting entrez to ensembl mapping ...")
    # Generates an iterator containing the data from biomart
    biomart_rows = Biomart.biomart_iterator(
        ("ensembl_gene_id", "entrezgene"),
        biomart=mart,
        dataset=biomart_dataset,
        host=host,
        path="/biomart/martservice")
    entrez2ensembl = dict(
        (row['entrezgene'], row['ensembl_gene_id']) for row in biomart_rows)
    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    # dict() consumes the zip directly; no intermediate list is needed
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    # Compiled once outside the loop; the captured digits are the key
    # into pathid2name.
    pathway_rx = re.compile("[a-z]+([0-9]+)")

    outf = IOTools.openFile(outfile, "w")
    # try/finally makes sure the output is flushed and closed on every
    # exit path.
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        # rx2 did not work in rpy2 2.4.2 - workaround uses
        # absolute indices
        for gene_column, gene in enumerate(entrez2path.names):
            try:
                gene = int(gene)
            except ValueError:
                # names that are not integers are skipped
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path[gene_column]:
                pathid = pathway_rx.match(pathway).group(1)
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(["kegg", ensid, str(pathway), pathname, "NA"]) +
                    "\n")
    finally:
        outf.close()
Differentially_significant_genes = nrDEG[nrDEG["P.Value"] < 0.05].query( '''logFC>0.5849625007211562 or logFC<-0.5849625007211562''') Differentially_significant_genes.to_csv( f"{path}Differentially_significant_genes.txt", sep="\t") ################# 基因功能富集分析 ################# # 富集分析 https://www.jianshu.com/p/988d90484f77 # BiocManager::install("org.Hs.eg.db") r("suppressMessages(library(clusterProfiler))") r("suppressMessages(library(org.Hs.eg.db))") symbol_id = robjects.StrVector(list( Differentially_significant_genes.index)) # symbol_id 转 entre_id entre_id = r.unlist( r.bitr(symbol_id, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db").rx2('ENTREZID')) # GO富集分析 # ALL_R = r.enrichGO(entre_id, "org.Hs.eg.db", keyType="ENTREZID", ont='ALL', pvalueCutoff=0.05, pAdjustMethod="BH", # qvalueCutoff=0.1, readable=True) # 一步到位 BP_R = r.enrichGO(entre_id, "org.Hs.eg.db", keyType="ENTREZID", ont="BP", pvalueCutoff=0.05, pAdjustMethod="BH", qvalueCutoff=0.1, readable=True) # 3种分开进行富集 MF_R = r.enrichGO(entre_id, "org.Hs.eg.db",