def AnnotateWithMGI(infile, outfile):
    '''
    Annotate genes with mouse phenotypes provided through MGI at
    mousemine.org, starting from the list of mouse gene symbols
    generated by the homologene step.

    infile is passed to PipelineGeneInfo.getSymbols to obtain the
    symbols; outfile is not referenced inside this function.

    Tables:
    ensemblg2mgi$annot - original host ensemblg to mouse phenotype ID
    mgi$details - mouse phenotype ID to mouse phenotype details
    '''
    mouse_symbols = PipelineGeneInfo.getSymbols(infile)

    # Settings for the annotation object, read from the pipeline parameters.
    mousemine_source = PARAMS['homologues_mousemine']
    database = PARAMS['db_name']

    annotator = PipelineGeneInfo.MGIAnnotation(mousemine_source, database)
    PipelineGeneInfo.runall(annotator, mouse_symbols, submit=True)
def AnnotateWithMousePathway(infile, outfile):
    '''
    Annotate genes with mouse pathways provided at mousemine.org,
    starting from the list of mouse gene symbols generated by the
    homologene step.

    infile is passed to PipelineGeneInfo.getSymbols to obtain the
    symbols; outfile is not referenced inside this function.

    Tables:
    ensemblg2mousepathway$annot - original host ensemblg to mouse pathway ID
    mousepathway$details - mouse pathway ID to mouse pathway details
    '''
    mouse_symbols = PipelineGeneInfo.getSymbols(infile)

    # Settings for the annotation object, read from the pipeline parameters.
    mousemine_source = PARAMS['homologues_mousemine']
    database = PARAMS['db_name']

    annotator = PipelineGeneInfo.MousePathwayAnnotation(mousemine_source,
                                                        database)
    PipelineGeneInfo.runall(annotator, mouse_symbols, submit=True)
def AnnotateWithMGI(infile, outfile):
    '''
    Annotate genes with mouse phenotypes provided through MGI at
    mousemine.org, starting from the list of mouse gene symbols
    generated by the homologene step.

    Duplicate symbols are removed before the annotation is run.
    outfile is not referenced inside this function.

    Tables:
    ensemblg2mgi$annot - original host ensemblg to mouse phenotype ID
    mgi$details - mouse phenotype ID to mouse phenotype details
    '''
    all_symbols = PipelineGeneInfo.getSymbols(infile)
    # Collapse repeated symbols; the resulting order is whatever set gives.
    unique_symbols = list(set(all_symbols))

    mousemine_source = PARAMS['homologues_mousemine']
    database = PARAMS['db_name']
    original_host = PARAMS['entrez_host']

    annotator = PipelineGeneInfo.MGIAnnotation(mousemine_source,
                                               database,
                                               ohost=original_host)
    PipelineGeneInfo.runall(annotator, unique_symbols, submit=True)
def GetAndTranslateAllGenes(outfile):
    '''
    This step is required.

    1. All Entrez gene IDs are downloaded from entrez gene.
    2. Corresponding gene symbols are downloaded from mygene.info
    3. Corresponding ensembl gene, ensembl transcript and ensembl protein
       IDs are downloaded from mygene.info, queried by gene symbol
    4. These are loaded into the database
    5. The gene symbols are written to outfile, one symbol per line
       (e.g. 'allgenes.tsv')

    Tables:
    ensemblg2entrez$geneid - ensemblg to entrez ID
    ensemblg2ensemblt$other - ensemblg to ensembl transcript
    ensemblg2ensemblp$other - ensemblg to ensembl protein
    ensemblg2symbol_xxx$geneid - ensemblg to symbol in species xxx
    '''
    GeneAnnot = PipelineGeneInfo.EntrezGeneAnnotation(
        PARAMS['db_name'], PARAMS['entrez_email'])

    # In test mode count=100 is passed to download_all
    # (presumably limiting how many genes are fetched - confirm in
    # PipelineGeneInfo).
    if PARAMS['test'] == 1:
        entrezgenelist = GeneAnnot.download_all(PARAMS['entrez_host'],
                                                count=100)
    else:
        entrezgenelist = GeneAnnot.download_all(PARAMS['entrez_host'])

    # Generate a SymbolAnnotation object
    Sym = PipelineGeneInfo.SymbolAnnotation(PARAMS['my_gene_info_source'],
                                            PARAMS['db_name'],
                                            PARAMS['entrez_host'],
                                            PARAMS['entrez_sciname'])

    # Get Symbol Annotations
    PipelineGeneInfo.runall(Sym, entrezgenelist, ['symbol'],
                            scope='entrezgene',
                            species=PARAMS['entrez_host'],
                            submit=True)

    # Read the symbols back from entrez2symbol_<host>.tsv in the working
    # directory (presumably written by the symbol annotation step above).
    symbol_table = pd.read_csv(
        "entrez2symbol_%s.tsv" % PARAMS['entrez_host'], sep="\t")
    genesymbols = list(symbol_table['symbol_%s' % PARAMS['entrez_host']])

    # Generate an EnsemblAnnotation object
    Ens = PipelineGeneInfo.EnsemblAnnotation(PARAMS['my_gene_info_source'],
                                             PARAMS['db_name'],
                                             PARAMS['entrez_host'])

    # Get Ensembl annotations
    PipelineGeneInfo.runall(Ens, genesymbols, ['ensembl'],
                            scope="symbol",
                            species=PARAMS['entrez_host'],
                            submit=True)

    # Make output gene list.  The context manager guarantees the handle is
    # closed even if a write fails part-way through.
    with IOTools.openFile(outfile, "w") as outf:
        for gene in genesymbols:
            outf.write("%s\n" % gene)
def AnnotateWithHomologene(infile, outfile):
    '''
    Annotate every gene in allgenes.tsv with homologous gene symbols,
    taken from homologene via mygene.info, either for the species
    listed in the pipeline.ini or for all available species.

    outfile is not referenced inside this function.

    Tables:
    ensemblg2symbol_xxx$geneid - ensemblg in original species to symbol in xxx
    '''
    genes = PipelineGeneInfo.readGeneList(infile)

    # Positional constructor arguments, looked up in the order the
    # HomologeneAnnotation constructor receives them.
    setting_names = ('my_gene_info_source',
                     'db_name',
                     'my_gene_info_homologene',
                     'entrez_host',
                     'entrez_email')
    settings = [PARAMS[name] for name in setting_names]

    annotator = PipelineGeneInfo.HomologeneAnnotation(*settings)
    PipelineGeneInfo.runall(annotator, genes, ['homologene'], submit=True)
def AnnotateWithPathway(infile, outfile):
    '''
    Annotate every gene in allgenes.tsv with pathway details, either
    for all pathway databases available via mygene.info or for those
    specified in the pipeline.ini.

    outfile is not referenced inside this function.

    Tables:
    ensemblg2xxx$annot - ensemblg to ID in pathway database
    xxx$details - pathway database ID to pathway details
    '''
    genes = PipelineGeneInfo.readGeneList(infile)

    # Positional constructor arguments, looked up in the order the
    # PathwayAnnotation constructor receives them.
    setting_names = ('my_gene_info_source',
                     'db_name',
                     'my_gene_info_pathway')
    settings = [PARAMS[name] for name in setting_names]

    annotator = PipelineGeneInfo.PathwayAnnotation(*settings)
    PipelineGeneInfo.runall(annotator, genes, ['pathway'], submit=True)
def AnnotateWithMousePathway(infile, outfile):
    '''
    Annotate genes with mouse pathways provided at mousemine.org,
    starting from the list of mouse gene symbols generated by the
    homologene step.

    Duplicate symbols are removed before the annotation is run.
    outfile is not referenced inside this function.

    Tables:
    ensemblg2mousepathway$annot - original host ensemblg to mouse pathway ID
    mousepathway$details - mouse pathway ID to mouse pathway details
    '''
    all_symbols = PipelineGeneInfo.getSymbols(infile)
    # Collapse repeated symbols; the resulting order is whatever set gives.
    unique_symbols = list(set(all_symbols))

    mousemine_source = PARAMS['homologues_mousemine']
    database = PARAMS['db_name']
    original_host = PARAMS['entrez_host']

    annotator = PipelineGeneInfo.MousePathwayAnnotation(mousemine_source,
                                                        database,
                                                        ohost=original_host)
    PipelineGeneInfo.runall(annotator, unique_symbols, submit=True)
def AnnotateWithHPO(infile, outfile):
    '''
    Annotate genes with human phenotypes provided through HPO at
    humanmine.org, starting from the list of human gene symbols
    generated by the homologene step.

    After the phenotype annotation, an OntologyAnnotation object for
    'hpo' is built and run on the same gene list.
    outfile is not referenced inside this function.

    Tables:
    ensemblg2hpo$annot - original host ensemblg to human phenotype ID
    hpo$details - human phenotype ID to human phenotype details
    '''
    human_symbols = PipelineGeneInfo.getSymbols(infile)

    # Phenotype annotation from humanmine.
    humanmine_source = PARAMS['homologues_humanmine']
    database = PARAMS['db_name']
    phenotype_annotator = PipelineGeneInfo.HPOAnnotation(humanmine_source,
                                                         database)
    PipelineGeneInfo.runall(phenotype_annotator, human_symbols, submit=True)

    # Ontology annotation for HPO; note this calls the object's own
    # runall method rather than PipelineGeneInfo.runall.
    ontology_source = PARAMS['homologues_hpoont']
    ontology_database = PARAMS['db_name']
    ontology_annotator = PipelineGeneInfo.OntologyAnnotation(
        'hpo', ontology_source, ontology_database)
    ontology_annotator.runall(human_symbols)
def AnnotateWithHomologene(infile, outfile):
    '''
    Annotate every gene in allgenes.tsv with homologous gene symbols,
    taken from homologene via mygene.info, either for the species
    listed in the pipeline.ini or for all available species.

    outfile is not referenced inside this function.

    Tables:
    ensemblg2symbol_xxx$geneid - ensemblg in original species to symbol in xxx
    '''
    genes = PipelineGeneInfo.readGeneList(infile)

    # Positional constructor arguments, looked up in the order the
    # HomologeneAnnotation constructor receives them.
    setting_names = ('my_gene_info_source',
                     'db_name',
                     'my_gene_info_homologene',
                     'entrez_host',
                     'entrez_email')
    settings = [PARAMS[name] for name in setting_names]

    annotator = PipelineGeneInfo.HomologeneAnnotation(*settings)
    PipelineGeneInfo.runall(annotator,
                            genes,
                            ['homologene'],
                            species=PARAMS['entrez_host'],
                            submit=True)
def AnnotateWithPathway(infile, outfile):
    '''
    Annotate every gene in allgenes.tsv with pathway details, either
    for all pathway databases available via mygene.info or for those
    specified in the pipeline.ini.

    outfile is not referenced inside this function.

    Tables:
    ensemblg2xxx$annot - ensemblg to ID in pathway database
    xxx$details - pathway database ID to pathway details
    '''
    genes = PipelineGeneInfo.readGeneList(infile)

    # Positional constructor arguments, looked up in the order the
    # PathwayAnnotation constructor receives them.
    setting_names = ('my_gene_info_source',
                     'db_name',
                     'my_gene_info_pathway',
                     'entrez_host')
    settings = [PARAMS[name] for name in setting_names]

    annotator = PipelineGeneInfo.PathwayAnnotation(*settings)
    PipelineGeneInfo.runall(annotator,
                            genes,
                            ['pathway'],
                            species=PARAMS['entrez_host'],
                            submit=True)
def AnnotateWithGO(infile, outfile):
    '''
    Annotate every gene in allgenes.tsv with GO ontology terms using
    information from mygene.info, then fetch the GO hierarchical
    ontology.

    outfile is not referenced inside this function.

    Tables:
    ensemblg2go$annot - ensemblg to go ID
    go$details - go ID to details of go term
    go$ont - go ID to parent go IDs
    '''
    genes = PipelineGeneInfo.readGeneList(infile)

    # GO term annotation, configured with details from mygene.info.
    go_setting_names = ('my_gene_info_source',
                        'db_name',
                        'my_gene_info_go',
                        'entrez_host')
    go_settings = [PARAMS[name] for name in go_setting_names]
    go_annotator = PipelineGeneInfo.GoAnnotation(*go_settings)
    PipelineGeneInfo.runall(go_annotator,
                            genes,
                            ['go'],
                            species=PARAMS['entrez_host'],
                            submit=True)

    # GO hierarchical ontology (from OBO foundry, per the original note).
    ontology_source = PARAMS['my_gene_info_goont']
    ontology_database = PARAMS['db_name']
    ontology_annotator = PipelineGeneInfo.OntologyAnnotation(
        'go', ontology_source, ontology_database)
    PipelineGeneInfo.runall(ontology_annotator,
                            genes,
                            species=PARAMS['entrez_host'],
                            submit=True)
def AnnotateWithHPO(infile, outfile):
    '''
    Annotate genes with human phenotypes provided through HPO at
    humanmine.org, starting from the list of human gene symbols
    generated by the homologene step.

    Duplicate symbols are removed first.  After the phenotype
    annotation, an OntologyAnnotation object for 'hpo' is built and
    run on the same gene list.
    outfile is not referenced inside this function.

    Tables:
    ensemblg2hpo$annot - original host ensemblg to human phenotype ID
    hpo$details - human phenotype ID to human phenotype details
    '''
    all_symbols = PipelineGeneInfo.getSymbols(infile)
    # Collapse repeated symbols; the resulting order is whatever set gives.
    unique_symbols = list(set(all_symbols))

    # Phenotype annotation from humanmine.
    hpo_setting_names = ('homologues_humanmine',
                         'db_name',
                         'entrez_host')
    hpo_settings = [PARAMS[name] for name in hpo_setting_names]
    phenotype_annotator = PipelineGeneInfo.HPOAnnotation(*hpo_settings)
    PipelineGeneInfo.runall(phenotype_annotator, unique_symbols, submit=True)

    # Ontology annotation for HPO.
    ontology_source = PARAMS['homologues_hpoont']
    ontology_database = PARAMS['db_name']
    ontology_annotator = PipelineGeneInfo.OntologyAnnotation(
        'hpo', ontology_source, ontology_database)
    PipelineGeneInfo.runall(ontology_annotator,
                            unique_symbols,
                            species=PARAMS['entrez_host'],
                            submit=True)
def GetAndTranslateAllGenes(outfile):
    '''
    This step is required.

    1. All Entrez gene IDs are downloaded from entrez gene.
    2. Corresponding gene symbols are downloaded from mygene.info
    3. Corresponding ensembl gene, ensembl transcript and ensembl protein
       IDs are downloaded from mygene.info, queried by gene symbol
    4. These are loaded into the database
    5. The gene symbols are written to outfile, one symbol per line
       (e.g. 'allgenes.tsv')

    Tables:
    ensemblg2entrez$geneid - ensemblg to entrez ID
    ensemblg2ensemblt$other - ensemblg to ensembl transcript
    ensemblg2ensemblp$other - ensemblg to ensembl protein
    ensemblg2symbol_xxx$geneid - ensemblg to symbol in species xxx
    '''
    GeneAnnot = PipelineGeneInfo.EntrezGeneAnnotation(PARAMS['db_name'],
                                                      PARAMS['entrez_email'])

    # In test mode count=100 is passed to download_all
    # (presumably limiting how many genes are fetched - confirm in
    # PipelineGeneInfo).
    if PARAMS['test'] == 1:
        entrezgenelist = GeneAnnot.download_all(PARAMS['entrez_host'],
                                                count=100)
    else:
        entrezgenelist = GeneAnnot.download_all(PARAMS['entrez_host'])

    # Generate a SymbolAnnotation object
    Sym = PipelineGeneInfo.SymbolAnnotation(PARAMS['my_gene_info_source'],
                                            PARAMS['db_name'],
                                            PARAMS['entrez_host'],
                                            PARAMS['entrez_sciname'])

    # Get Symbol Annotations
    PipelineGeneInfo.runall(Sym, entrezgenelist, ['symbol'],
                            scope='entrezgene',
                            species=PARAMS['entrez_host'],
                            submit=True)

    # Read the symbols back from entrez2symbol_<host>.tsv in the working
    # directory (presumably written by the symbol annotation step above).
    symbol_table = pd.read_csv(
        "entrez2symbol_%s.tsv" % PARAMS['entrez_host'], sep="\t")
    genesymbols = list(symbol_table['symbol_%s' % PARAMS['entrez_host']])

    # Generate an EnsemblAnnotation object
    Ens = PipelineGeneInfo.EnsemblAnnotation(PARAMS['my_gene_info_source'],
                                             PARAMS['db_name'],
                                             PARAMS['entrez_host'])

    # Get Ensembl annotations
    PipelineGeneInfo.runall(Ens, genesymbols, ['ensembl'],
                            scope="symbol",
                            species=PARAMS['entrez_host'],
                            submit=True)

    # Make output gene list.  The context manager guarantees the handle is
    # closed even if a write fails part-way through.
    with IOTools.open_file(outfile, "w") as outf:
        for gene in genesymbols:
            outf.write("%s\n" % gene)
def MakeSubDBs(infile, outfile):
    '''
    Build a subset database for a user-supplied gene list.

    Takes any lists of genes provided in genesets.dir and makes a
    database in genesetdbs.dir containing only annotations for genes in
    the list.  These will have the same gene ID type as the input lists
    and allow the user to quickly see the annotations for their genes
    of interest.

    The work is delegated to PipelineGeneInfo.MakeSubDBs.
    '''
    subset_type = PARAMS['db_subsettype']
    database = PARAMS['db_name']
    PipelineGeneInfo.MakeSubDBs(infile,
                                outfile,
                                subset_type,
                                database,
                                submit=True)