def _parse_cds_location(location):
    """
    Parse a simple 'start..end' location string.

    @ location - location text taken from a CDS feature line
    @ returns - [start, end] as integers, or None when the text is not a
                plain 'start..end' pair (leading/trailing '<' and '>' are
                stripped from each coordinate before conversion)
    """
    start, sep, end = location.partition('..')
    if not sep:
        return None
    try:
        return [int(start.strip('<>')), int(end.strip('<>'))]
    except ValueError:
        return None


def cdsExt(ID, geneName):
    """
    Returns the sequence record for the input gene refseq ID, with its
    sequence cut down to the CDS range found in the record text.

    The record text is downloaded once; it is scanned line by line for a
    'CDS' feature line and then parsed into a record from the same text.
    If several CDS lines can be parsed, the last one is used.

    @ ID - nucleotide ID passed to Entrez.efetch
    @ geneName - gene name; the part before the first '.' names the log
                 file "Align/<name>.log"
    @ returns - record whose seq is sliced to [start - 1:end] of the CDS
    @ raises - ValueError when no CDS range could be parsed from the record
    """
    from io import StringIO

    handle = Entrez.efetch(db="nucleotide", id=ID, rettype='gb',
                           retmode='text')
    try:
        retdata = handle.read()
    finally:
        handle.close()

    # Append mode: the log file is opened (and therefore created) even when
    # there is nothing to report for this ID.
    with open("Align/" + geneName.split('.')[0] + ".log", "a") as fp:
        if 'LOW QUALITY PROTEIN' in retdata:
            fp.write('%s CDS is of low quality\n' % ID)

    cdsRange = None
    for line in retdata.split('\n'):
        fields = line.split()
        # Only consider lines made of the key 'CDS' followed by one location
        if len(fields) != 2 or fields[0] != 'CDS':
            continue
        parsed = _parse_cds_location(fields[1])
        if parsed is None:
            print(
                "Problem found while extracting cds from %s. Please report this issue to ambuj (at) ufl (dot) edu"
                % line)
            continue
        cdsRange = parsed

    if cdsRange is None:
        raise ValueError("No usable CDS range found in record %s" % ID)

    # Parse the text that was already downloaded instead of fetching again
    record = SeqIO.read(StringIO(retdata), 'genbank')
    record.seq = record.seq[cdsRange[0] - 1:cdsRange[1]]
    return record
def discont():
    """
    Creates a list of discontinued IDs from NCBI gene database

    @ returns - the 'IdList' entry of the parsed search result for the
                term "all[filter] NOT alive[prop]"
    """
    # NOTE(review): the address literal is masked in this copy of the
    # source - confirm the real contact address before use.
    Entrez.email = '*****@*****.**'
    distTerm = "all[filter] NOT alive[prop]"
    handle = Entrez.esearch(db="gene",
                            term=distTerm,
                            rettype='xml',
                            RetMax=10000000,
                            warning=False)
    try:
        records = Entrez.read(handle)
    finally:
        # Release the connection even if parsing fails
        handle.close()
    return records['IdList']
def fetchall(spName, discontId):
    """
    Collect the gene names NCBI lists for a species and save them to
    "Output/<spName>_genes.txt" (one name per line, sorted).

    @ spName - Species name
    @ discontId - list of IDs that are discontinued in NCBI gene database
    @ returns - set of all gene names present in the corresponding species,
                or None when the search returns no IDs
    @ raises - RuntimeError when the initial search request fails
    """
    inpTerm = spName + "[orgn]"
    # NOTE(review): the address literal is masked in this copy of the
    # source - confirm the real contact address before use.
    Entrez.email = '*****@*****.**'

    try:
        handle = Entrez.esearch(db="gene",
                                term=inpTerm,
                                rettype='xml',
                                RetMax=1000000)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate
        raise RuntimeError(
            "Failed to import sequence from NCBI. Check your internet connection.\nThis might also occur due to NCBI failure"
        )
    try:
        records = Entrez.read(handle)
    finally:
        handle.close()

    idList = records['IdList']
    if is_empty(idList):
        print("No gene record available for %s" % spName)
        return None

    print("Filtering discontinued gene IDs in %s" % spName)
    # Build the lookup set once; testing against the list is O(len) per ID
    discontinued = set(discontId)
    goodIds = [i for i in idList if i not in discontinued]

    idNameList = set()
    for idName in goodIds:
        try:
            handle = Entrez.efetch(db='gene',
                                   id=idName,
                                   retmode='text',
                                   rettype='brief')
            try:
                annot = handle.read()
            finally:
                handle.close()
            # The gene name is taken as the second space-separated token
            geneName = annot.split(" ")[1]
        except Exception:
            # Best effort: a failing ID is reported and left out
            print("%s skipped" % idName)
            continue
        print("Scanning %s %s gene" % (spName, geneName))
        idNameList.add(geneName)

    with open("Output/" + spName + "_genes.txt", 'w') as fp:
        # Sorted so the file content is reproducible between runs
        for gene in sorted(idNameList):
            fp.write("%s\n" % gene)
    return idNameList
def mrnaExt(ID):
    """
    Extract sequence record for the given sequence ID

    @ ID - nucleotide ID passed to Entrez.efetch
    @ returns - the single record parsed from the fetched 'genbank' data
    """
    recData = Entrez.efetch(db="nucleotide",
                            id=ID,
                            rettype="gb",
                            warning=False)
    try:
        return SeqIO.read(recData, 'genbank')
    finally:
        # Release the connection even if parsing fails
        recData.close()
def _fetch_Species(inpTerm=None):
    """
    Extracts species ID from NCBI genome database

    @ inpTerm - search term; when None the term
                "Eucaryotes[orgn] NOT Vertebrates[orgn]" is used
    @ returns - the 'IdList' entry of the parsed search result
    @ raises - RuntimeError when the search request fails
    """
    if inpTerm is None:
        inpTerm = "Eucaryotes[orgn] NOT Vertebrates[orgn]"

    try:
        handle = Entrez.esearch(db="genome",
                                term=inpTerm,
                                rettype='xml',
                                RetMax=10000)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate
        raise RuntimeError(
            "Failed to import sequence from NCBI. Check your internet connection.\nThis might also occur due to NCBI failure"
        )
    try:
        records = Entrez.read(handle)
    finally:
        handle.close()
    return records['IdList']
def oneGeneCdsImport(geneName, group):
    """
    Import the longest CDS record for one gene of one organism.

    The first ID returned by the gene search is handed to _xmlcreate, the
    IDs returned by _xmlparser are each fetched through cdsExt, and the
    record with the longest sequence is kept (the first one on ties).

    @ geneName - name of the gene
    @ group - organism name
    @ returns - the longest CDS sequence for the corresponding taxa
    @ raises - RuntimeError when the search request fails;
               IndexError when the search or _xmlparser yields no IDs
    """
    inpTerm = geneName + "[sym] AND " + group + "[orgn]"
    # NOTE(review): the address literal is masked in this copy of the
    # source - confirm the real contact address before use.
    Entrez.email = '*****@*****.**'
    print("Importing %s %s gene CDS sequence" % (group, geneName))

    try:
        handle = Entrez.esearch(db="gene",
                                term=inpTerm,
                                rettype='xml',
                                RetMax=300)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate
        raise RuntimeError(
            "Failed to import sequence from NCBI. Check your internet connection.\nThis might also occur due to NCBI failure"
        )
    try:
        records = Entrez.read(handle)
    finally:
        handle.close()

    idList = records["IdList"]
    if not idList:
        raise IndexError("No gene record found for %s in %s" %
                         (geneName, group))

    _xmlcreate(idList[0])
    refIds = _xmlparser()
    os.remove('export.xml')

    recordList = [cdsExt(inIDs, geneName) for inIDs in refIds]
    if not recordList:
        raise IndexError("No sequence IDs found for %s in %s" %
                         (geneName, group))

    # max() returns the first maximal element, matching a strict '>' scan
    longestRec = max(recordList, key=lambda rec: len(rec.seq))
    # NOTE(review): the assignment target was truncated in this copy of the
    # source (only "= geneName" survives); record id is assumed - confirm.
    longestRec.id = geneName
    return longestRec
def _taxon_fasta_header(line):
    """
    Rewrite one FASTA header line as '>' + word_a + '_' + word_b + '|' + ID.

    ID is the first space-separated word without its leading '>'. word_a and
    word_b are the two words after the ID, shifted by one extra word when
    the line contains 'PREDICTED'. A header with too few words is returned
    unchanged.
    """
    # Drop the newline first so it cannot end up inside the new header
    words = line.rstrip('\n').split(' ')
    first = 2 if 'PREDICTED' in line else 1
    if len(words) <= first + 1:
        return line
    return ('>' + words[first] + '_' + words[first + 1] + '|' +
            words[0].lstrip('>') + '\n')


def mrnaImport(geneName, group, ortho):
    """
    Creates "Align/<geneName>.fas" holding, for each gene ID in the
    ortholog group of the searched gene, its longest mRNA record, with
    FASTA headers rewritten by _taxon_fasta_header.

    @ geneName - name of the gene (also names the output file)
    @ group - organism name
    @ ortho - gene symbol to search instead of geneName; None to use geneName
    @ returns - None; the result is the file written to disk
    @ raises - ValueError when group is None;
               RuntimeError when the first search request fails;
               IndexError when the first search returns no IDs
    """
    if group is None:
        # Without this check the query term was never built and the error
        # surfaced later as a misleading connection failure.
        raise ValueError("An organism name (group) is required")
    symbol = ortho if ortho is not None else geneName
    inpTerm = symbol + "[sym] AND " + group + "[orgn]"
    # NOTE(review): the address literal is masked in this copy of the
    # source - confirm the real contact address before use.
    Entrez.email = '*****@*****.**'

    try:
        handle = Entrez.esearch(db="gene",
                                term=inpTerm,
                                rettype='xml',
                                RetMax=300)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate
        raise RuntimeError(
            "Failed to import sequence from NCBI. Check your internet connection.\nThis might also occur due to NCBI failure"
        )
    try:
        records = Entrez.read(handle)
    finally:
        handle.close()

    idList = records["IdList"]
    if not idList:
        raise IndexError("No gene record found for %s in %s" %
                         (symbol, group))

    # Second search: every gene ID in the ortholog group of the first hit
    inpTerm = "ortholog_gene_" + str(idList[0]) + "[group]"
    handle = Entrez.esearch(db="gene",
                            term=inpTerm,
                            rettype='xml',
                            RetMax=300,
                            warning=False)
    try:
        records = Entrez.read(handle)
    finally:
        handle.close()
    idList = records["IdList"]

    outRecord = []
    for ids in idList:
        _xmlcreate(ids)
        refIds = _xmlparser()
        os.remove('export.xml')
        recordList = [mrnaExt(inIDs) for inIDs in refIds]
        if not recordList:
            continue
        # max() returns the first maximal element, matching a strict '>' scan
        longestRec = max(recordList, key=lambda rec: len(rec.seq))
        print("%s" % longestRec.description)
        outRecord.append(longestRec)

    outPath = "Align/" + geneName + '.fas'
    with open(outPath, 'w') as fp:
        SeqIO.write(outRecord, fp, 'fasta')

    # Read the file back and rewrite only the header lines
    with open(outPath, 'r') as fp:
        fdata = fp.readlines()
    with open(outPath, 'w') as fp:
        for line in fdata:
            if '>' in line:
                fp.write(_taxon_fasta_header(line))
            else:
                fp.write('%s' % line)