def search_articles(searchterm, email):
    '''search_articles
    Return a list of articles based on a search term

    Parameters
    ==========
    searchterm: str
        a search term to search for
    email: str
        contact email to report to NCBI Entrez

    Returns
    =======
    articles: Article objects
        a list of articles that match the search term
    '''
    print "Getting pubmed articles for search term %s" % (searchterm)
    Entrez.email = email
    handle = Entrez.esearch(db='pubmed', term=searchterm, retmax=5000)
    record = Entrez.read(handle)

    # If there are papers
    if "IdList" in record:
        if record["Count"] != "0":
            # Fetch the papers
            ids = record['IdList']
            handle = Entrez.efetch(db='pubmed', id=ids, retmode='xml',
                                   retmax=5000)
            return Entrez.read(handle)
    # If there are no papers
    else:
        print "No papers found for searchterm %s!" % (searchterm)
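# Usage sketch (illustrative, not part of the original module): the email is a
# placeholder, and the shape of the parsed efetch payload varies between
# Biopython versions (dict keyed by 'PubmedArticle' vs. a plain list).
if __name__ == "__main__":
    results = search_articles("Opuntia[ORGN]", "you@example.org")
    if results:
        print "Fetched %d top-level PubMed records" % len(results)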
def store_abstracts_for_query(query, query_tag, maxN=None, preview_only=False):
    # if query_tag == "":
    #     simpleQuery = " ".join(map(lambda x: x.name, queryTerms))
    # else:
    #     simpleQuery = query_tag
    # query = pg.build_query(queryTerms)
    print "will search", query
    Entrez.email = "*****@*****.**"
    search_results = Entrez.read(
        Entrez.esearch(db="pubmed", term=query, reldate=10 * 365,
                       datetype="pdat", usehistory="y"))
    count = int(search_results["Count"])
    print "Found %i results" % count
    if maxN is not None and maxN < count:
        count = maxN
        print "Only keeping first", count, "abstracts"
    if preview_only:
        return
    sys.stdout.flush()
    batch_size = 50
    for start in range(0, count, batch_size):
        end = min(count, start + batch_size)
        print "Going to download record %i to %i" % (start + 1, end)
        sys.stdout.flush()
        fetch_handle = Entrez.efetch(db="pubmed", rettype="medline",
                                     retmode="text", retstart=start,
                                     retmax=batch_size,
                                     webenv=search_results["WebEnv"],
                                     query_key=search_results["QueryKey"])
        records = Medline.parse(fetch_handle)
        for r in records:
            pubmed_to_pg.store_medline_entry(r, query_tag)
def uniprotIDlist_Lineages(IdList):
    lastset = []
    taxlist = []
    names = []
    Lineages = []
    genes = {}
    try:
        genes = uni.map(list(IdList), f='ACC', t='P_ENTREZGENEID')
        # handle = Entrez.efetch(db="nuccore", id=list(lastset), retmode='xml')
        if len(genes) > 0:
            # values = genes.values()
            # handle = Entrez.elink(dbfrom="nucleotide", id=values,
            #                       linkname="gene_taxonomy")
            # taxa = Entrez.read(handle, validate=False)
            # for i, entry in enumerate(taxa):
            #     taxlist.append(entry['LinkSetDb'][0]['Link'][0]['Id'])
            for Uniprot in genes:
                genes[Uniprot] = list(genes[Uniprot])
                # if entry['IdList'][0] in genes[Uniprot]:
                #     genes[Uniprot].append(entry['LinkSetDb'][0]['Link'][0]['Id'])
    except:
        pass
    codes = getTax(IdList)
    print len(codes)
    for code in codes.keys():
        if code not in genes:
            genes[code] = ['noGeneID', codes[code]]
        else:
            genes[code].append(codes[code])
        taxlist.append(codes[code])
    print len(taxlist)
    taxonomydata = Entrez.read(Entrez.efetch(db="taxonomy", id=taxlist))
    for i, record in enumerate(taxonomydata):
        name = record['ScientificName']
        Lineage = record['Lineage']
        print record
        for gene in genes:
            if len(genes[gene]) > 1 and genes[gene][1] == taxlist[i]:
                genes[gene].append(name)
                genes[gene].append(Lineage)
        names.append(name)
        Lineages.append(Lineage)
    # annotate the availability of a genome and the reference genomes available
    handle = Entrez.elink(dbfrom="taxonomy", id=taxlist,
                          linkname="taxonomy_genome")
    genomes = Entrez.read(handle, validate=False)
    for i, entry in enumerate(genomes):
        for protcode in genes:
            genomefound = False
            if entry['IdList'][0] in genes[protcode]:
                try:
                    genes[protcode].append(entry['LinkSetDb'][0]['Link'][0]['Id'])
                    genomefound = True
                    break
                except:
                    print entry
            if genomefound == False:
                genes[protcode].append('noGenome')
    return names, Lineages, genes
def taxonIDs_to_strainIDs(taxonIDs, hasgenome=True, donotexpand=False,
                          excludewgs=False):
    '''Find all individual strains in a taxonID'''
    strainIDs = []
    for taxonID in taxonIDs:
        # see http://www.ncbi.nlm.nih.gov/books/NBK21100/#A286 for some
        # details about the terms and limits
        query = 'txid%s[subtree] AND "taxonomy genome"[filter] AND terminal[prop]' % taxonID
        if hasgenome:
            query = query + ' AND (("taxonomy assembly"[Filter]) OR ("taxonomy genome"[Filter]) OR ("taxonomy genome2"[Filter]))'
        if excludewgs:
            query = query + ' AND NOT wgs[filter]'
        sys.stderr.write("Querying NCBI with query: %s\n" % (query))
        # The default retmax is 20, which is far too few for some queries...
        NCBIdata = Entrez.read(Entrez.esearch(db="taxonomy", term=query,
                                              retmax=10000))
        # there can be multiple records returned
        taxon_strainIDs = NCBIdata['IdList']
        if len(taxon_strainIDs) < 1:
            sys.stderr.write("WARNING: No sequence information of the type you requested for %s\n" % taxonID)
        if donotexpand:
            assert len(taxon_strainIDs) == 1, \
                "WARNING: more than one strain ID returned from query of NCBI %s" % taxonID
        else:
            if len(taxon_strainIDs) > 1:
                # only need to let the user know if there is more than one;
                # otherwise the numbers are the same
                sys.stderr.write("Note: multiple strains associated with taxonID %s: %s\n" % (taxonID, taxon_strainIDs))
        strainIDs = strainIDs + taxon_strainIDs
    return strainIDs
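# Usage sketch (illustrative): 562 is the NCBI taxonomy ID for Escherichia
# coli; any list of taxids works. Assumes Entrez.email is set elsewhere, as
# the function itself does not set it.
if __name__ == "__main__":
    strains = taxonIDs_to_strainIDs(["562"], hasgenome=True)
    sys.stderr.write("Found %d strain-level taxids\n" % len(strains))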
def add_new_taxonomy(server, new_taxons, parent_ncbi_tax_id):
    new_taxons = map(lambda x: x.split(":"), new_taxons)
    parent_taxid = None
    parent_left_value = None
    parent_right_value = None
    if parent_ncbi_tax_id:
        try:
            parent_taxid, parent_left_value, parent_right_value = \
                server.adaptor.execute_one(
                    'select taxon_id, left_value, right_value '
                    'from taxon where ncbi_taxon_id = %s',
                    (parent_ncbi_tax_id,))
        except AssertionError:
            # the given NCBI taxonomy id isn't currently in the database;
            # download it using the Entrez API
            db_loader = Loader.DatabaseLoader(server.adaptor,
                                              list(server.values())[0], True)
            handle = Entrez.efetch(db="taxonomy", id=str(parent_ncbi_tax_id),
                                   retmode="XML")
            taxon_record = Entrez.read(handle)
            taxon_record[0]['LineageEx'].append(
                {'Rank': taxon_record[0]['Rank'],
                 'ScientificName': taxon_record[0]['ScientificName'],
                 'TaxId': taxon_record[0]['TaxId']})
            parent_taxid, parent_left_value, parent_right_value = \
                db_loader._get_taxon_id_from_ncbi_lineage(
                    taxon_record[0]["LineageEx"])
    for tax_name, tax_rank in new_taxons:
        parent_taxid, parent_left_value, parent_right_value = \
            insert_taxon_rank(server, parent_taxid, parent_left_value,
                              parent_right_value, tax_name, tax_rank)
    return parent_taxid
def history_review():
    """Use the Entrez history server to search for and download reviews."""
    Entrez.email = "*****@*****.**"
    search_results = Entrez.read(
        Entrez.esearch(db="pubmed", term="Opuntia[ORGN]", reldate=365,
                       datetype="pdat", usehistory="y"))
    count = int(search_results["Count"])
    print "Found %i results" % count
    batch_size = 10
    filepath = os.path.join(DATAROOT, "recent_orchid_papers.txt")
    out_handle = open(filepath, "w")
    for start in range(0, count, batch_size):
        end = min(count, start + batch_size)
        print "Going to download record %i to %i" % (start + 1, end)
        fetch_handle = Entrez.efetch(db="pubmed", rettype="medline",
                                     retmode="text", retstart=start,
                                     retmax=batch_size,
                                     webenv=search_results["WebEnv"],
                                     query_key=search_results["QueryKey"])
        data = fetch_handle.read()
        fetch_handle.close()
        out_handle.write(data)
    out_handle.close()
def get_publications(parameters, hit_ids):
    '''Get all publication details for a set of PubMed IDs.'''
    id_errors = []
    out_file = open(parameters["out_file"], "w")
    out_file.write("PMID\tAuthor\tTitle\tJournal\n")
    counter = 0
    print "4. Getting publication details for all hits"
    for species, ids in hit_ids.items():
        counter += 1
        progress = int(50.0 * counter / len(hit_ids)) * "#"
        progress += (50 - len(progress)) * " "
        sys.stderr.write("\r" + "0%" + progress + "100%")
        for single_id in ids:
            error_counter = 0
            while error_counter < 3:
                try:
                    handle = Entrez.efetch(db=parameters["database"],
                                           id=single_id, retmode="xml")
                    result = Entrez.read(handle)
                    out_line = single_id + "\t"
                    out_line += result[0]["MedlineCitation"]["Article"]["AuthorList"][0]["LastName"] + "\t"
                    out_line += result[0]["MedlineCitation"]["Article"]["ArticleTitle"] + "\t"
                    out_line += result[0]["MedlineCitation"]["Article"]["Journal"]["Title"] + "\n"
                    out_file.write(out_line.encode('UTF-8'))
                    error_counter = 5  # success: leave the retry loop
                    handle.close()
                except:
                    error_counter += 1
                    if error_counter == 3:
                        print "\nThere was a problem fetching the publication " + str(single_id)
                        id_errors.append(single_id)
    print "\n5. Got all publications for all hits"
    return id_errors
def pubmed():
    # Get the count of papers about orchids in the pubmed database only
    Entrez.email = "*****@*****.**"  # Always tell NCBI who you are
    handle = Entrez.egquery(term="orchid")
    record = Entrez.read(handle)
    for row in record["eGQueryResult"]:
        if row["DbName"] == "pubmed":
            print "The count of papers about orchid in database pubmed:", row["Count"]

    # Get the list of ids for the above
    handle = Entrez.esearch(db="pubmed", term="orchid", retmax=100)
    record = Entrez.read(handle)
    idlist = record["IdList"]
    print "The id list of papers about orchid in database pubmed:", idlist
    print

    # Search for papers authored by "Liu ZJ" in pubmed
    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline",
                           retmode="text")
    records = Medline.parse(handle)
    search_author = "Liu ZJ"
    for record in records:
        if "AU" not in record:
            continue
        if search_author in record["AU"]:
            print "Author %s found." % search_author
            print "title:", record.get("TI", "?")
            print "authors:", record.get("AU", "?")
            print "source:", record.get("SO", "?")
            print
def history_seq():
    """Use the Entrez history server to search for and download sequences."""
    Entrez.email = "*****@*****.**"
    search_handle = Entrez.esearch(db="nucleotide",
                                   term="Opuntia[orgn] and rpl16",
                                   usehistory="y")
    search_results = Entrez.read(search_handle)
    search_handle.close()
    gi_list = search_results["IdList"]
    count = int(search_results["Count"])
    assert count == len(gi_list)
    # Two extra pieces of information: the WebEnv session cookie and the QueryKey
    webenv = search_results["WebEnv"]
    query_key = search_results["QueryKey"]
    batch_size = 3
    filepath = os.path.join(DATAROOT, "orchid_rpl16.fasta")
    out_handle = open(filepath, "w")
    for start in range(0, count, batch_size):
        end = min(count, start + batch_size)
        print "Going to download record %i to %i" % (start + 1, end)
        fetch_handle = Entrez.efetch(db="nucleotide", rettype="fasta",
                                     retmode="text", retstart=start,
                                     retmax=batch_size, webenv=webenv,
                                     query_key=query_key)
        data = fetch_handle.read()
        fetch_handle.close()
        out_handle.write(data)
    out_handle.close()
def get_geo_data(s_terms, email, maxresults=1000): """Get geo data for query term provided""" Entrez.email = email Entrez.tool = 'GetGeoMetaDataPythonScript' hits = [] s_terms = s_terms + ' NOT GPL' handle = Entrez.esearch(db='gds', term=s_terms, retmax=maxresults, usehistory="y") results = Entrez.read(handle) newhandle = Entrez.esummary(db='gds', retmax = maxresults, webenv=results['WebEnv'], query_key=results['QueryKey']) summary = Entrez.read(newhandle) for i in range(len(summary)): samples = [] geo_data = {'id':summary[i]['Id'], 'n_samples':summary[i]['n_samples'], 'pubdate':summary[i]['PDAT'], 'platform':summary[i]['PlatformTitle'], 'suppfile':summary[i]['suppFile'], 'taxon':summary[i]['taxon'], 'entry_type':summary[i]['entryType'], 'gpl':summary[i]['GPL'], 'gse':summary[i]['GSE'], 'pubmed_ids':summary[i]['PubMedIds'], 'title':summary[i]['title'], 'gds_type':summary[i]['gdsType'], 'summary':summary[i]['summary'], 'soft_file':get_soft_url(summary[i]['GSE'], summary[i]['entryType']) } hits.append(geo_data) return hits
def get_genbank_identifiers(tax_identifier):
    genbank_entries = Entrez.esearch(db="nucleotide",
                                     term="txid" + tax_identifier,
                                     retmode="xml")
    esearch_result = Entrez.read(genbank_entries)
    genbank_identifiers = esearch_result["IdList"]
    return genbank_identifiers
def get_articles_in_range_by_author(self, author, range_start, range_end='',
                                    terms=''):
    """
    Return all article IDs for articles a given 'author' published
    between 'range_start' and 'range_end'.
    Optionally, additional terms may be provided.
    """
    # if there's no range_end set, default it to the current year/month
    if range_end == '':
        range_end = "%s/%s" % (str(datetime.today().year),
                               str(datetime.today().month))
    E.email = self.email
    term = "%s [AU] %s:%s[DP] %s" % (author, range_start, range_end, terms)
    handle = E.esearch(db="pubmed", term=term)
    record = E.read(handle)
    handle.close()
    articles = []
    for article_id in record['IdList']:
        if article_id not in articles:
            articles.append(article_id)
    print "got %d articles" % len(articles)
    return articles
def get_many_prot_seqrec_by_gis(gi_list):
    """Download a dictionary of FASTA SeqRecords from NCBI given a list of GIs."""
    print("Downloading FASTA SeqRecords by GIs from NCBI")
    num = len(gi_list)
    fasta_seqrec = dict()
    for i in range(int(num / 1000) + 1):
        print("Fetching %d th thousand from %d" % (i, num))
        while True:
            try:
                strn = ",".join(map(str, gi_list)[i * 1000:(i + 1) * 1000])
                request = Entrez.epost(db="protein", id=strn)
                result = Entrez.read(request)
                webEnv = result["WebEnv"]
                queryKey = result["QueryKey"]
                handle = Entrez.efetch(db="protein", rettype='fasta',
                                       retmode='text', webenv=webEnv,
                                       query_key=queryKey)
                for r in SeqIO.parse(handle, 'fasta'):
                    fasta_seqrec[r.id.split('|')[1]] = r
            except:
                continue
            if ((len(fasta_seqrec) == (i + 1) * 1000) or
                    (len(fasta_seqrec) == num)):
                break
            else:
                print("Mismatch: %d %d" % (num, len(fasta_seqrec)))
    print("FASTA Records downloaded:")
    print(len(fasta_seqrec))
    return fasta_seqrec
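# Minimal sketch of the epost/WebEnv pattern used above, for a short GI list;
# the function name and identifiers are illustrative only, and Entrez/SeqIO
# are assumed to be imported as in the function above.
def fetch_few_proteins(gis):
    # Post the IDs to the history server, then fetch them in one request
    post = Entrez.read(Entrez.epost(db="protein", id=",".join(map(str, gis))))
    handle = Entrez.efetch(db="protein", rettype="fasta", retmode="text",
                           webenv=post["WebEnv"], query_key=post["QueryKey"])
    return list(SeqIO.parse(handle, "fasta"))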
def pubsearch(jids):
    Entrez.email = "*****@*****.**"  # always let Entrez know who is calling
    # OR-join the journal IDs into a single [JID] clause
    pubterm = " OR ".join(i + "[JID]" for i in jids)
    IDhandle = Entrez.esearch(db="pubmed",
                              term="peptide AND (" + pubterm + ")",
                              mindate="2011", maxdate="2014", retmax=2500)
    # for documentation on esearch, see
    # http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
    # The max value for retmax is 100k; use retstart to get more than this.
    # mindate and maxdate must be used together to specify an arbitrary date
    # range; the general date format is YYYY/MM/DD, and the variants YYYY
    # and YYYY/MM are also allowed.
    record = Entrez.read(IDhandle)
    # record is returned as a dictionary listing search terms, all ID numbers, etc.
    idlist = record["IdList"]  # list of ID numbers from the record dictionary
    recordHandle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline",
                                 retmode="text")  # fetch records for idlist
    records = Medline.parse(recordHandle)  # create dictionaries from recordHandle
    return records
def get_abstract(query, file_name, previewOnly=False):
    Entrez.email = "*****@*****.**"
    search_results = Entrez.read(
        Entrez.esearch(db="pubmed", term=query,
                       # reldate=1996, datetype="pdat",
                       reldate=20 * 365, datetype="pdat",  # reldate is in days!
                       usehistory="y"))
    count = int(search_results["Count"])
    print "Found %i results" % count
    sys.stdout.flush()
    if previewOnly:
        return
    batch_size = 10
    out_handle = open(file_name + ".txt", "w")
    for start in range(0, count, batch_size):
        end = min(count, start + batch_size)
        print "Going to download record %i to %i" % (start + 1, end)
        sys.stdout.flush()
        fetch_handle = Entrez.efetch(db="pubmed", rettype="medline",
                                     retmode="text", retstart=start,
                                     retmax=batch_size,
                                     webenv=search_results["WebEnv"],
                                     query_key=search_results["QueryKey"])
        data = fetch_handle.read()
        fetch_handle.close()
        out_handle.write(data)
    out_handle.close()
def test_efetch_biosystems_xml(self):
    """Test Entrez parser with XML from biosystems."""
    handle = Entrez.efetch(id="1134002", db="biosystems", retmode="xml")
    records = list(Entrez.parse(handle))
    handle.close()
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0]['System_sysid']['Sys-id']['Sys-id_bsid'],
                     '1134002')
def test_webenv_search(self):
    """Test Entrez.esearch from an elink WebEnv history."""
    handle = Entrez.elink(db='nucleotide', dbfrom='protein',
                          id='22347800,48526535', webenv=None, query_key=None,
                          cmd='neighbor_history')
    self.assertTrue(handle.url.startswith(URL_HEAD + "elink.fcgi?"),
                    handle.url)
    self.assertIn(URL_TOOL, handle.url)
    self.assertIn(URL_EMAIL, handle.url)
    self.assertIn(URL_API_KEY, handle.url)
    self.assertIn("id=22347800%2C48526535", handle.url)
    recs = Entrez.read(handle)
    handle.close()
    record = recs.pop()
    webenv = record['WebEnv']
    query_key = record['LinkSetDbHistory'][0]['QueryKey']
    handle = Entrez.esearch(db='nucleotide', term=None, retstart=0, retmax=10,
                            webenv=webenv, query_key=query_key, usehistory='y')
    self.assertTrue(handle.url.startswith(URL_HEAD + "esearch.fcgi?"),
                    handle.url)
    self.assertIn(URL_TOOL, handle.url)
    self.assertIn(URL_EMAIL, handle.url)
    self.assertIn(URL_API_KEY, handle.url)
    search_record = Entrez.read(handle)
    handle.close()
    self.assertEqual(2, len(search_record['IdList']))
def fetch_DNA_seqs(terms, maxn=10000, batchsize=1000):
    """
    terms: sequence of search terms, quoted appropriately, with Entrez
        specifiers, e.g. ['"Mus musculus"[organism]']
    maxn: maximum number of sequences to return

    Returns a list of SeqRecord objects.
    """
    global email
    assert email, "set email!"
    Entrez.email = email
    h = Entrez.esearch(db="nucleotide", term=" OR ".join(terms),
                       usehistory="y")
    d = Entrez.read(h)
    env = d['WebEnv']
    key = d['QueryKey']
    N = int(d['Count'])
    if maxn:
        N = min(N, maxn)
    logging.info('fetching %s sequences', N)
    retstart = 0
    seqs = []
    n = 0
    while n < N:
        h = Entrez.efetch(db="nucleotide", rettype='gb', webenv=env,
                          query_key=key, retstart=retstart, retmax=batchsize)
        v = list(SeqIO.parse(h, "genbank"))
        n += len(v)
        logging.info('...fetched %s', n)
        seqs.extend(v)
        retstart += batchsize
    logging.info('...done')
    return seqs
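# Usage sketch (illustrative): the module-level `email` must be set first,
# as the function asserts on it; the address and query are placeholders.
if __name__ == "__main__":
    email = "you@example.org"
    seqs = fetch_DNA_seqs(['"Mus musculus"[organism] AND rpl16[gene]'], maxn=5)
    logging.info('retrieved %s GenBank records', len(seqs))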
def gi2webenv(gilist):
    h = Entrez.esearch(db="nucleotide", term=" OR ".join(gilist),
                       usehistory="y", retmax=len(gilist))
    d = Entrez.read(h)
    return d["WebEnv"], d["QueryKey"]
def fetch_aclist(aclist, batchsize=1000):
    global email
    assert email, "set email!"
    Entrez.email = email
    results = {}
    n = 0
    for v in batch(aclist, batchsize):
        v = list(v)
        h = Entrez.esearch(db="nucleotide",
                           term=" OR ".join(["%s[ACCN]" % x for x in v]),
                           usehistory="y")
        d = Entrez.read(h)
        h.close()
        h = Entrez.efetch(db="nucleotide", rettype="gb", retmax=len(v),
                          webenv=d["WebEnv"], query_key=d["QueryKey"])
        seqs = SeqIO.parse(h, "genbank")
        for s in seqs:
            try:
                ac = s.annotations["accessions"][0]
                if ac in aclist:
                    results[ac] = s
            except:
                pass
        h.close()
        n += len(v)
        logging.info('fetched %s sequences', n)
    return results
def fetchtax(taxid):
    global email
    assert email, "set email!"
    Entrez.email = email
    n = 1
    if not isinstance(taxid, int):
        # string, possibly with multiple values?
        try:
            taxid = taxid.strip()
            n = taxid.count(',') + 1
        except AttributeError:
            # iterable of values?
            try:
                n = len(taxid)
                taxid = ','.join(map(str, taxid))
            except TypeError:
                pass
    else:
        taxid = str(taxid)
    h = Entrez.efetch(db='taxonomy', id=taxid, retmode='xml', retmax=n)
    if n == 1:
        r = Entrez.read(h)[0]
    else:
        # a list of taxonomy results in the same order as the taxids
        r = Entrez.read(h)
    return r
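# Usage sketch (illustrative): fetchtax() accepts a single taxid or an
# iterable. 9606 is the NCBI taxonomy ID for Homo sapiens; the email is a
# placeholder for the module-level global the function asserts on.
if __name__ == "__main__":
    email = "you@example.org"
    rec = fetchtax(9606)
    print rec['ScientificName'], '->', rec['Lineage']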
def main(Substance, Organism, Gene):
    zoekterm1 = "Cocaine"
    zoekterm2 = "Elegans"
    MAX_COUNT = 50
    dic = {}
    titels = []
    TERM = ''
    TERMS = []
    count = 1
    if zoekterm2 == "":
        TERM = zoekterm1
        if zoekterm1 == "":
            print("enter a search term")
            sys.exit()
    elif zoekterm2 != "":
        TERM = zoekterm1 + " and " + zoekterm2
    TERMS.append(TERM)
    print(TERM)
    handle = Entrez.esearch(db="pubmed", term=TERM, retmax=MAX_COUNT)
    record = Entrez.read(handle)
    idlist = record["IdList"]
    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline",
                           retmode="text")
    records = Medline.parse(handle)
    records = list(records)
    for record in records:
        titel = record.get("PMID", "?")
        titels.append(titel)
    pubSet = set(titels)
    dic[TERM] = pubSet
    print(dic)
    return "Jay"
def __enter__(self):
    try:
        self.file = tempfile.NamedTemporaryFile()
        # efetch the ids in batches of at most 5000 per request
        for i in range(int(math.ceil(float(len(self.nuc_seq_ids)) / 5000))):
            start = i * 5000
            remaining = len(self.nuc_seq_ids) - start
            end = start + remaining if remaining < 5000 else start + 5000
            fetch_handle = Entrez.efetch(db="nucleotide",
                                         id=self.nuc_seq_ids[start:end],
                                         rettype="fasta", retmode="text")
            for line in fetch_handle:
                self.file.write(line)
            fetch_handle.close()
        self.file.seek(0)
    except IOError:
        raise TaskError("Could not download sequence from NCBI")
    self.filepath = self.file.name
    return self
def searchgb(stermList, mindate):
    usedID = []
    out = open('seq_searchdb.fas', 'a')
    print 'searching genbank'
    for sterm in stermList:
        searchTerm = ('(' + sterm + '[Organism] OR ' + sterm +
                      '[All Fields]) AND ("' + str(mindate) + ' "[MDAT] : "' +
                      str(date.today()) + ' "[MDAT])')
        handle = Entrez.esearch(db="nucleotide", retmax=10000, term=searchTerm)
        record = Entrez.read(handle)
        print record["Count"]
        for ID in record["IdList"]:
            if ID not in usedID:
                usedID.append(ID)
                handle = Entrez.efetch(db="nucleotide", id=ID, rettype="fasta")
                record = SeqIO.read(handle, "fasta")
                if re.search('18S', record.description):
                    if re.search('5.8S', record.description):
                        print "5.8S: %s" % record.description
                    else:
                        out.write('>' + record.id.split('|')[3] + '\n')
                        out.write(str(record.seq) + '\n')
                elif re.search('small subunit', record.description):
                    if re.search('5.8S', record.description):
                        print "5.8S: %s" % record.description
                    else:
                        out.write('>' + record.id.split('|')[3] + '\n')
                        out.write(str(record.seq) + '\n')
    out.close()
def main():
    args = parser_get_args()
    with open(args.filepath, 'r') as species_names:
        species_list = species_names.read().splitlines()
    for species in species_list:
        time.sleep(1)
        Entrez.email = "*****@*****.**"
        search_query = Entrez.esearch(db="taxonomy", term=species,
                                      retmode="xml")
        result = Entrez.read(search_query)
        species_id = result['IdList']
        species_id = "txid{}".format(species_id[0])
        taxonomy_data = Entrez.efetch(db="taxonomy", id=species_id,
                                      retmode="xml")
        data = Entrez.read(taxonomy_data)
        lineageex = data[0]["LineageEx"]
        species_dict = {}
        species_dict['Species'] = species
        superclass_dict = get_superclass(lineageex)
        class_dict = get_class(lineageex)
        subclass_dict = get_subclass(lineageex)
        infraclass_dict = get_infraclass(lineageex)
        superorder_dict = get_superorder(lineageex)
        order_dict = get_order(lineageex)
        superfamily_dict = get_superfamily(lineageex)
        family_dict = get_family(lineageex)
        genus_dict = get_genus(lineageex)
        with open(args.outputfile, 'a') as output_file:
            output_file.write(str((species_dict, superclass_dict, class_dict,
                                   subclass_dict, infraclass_dict,
                                   superorder_dict, order_dict,
                                   superfamily_dict, family_dict,
                                   genus_dict)))
            output_file.write('\n\n')
def pmhits(TERM):
    # Return the number of hits from searching PubMed for *TERM*
    Entrez.email = '*****@*****.**'
    Entrez.tool = 'pm_impacts'
    h = Entrez.esearch(db='pubmed', retmax=1000000, term=TERM)
    result = Entrez.read(h)
    return result['Count']
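# Usage sketch (placeholder query): note that esearch returns the Count
# field as a string, not an integer.
if __name__ == "__main__":
    print "CRISPR hits in PubMed:", pmhits("CRISPR")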
def gen_paper(identifier, num_to_gen, author):
    '''Create objects from a paper's metadata.'''
    # import modules
    from Bio import Entrez
    Entrez.email = "*****@*****.**"
    from classes.paper import paper

    # Search PubMed for a given author, returning at most num_to_gen results
    handle = Entrez.esearch(db="pubmed", retmax=num_to_gen,
                            term=author + " [AU]")
    results = Entrez.read(handle)
    IDs = results['IdList']
    handle.close()

    handle = Entrez.esummary(db="pubmed", id=IDs[identifier])
    record = Entrez.read(handle)
    Title = (record[0]['Title']).encode('utf-8')
    PubDate = (record[0]['PubDate']).encode('utf-8')
    AuthorList = (record[0]['AuthorList'])
    AuthorList = [z.encode('utf-8') for z in AuthorList]
    LastAuthor = (AuthorList[-1]).encode('utf-8')
    handle.close()
    reference = paper(Title, LastAuthor, PubDate, AuthorList)
    return reference
def _search(query):
    handle = Entrez.esearch(db="pubmed", sort="relevance", retmax="20",
                            retmode="xml", term=query, usehistory="y")
    results = Entrez.read(handle)
    webenv = results["WebEnv"]  # ID for the session
    query_key = results["QueryKey"]  # ID for the query within the session
    return results
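# Usage sketch (illustrative): _search() returns the raw esearch result, so a
# caller can read WebEnv/QueryKey straight from it; email is a placeholder.
if __name__ == "__main__":
    Entrez.email = "you@example.org"
    res = _search("lung cancer immunotherapy")
    print(res["IdList"][:5], res["WebEnv"], res["QueryKey"])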
def test_epost(self):
    handle = Entrez.epost("nuccore", id="186972394,160418")
    self.assertEqual(URL_HEAD + "epost.fcgi", handle.url)
    handle.close()
    handle = Entrez.epost("nuccore", id=["160418", "160351"])
    self.assertEqual(URL_HEAD + "epost.fcgi", handle.url)
    handle.close()
def acsum(aclist, batchsize=100):
    """
    Fetch esummary info for a list of accession numbers -- useful for
    getting GIs and taxids.
    """
    global email
    assert email, "set email!"
    Entrez.email = email
    results = {}
    for v in batch(aclist, batchsize):
        v = list(v)
        h = Entrez.esearch(db="nucleotide", retmax=len(v),
                           term=" OR ".join(["%s[ACCN]" % x for x in v]),
                           usehistory="y")
        d = Entrez.read(h)
        h.close()
        # GIs, but not in the order of aclist
        gis = d['IdList']
        d = Entrez.read(Entrez.esummary(db='nucleotide', id=','.join(gis)),
                        validate=False)
        for x in d:
            ac = x['Caption']
            if ac in aclist:
                results[ac] = x
    return results
from Bio import Entrez
from Bio import SeqIO

Entrez.email = "*****@*****.**"
with Entrez.efetch(db="nucleotide", rettype="fasta", retmode="text",
                   id="13788565") as handle:
    seq_record = SeqIO.read(handle, "fasta")
print(seq_record.seq[0:40])

records = list(SeqIO.parse("sample.fasta", "fasta"))
first_record = records[0]
first_sequence = str(first_record.seq)
print(first_sequence)
hits = collections.defaultdict(list)
seq_files = {}
with open(args.infile, 'rt') as ih:
    rdr = csv.reader(ih, delimiter='\t')
    for i, row in enumerate(rdr):
        gid = row[0].split('|')[-2]
        pid = float(row[1])
        s = int(row[2])
        e = int(row[3])
        n = row[4]
        ## Check if we have already encountered this id; if not,
        ## download the sequence
        if gid not in hits:
            seq = Entrez.efetch(db='nucleotide', id=gid, rettype='gb',
                                retmode='text').read()
            tmpfile = tempfile.mkstemp(dir='.')
            with open(tmpfile[0], 'wt') as th:
                th.write(seq)
            seq_files[gid] = tmpfile[1]
            time.sleep(60)
        ## Store information
        hits[gid].append((s, e, pid, n, gid))

## For each hit, find the closest gene
with open(args.output, 'wt') as oh:
    for gid, hit_info in hits.items():
        features = [
            f for f in SeqIO.read(seq_files[gid], 'genbank').features
Created on Tue Sep 06 18:45:27 2016
USAGE: python getPUBMEDabstractsFROMxml.py <INPUT.xml>
@author: Jahir Gutierrez
"""
from Bio import Entrez
import sys

filename = str(sys.argv[1])
handle = open(filename)
records = Entrez.parse(handle)
out_filename = filename.replace('.xml', '.tsv')
f = open(out_filename, 'w')
for record in records:
    try:
        pmid = record['MedlineCitation']['PMID'].encode('ascii', 'ignore')
        title = record['MedlineCitation']['Article']['ArticleTitle'].encode(
            'ascii', 'ignore')
        abstract_list = record['MedlineCitation']['Article']['Abstract'][
            'AbstractText']
        abstract = ''
    except:
        continue
    for element in abstract_list:
        abstract = abstract + element.encode('ascii', 'ignore') + ' '
    abstract = abstract[0:len(abstract) - 1]
    usage()
    exit()

f = open(sys.argv[1], "r")
list_records = []
Entrez.email = "*****@*****.**"
out = open(sys.argv[2], "w")
count = 0
for l in f:
    count += 1
    if count % 50 == 0:
        print(str(count) + " sequences processed...")
    accession = l.rstrip()
    try:
        handle_search = Entrez.esearch(db="nucleotide", term=accession)
        record_search = Entrez.read(handle_search)
        records_fasta = [
            Entrez.efetch(id=i, db="nucleotide", rettype="fasta",
                          retmode="text")
            for i in record_search["IdList"]
        ]
        records_seqio = [
            SeqIO.read(record, "fasta") for record in records_fasta
        ]
    except:
        continue
    for rec in records_seqio:
        if accession in rec.id:
            SeqIO.write(rec, out, "fasta")
    'IMG_ID', 'N_Reads', 'N_Contigs', 'N50', 'Habitat', 'Location'
])
samplescsv = csv.reader(infh, delimiter="\t")
headerrow = next(samplescsv)
for row in samplescsv:
    outrow = row
    BIOPROJECTID = row[3]
    print(row)
    SRARUN = []
    NREADS = 0
    NBASES = 0
    if BIOPROJECTID:
        SRP = row[4]
        handle = Entrez.esearch(db="sra", retmax=10, term=SRP)
        record = Entrez.read(handle)
        for id in record["IdList"]:
            SRP = id
        handle.close()
        # SRP = row[4]
        handle = Entrez.efetch(db="sra", id=SRP)
        tree = ET.parse(handle)
        root = tree.getroot()
        for runs in root.iter('RUN_SET'):
            for run in runs.iter('RUN'):
                SRARUN.append(run.attrib['accession'])
                # indent(run)
                # print(ET.tostring(run))
            for dbs in runs.iter('Databases'):
from Bio import Entrez

my_em = '*****@*****.**'
db = "pubmed"

# Search the Entrez website using esearch from eUtils;
# esearch returns a handle (called h_search)
h_search = Entrez.esearch(db=db, email=my_em,
                          term='python and bioinformatics')
# Parse the result with Entrez.read()
record = Entrez.read(h_search)
# Get the list of Ids returned by the previous search
res_ids = record["IdList"]
# For each id in the list
for r_id in res_ids:
    # Get summary information for each id
    h_summ = Entrez.esummary(db=db, id=r_id, email=my_em)
    # Parse the result with Entrez.read()
    summ = Entrez.read(h_summ)
    print(summ[0]['Title'])
    print(summ[0]['DOI'])
    print('==============================================')
organismDict[row[0]] = str(row[1]) + ' ' + str(row[2]) + ' ' + str(row[0])

# extract the unique genomes
accessions = df['NCBI accession'].tolist()
acc_set = set(accessions)  # set of unique genome accession identifiers

# Query NCBI and write the results to files on disk
for acc in acc_set:
    directory = DIR + '\\output\\' + organismDict[acc]
    # this check avoids overwriting folders that already exist
    if not os.path.exists(directory):
        gotIt = False
        print("Start processing " + organismDict[acc])
        while (gotIt == False):
            try:
                gb_acc = Entrez.efetch(db='nuccore', id=acc, rettype='gb',
                                       retmode='text')
                gotIt = True
            except (urllib.error.HTTPError, urllib.error.URLError):
                time.sleep(3)
        rec = SeqIO.read(gb_acc, 'genbank')
        # no extra checks needed: if we got here, the directory does not exist
        os.makedirs(directory)
        SeqIO.write(rec, directory + '\\' + organismDict[acc] + '.gbk', 'gb')
        print(organismDict[acc])
        time.sleep(3)
    else:
        # print the accession so it is clear this identifier was skipped
        print(acc)
# print(dfAntismashDB['NCBI accession', 'From', 'To'])
def get_raw_summary(id, db="assembly"): handle = Entrez.esummary(db=db, id=id, report="full") record = Entrez.read(handle) #return(record['DocumentSummarySet']['DocumentSummary'][0]['AssemblyName']) #This will return the Assembly name return (record)
from os.path import join
from Bio import Entrez, SeqIO

Entrez.email = '*****@*****.**'

import logging
log = logging.getLogger('orthofinder')

handle = Entrez.esearch(db="genome", term="Klebsiella pneumoniae")
record = Entrez.read(handle)
print record.keys()

if record['IdList']:
    handle = Entrez.efetch(db='genome', id=record['IdList'])
    rec = Entrez.read(handle)
    print rec
    print rec.keys()

# for id in record['IdList']:
#     print(id)
#     handle = Entrez.efetch(db="genome", id=id)
#     print(handle.read())
#
# for i, id in enumerate(record['IdList']):
#     print ' Fetching %s...' % id
#     fetch_handle = Entrez.efetch(db='nuccore', id=id, retmode='text',
#                                  rettype='gb')
#     gb_fpath = join(str(i) + '_test.gb')
#     with open(gb_fpath, 'w') as file:
#         file.write(fetch_handle.read())
#
#     rec = SeqIO.read(gb_fpath, 'gb')
#     org_name = rec.annotations['organism']
def get_ids(query, db="assembly"): ids = [] handle = Entrez.esearch(db=db, term=query) record = Entrez.read(handle) ids.append(record["IdList"]) return ids[0]
        accessions.append([])
        records.append([])
    else:
        taxa.append(row[0])
        for i, accession in enumerate(row):
            if i != 0:
                accessions[i - 1].append(row[i])

Entrez.email = '*****@*****.**'
for i, gene in enumerate(genes):
    print('Downloading accessions for ' + gene + '...')
    for j, accession in enumerate(accessions[i]):
        if accession.strip() != '':
            handle = Entrez.efetch(db='nucleotide', rettype='fasta',
                                   retmode='text', id=accession)
            record = SeqIO.read(handle, 'fasta')
            records[i].append(
                SeqRecord(Seq(str(record.seq), IUPAC.ambiguous_dna),
                          id=taxa[j], description=""))
            handle.close()
            sleep(0.02)
    SeqIO.write(records[i], "sequences_unaligned/" + genes[i] + ".fasta",
                "fasta")

for i, gene in enumerate(genes):
    print("Aligning " + gene + " with MAFFT...")
    mafft_cline = MafftCommandline(input="sequences_unaligned/" +
                                   genes[i] + ".fasta")
def geneAnalysis(i, genes=None):
    # Start Excel file
    workbook = xlsxwriter.Workbook('GeneGBInfo/genesGenbank.xlsx')
    worksheet = workbook.add_worksheet("Genebank")
    bold = workbook.add_format({'bold': True})
    worksheet.write("A1", "GeneID", bold)
    worksheet.write("B1", "Location", bold)
    worksheet.write("C1", "Function", bold)
    worksheet.write("D1", "Gene", bold)
    worksheet.write("E1", "Locus tag", bold)
    worksheet.write("F1", "Note", bold)
    worksheet.write("G1", "Product", bold)
    worksheet.write("H1", "Protein_id", bold)
    worksheet.write("I1", "Sequence", bold)
    row = 1
    col = 0
    # End Excel file

    # Parse GenBank data
    handle = Entrez.efetch(db="nucleotide", rettype="gbwithparts",
                           retmode="text", id=i, retmax=10**9, batchSize=1000)
    for seq_record in SeqIO.parse(handle, "gb"):
        g = []
        protids = []
        for feat in (seq_record.features):
            if feat.type == "CDS" and feat.qualifiers['locus_tag'][0] in genes:
                gene = feat.qualifiers['db_xref'][0].split(':')  # ex. GeneID:19834053
                g.append(gene[1].strip())
                worksheet.write(row, col, gene[1].strip())
                col += 1
                worksheet.write(row, col, str(feat.location))
                col += 1
                worksheet.write(row, col, feat.qualifiers['function'][0])
                col += 1
                if ('gene' in feat.qualifiers):
                    worksheet.write(row, col, feat.qualifiers['gene'][0])
                    col += 1
                else:
                    col += 1
                worksheet.write(row, col, feat.qualifiers['locus_tag'][0])
                col += 1
                if ('note' in feat.qualifiers):
                    worksheet.write(row, col, feat.qualifiers['note'][0])
                    col += 1
                else:
                    col += 1
                worksheet.write(row, col, feat.qualifiers['product'][0])
                col += 1
                worksheet.write(row, col, feat.qualifiers['protein_id'][0])
                protids.append(feat.qualifiers['protein_id'][0])
                col += 1
                worksheet.write(row, col, str(seq_record.seq))
                col += 1
                row += 1
                col = 0
        u = getUniProtIds(g)
        getUniProtInfo(u)
        f = open('protids.txt', 'w')
        for protid in protids:
            print(protid, file=f)
        f.close()
    workbook.close()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 23 14:52:24 2017

@author: atmiyashita
"""
# %% <<<<<<<< NCBI search and downloading a sequence file in FASTA format >>>>>>>
from Bio import Entrez  # import Entrez

Entrez.email = "*****@*****.**"  # always tell NCBI who you are
# Open the search and look for the target genes in arthropods (limit the
# sequence length to less than 10 kbp)  !<<----Check!
search_handle = Entrez.esearch(
    db="nucleotide",
    term='"elongation factor"[PROTEINFULLNAME] AND "arthropods"[porgn] AND 0:10000[Sequence Length]',
    usehistory="y", idtype="acc", RetMax=1000)
search_record = Entrez.read(search_handle)  # store the search record
search_handle.close()  # close the search

print(search_record["Count"])
print(search_record["IdList"])
IDs = search_record["IdList"]  # NCBI IDs (nucleotide) to fetch later
print(search_record.keys())
print(IDs)

out_handle = open("EF-arth.txt", "w")  # open the output file  !<<----Check!
for seq_id in IDs:
    fetch_handle = Entrez.efetch(
        db="nucleotide",
                      usecols=[0, 1], header=False)
snp_tab.columns = colnames
snplist = snp_tab.VAR.values.tolist()
start_time = datetime.datetime.now()
snp_flank_left = []
snp_flank_right = []
for i, snp in enumerate(snplist):
    snp_split = snp.split(':')
    if len(snp_split[2]) == 3:
        chrom = snp_split[0]
        pos = snp_split[1]
        handle = Entrez.efetch(db="nucleotide", id=hg19_chrom[chrom],
                               rettype="fasta", strand=1,
                               seq_start=int(pos) - 1,
                               seq_stop=int(pos) + 1)
        record = SeqIO.read(handle, "fasta")
        handle.close()
        if record.seq[1] == snp_split[2].split('>')[0]:  # compare to ref base
            print "record: ", i, "chrom: ", chrom, "pos: ", pos, "trimer: ", record.seq
            snp_flank_left.append(str(record.seq[0]))
            snp_flank_right.append(str(record.seq[2]))
        else:
            print i, "NA"
            snp_flank_left.append("NA")
            snp_flank_right.append("NA")
    else:
        print i, "NA"
        snp_flank_left.append("NA")
import flask
from flask import jsonify
import Bio
from Bio import Entrez
from Bio.Align.Applications import ClustalwCommandline
from Bio import AlignIO
from Bio import Phylo

Entrez.email = "*****@*****.**"

extracao = Entrez.einfo()
extracao_lida = Entrez.read(extracao)
bancos = extracao_lida["DbList"]

# for each element in the list of databases, show an ID for each
bcs = ""
for b in range(1, len(bancos) + 1):
    print(b, bancos[b - 1])
    bcs += str(b)
    bcs += " - "
    bcs += bancos[b - 1]
    bcs += " | "

# start the application
app = flask.Flask("GenBridgePY")
app.config["JSON_AS_ASCII"] = False

# generate the URL
""" ###################################################################################### import time from Bio import Entrez Entrez.email = '*****@*****.**' #input your email in_file = '/Users/lindsayhopson/Documents/mouse_CensuScope_outputs/newAccList1.2.txt' #use your appropriate file path with open('orgNames_newAccList2.2.txt', 'w') as output_File: with open(in_file, 'r') as input_File: id_list = input_File.readlines() count = 0 for i in id_list: handle = Entrez.esummary(db="nucleotide", id=i) record = Entrez.read(handle) a = str(record[0]['Title']) b = str(record[0]['TaxId']) a = a.replace(',', ' ') handle = Entrez.efetch(db="taxonomy", id=b) record = Entrez.read(handle) c = record[0]['Lineage'] output_File.write(str('\n' + i + '\t' + a + '\t' + b + '\n')) count = count + 1 print(i + '***' + a + '***' + b + '***' + c) time.sleep(1)
from Bio import Entrez
from Bio import Medline
import pandas as pd

MAX_COUNT = 300
# TERM = 'Tuberculosis'
# PubMed has only 2 papers on these keywords; Google Scholar has many more
TERM = 'precursor bioink gelatin'

print('Getting {0} publications containing {1}...'.format(MAX_COUNT, TERM))
Entrez.email = '*****@*****.**'
h = Entrez.esearch(db='pubmed', retmax=MAX_COUNT, term=TERM)
result = Entrez.read(h)
print('Total number of publications containing {0}: {1}'.format(
    TERM, result['Count']))
ids = result['IdList']
print(ids)

h = Entrez.efetch(db='pubmed', id=ids, rettype='medline', retmode='text')
# h = Entrez.efetch('pubmed', id=ids, retmode='xml')
# records = Entrez.parse(h)
records = Medline.parse(h)
for seq_record in SeqIO.parse(handle, 'fasta'):
    print(seq_record.id)
    print(len(seq_record.seq))
    print(repr(seq_record.seq))
handle.close()

print("Clustal W Files")
handle = open("clustalw.clustal_num")
for seq_record in SeqIO.parse(handle, "clustal"):
    print(seq_record.id)
handle.close()
# --------------------------------------
print("Entrez information")
handle = Entrez.einfo()
result = handle.read()
print(result)

handle = Entrez.efetch(db="nucleotide", id="57240072", rettype="gb",
                       retmode="text")
for i in handle.readlines():
    print(i.strip())
print(handle.readline().strip())
handle.close()
# -----------
print("PDB Files")
pdb1 = PDBList()
pdbFile = pdb1.retrieve_pdb_file('1FAT')
def retrieve_wgsmaster_contigs(uid):
    """Munges a download URL from the passed UID and downloads the
    corresponding archive from NCBI, extracting it to the output directory.
    """
    logger.info("Processing wgsmaster UID: %s" % uid)
    summary = Entrez.read(
        Entrez.esummary(db='nuccore', id=uid, rettype='text', validate=False))
    # Assume that the 'Extra' field is present and is well-formatted,
    # which means that the first six characters of the last part of
    # the 'Extra' string correspond to the download archive filestem.
    dlstem = summary[0]['Extra'].split('|')[-1][:6]
    dlver = summary[0]['Extra'].split('|')[3].split('.')[-1]
    # Download the archive to the output directory.
    # Establish the download size; if the version number is not in sync with
    # the download, try again with the version number decremented by 1. This
    # may be necessary because genome sequence version and genome/assembly
    # version numbers are not synchronised.
    fsize = None
    while str(dlver) != '0' and not fsize:
        try:
            fname = "%s.%s.fsa_nt.gz" % (dlstem, dlver)
            outfname = os.path.join(args.outdirname, fname)
            url = "http://www.ncbi.nlm.nih.gov/Traces/wgs/?download=%s" % \
                fname
            logger.info("Trying URL: %s" % url)
            response = urlopen(url)
            meta = response.info()
            fsize = int(meta.getheaders("Content-length")[0])
            logger.info("Downloading: %s Bytes: %s" % (fname, fsize))
            fsize_dl = 0
            bsize = 1048576
        except:
            # Download didn't work; assume it's because of the version
            fsize = None
            logger.error("Download failed for (%s)" % url)
            if str(dlver) != '0':
                dlver = int(dlver) - 1
                logger.info("Retrying download with version = %s" % dlver)
            else:
                logger.error("No more versions to try (exiting)")
                sys.exit(1)
    # Download data
    try:
        with open(outfname, 'wb') as fh:
            while True:
                buffer = response.read(bsize)
                if not buffer:
                    break
                fsize_dl += len(buffer)
                fh.write(buffer)
                status = r"%10d [%3.2f%%]" % (fsize_dl,
                                              fsize_dl * 100. / fsize)
                logger.info(status)
    except:
        logger.error("Download failed for %s (exiting)" % fname)
        logger.error(last_exception())
        sys.exit(1)
    # Extract archive
    asm_summary = entrez_retry(Entrez.esummary, db='assembly', id=asm_uid,
                               rettype='text')
    asm_record = Entrez.read(asm_summary, validate=False)
    gname = asm_record['DocumentSummarySet']['DocumentSummary']\
        [0]['AssemblyAccession']
    extractfname = os.path.join(args.outdirname, '.'.join([gname, 'fasta']))
    try:
        logger.info("Extracting archive %s to %s" % (outfname, extractfname))
        with open(extractfname, 'w') as efh:
            subprocess.call(['gunzip', '-c', outfname],
                            stdout=efh)  # can be subprocess.run in Py3.5
        logger.info("Archive extracted to %s" % extractfname)
    except:
        logger.error("Extracting archive %s failed (exiting)" % outfname)
        logger.error(last_exception())
        sys.exit(1)
    # Get contig_uids
    contig_uids = [s.description for s in SeqIO.parse(extractfname, 'fasta')]
    return contig_uids, extractfname
#!/usr/bin/python
"""Download information on the 'water pathogen' search query in NCBI.

Command-line application that runs a search and returns the results
(Adina Howe, 2014).
"""
from Bio import Entrez

Entrez.email = '*****@*****.**'
handle = Entrez.esearch(db="genome", term="water pathogen", retmax=200)
records = Entrez.read(handle)
for i in records['IdList']:
    fp1 = open(i + '.summary', 'w')
    fp2 = open(i + '.links', 'w')
    handle1 = Entrez.esummary(db="genome", id=i)
    records1 = Entrez.read(handle1)
    print i
    fp1.write('%s\t%s\t%s\n' % (i, records1[0]['Organism_Name'],
                                records1[0]['DefLine']))
    handle2 = Entrez.elink(dbfrom="genome", db="nucleotide", id=i)
    records2 = Entrez.read(handle2)
    fp2.write('%s\t' % i)
    list_of_dict_ids = records2[0]['LinkSetDb'][0]['Link']
    for link_id in list_of_dict_ids:
        fp2.write('%s\t' % link_id['Id'])
    fp2.write('\n')
def fetch_accession_range(acc: str, start: int, stop: int):
    with Entrez.efetch(db='nucleotide', id=acc, rettype='fasta',
                       retmode='text', seq_start=start, seq_stop=stop) as h, \
            TemporaryFile(mode='w+') as temp:
        temp.write(h.read())
        temp.seek(0)
        return SeqIO.read(temp, format='fasta')
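# Usage sketch (illustrative): slice the first 1000 bases of a record;
# NC_001604 (enterobacteria phage T7, also used elsewhere in this file) is a
# placeholder accession, and the email is a placeholder.
if __name__ == "__main__":
    Entrez.email = "you@example.org"
    rec = fetch_accession_range("NC_001604", 1, 1000)
    print(len(rec.seq), rec.id)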
def write_contigs(asm_uid, contig_uids):
    """Writes assembly contigs out to a single FASTA file in the script's
    designated output directory.

    FASTA records are returned, as GenBank and even GenBankWithParts format
    records don't reliably give correct sequence in all cases.

    The script returns two strings for each assembly, a 'class' and a
    'label' string - this is for use with, e.g. pyani.
    """
    # Has duplicate code with get_class_label_info() - needs refactoring
    logger.info("Collecting contig data for %s" % asm_uid)
    # Assembly record - get binomial and strain names
    asm_summary = entrez_retry(Entrez.esummary, db='assembly', id=asm_uid,
                               rettype='text')
    asm_record = Entrez.read(asm_summary, validate=False)
    asm_organism = asm_record['DocumentSummarySet']['DocumentSummary']\
        [0]['SpeciesName']
    try:
        asm_strain = asm_record['DocumentSummarySet']['DocumentSummary']\
            [0]['Biosource']['InfraspeciesList'][0]['Sub_value']
    except:
        asm_strain = ""
    # Assembly UID (long form) for the output filename
    gname = asm_record['DocumentSummarySet']['DocumentSummary']\
        [0]['AssemblyAccession']
    outfilename = "%s.fasta" % os.path.join(args.outdirname, gname)
    # Create label and class strings
    genus, species = asm_organism.split(' ', 1)
    ginit = genus[0] + '.'
    labeltxt = "%s\t%s %s %s" % (gname, ginit, species, asm_strain)
    classtxt = "%s\t%s" % (gname, asm_organism)
    # Get FASTA records for contigs
    logger.info("Downloading FASTA records for assembly %s (%s)" %
                (asm_uid, ' '.join([ginit, species, asm_strain])))
    # We're doing an explicit retry loop here because we want to confirm we
    # have the correct data, as well as test for Entrez connection errors,
    # which is all the entrez_retry function does.
    tries, success = 0, False
    while not success and tries < args.retries:
        try:
            records = []  # Holds all returned records
            # We may need to batch contigs
            query_uids = ','.join(contig_uids)
            batch_size = 10000
            for start in range(0, len(contig_uids), batch_size):
                logger.info("Batch: %d-%d" % (start, start + batch_size))
                seqdata = entrez_retry(Entrez.efetch, db='nucleotide',
                                       id=query_uids, rettype='fasta',
                                       retmode='text', retstart=start,
                                       retmax=batch_size)
                records.extend(list(SeqIO.parse(seqdata, 'fasta')))
            tries += 1
            # Check only that the correct number of records was returned.
            if len(records) == len(contig_uids):
                success = True
            else:
                logger.warning("%d contigs expected, %d contigs returned" %
                               (len(contig_uids), len(records)))
                logger.warning("FASTA download for assembly %s failed" %
                               asm_uid)
                logger.warning("try %d/20" % tries)
            # Could also check expected assembly sequence length?
            totlen = sum([len(r) for r in records])
            logger.info("Downloaded genome size: %d" % totlen)
        except:
            logger.warning("FASTA download for assembly %s failed" % asm_uid)
            logger.warning(last_exception())
            logger.warning("try %d/20" % tries)
    if not success:
        # Could place an option on the command line to stop or continue here.
        # logger.error("Failed to download records for %s (exiting)" % asm_uid)
        # sys.exit(1)
        logger.error("Failed to download records for %s (continuing)" %
                     asm_uid)
    # Write contigs to file
    retval = SeqIO.write(records, outfilename, 'fasta')
    logger.info("Wrote %d contigs to %s" % (retval, outfilename))
# <codecell> from Bio import Entrez, SeqIO Entrez.email = "" # Always tell NCBI who you are # <codecell> for index, row in data.iterrows(): for refseq in row['Chromosomes/RefSeq'].split(','): filename = "fasta/%s.fasta" % (refseq, ) if os.path.exists(filename): continue print "%i/%i" % (index, len(data)), refseq, row['#Organism/Name'] handle = Entrez.efetch(db="nucleotide", id=refseq, rettype="fasta", retmode="text") seq = SeqIO.read(handle, 'fasta') output_handle = open(filename, "w") SeqIO.write(seq, output_handle, "fasta") output_handle.close() # <codecell> # <codecell> def skew_increments(s): return [{ 'C': -1, 'A': 0,
def main():
    sim = pt.Model(cell_volume=CELL_VOLUME)
    # Download the T7 wild-type GenBank record
    Entrez.email = "*****@*****.**"
    handle = Entrez.efetch(db="nuccore", id=["NC_001604"], rettype="gb",
                           retmode="text")
    record = SeqIO.read(handle, "genbank")
    genome_length = len(record.seq)
    phage = pt.Genome(name="phage", length=genome_length,
                      transcript_degradation_rate=1e-2,
                      transcript_degradation_rate_ext=1e-5,
                      rnase_speed=20, rnase_footprint=10)
    # phage = pt.Genome(name="phage", length=genome_length)

    for feature in record.features:
        weights = [0.0] * len(record.seq)
        # Convert to inclusive genomic coordinates
        start = feature.location.start.position + 1
        stop = feature.location.end.position
        name = ''
        if "note" in feature.qualifiers:
            name = feature.qualifiers["note"][0]
        # Grab promoters and terminators
        if feature.type == "regulatory":
            if name in IGNORE_REGULATORY:
                continue
            # Construct promoter
            if "promoter" in feature.qualifiers["regulatory_class"]:
                length = stop - start
                if length < 35:
                    start = start - 35
                interactions = get_promoter_interactions(name)
                phage.add_promoter(name, start, stop, interactions)
            # Construct terminator params
            if "terminator" in feature.qualifiers["regulatory_class"]:
                interactions = get_terminator_interactions(name)
                phage.add_terminator(name, start, stop, interactions)
        # Grab genes/CDSes
        if feature.type == "gene":
            if name in IGNORE_GENES:
                continue
            if name in RELABEL_GENES:
                name = RELABEL_GENES[name]
            # Construct CDS parameters for this gene
            phage.add_gene(name=name, start=start, stop=stop,
                           rbs_start=start - 30, rbs_stop=start,
                           rbs_strength=1e7)
            # Recode gene 10A
            if name == "gene 10A":
                gene10_start = start
                gene10_stop = stop
        if feature.type == "CDS":
            weights = compute_cds_weights(record, feature, 1.0, weights)
        if feature.type == "misc_structure":
            print(feature.qualifiers)
            phage.add_rnase_site(start=start, stop=start + 10)
            print(start, stop, name)

    weights[gene10_start:gene10_stop] = [0.1] * (gene10_stop - gene10_start)
    mask_interactions = ["rnapol-1", "rnapol-3.5", "ecolipol", "ecolipol-p",
                         "ecolipol-2", "ecolipol-2-p"]
    phage.add_mask(500, mask_interactions)
    norm_weights = normalize_weights(weights)
    phage.add_weights(norm_weights)
    sim.register_genome(phage)

    sim.add_polymerase("rnapol-1", 35, 230, 0)
    sim.add_polymerase("rnapol-3.5", 35, 230, 0)
    sim.add_polymerase("ecolipol", 35, 45, 0)
    sim.add_polymerase("ecolipol-p", 35, 45, 0)
    sim.add_polymerase("ecolipol-2", 35, 45, 0)
    sim.add_polymerase("ecolipol-2-p", 35, 45, 0)
    sim.add_ribosome(30, 30, 0)

    sim.add_species("bound_ribosome", 10000)
    sim.add_species("bound_ecolipol", 1800)
    sim.add_species("bound_ecolipol_p", 0)
    sim.add_species("ecoli_genome", 0)
    sim.add_species("ecoli_transcript", 0)
    sim.add_reaction(1e6, ["ecoli_transcript", "__ribosome"],
                     ["bound_ribosome"])
    sim.add_reaction(0.04, ["bound_ribosome"],
                     ["__ribosome", "ecoli_transcript"])
    sim.add_reaction(0.001925, ["ecoli_transcript"], ["degraded_transcript"])
    sim.add_reaction(1e7, ["ecolipol", "ecoli_genome"], ["bound_ecolipol"])
    sim.add_reaction(0.3e7, ["ecolipol-p", "ecoli_genome"],
                     ["bound_ecolipol_p"])
    sim.add_reaction(0.04, ["bound_ecolipol"],
                     ["ecolipol", "ecoli_genome", "ecoli_transcript"])
    sim.add_reaction(0.04, ["bound_ecolipol_p"],
                     ["ecolipol-p", "ecoli_genome", "ecoli_transcript"])
    sim.add_reaction(3.8e7, ["protein_kinase-0.7", "ecolipol"],
                     ["ecolipol-p", "protein_kinase-0.7"])
    sim.add_reaction(3.8e7, ["protein_kinase-0.7", "ecolipol-2"],
                     ["ecolipol-2-p", "protein_kinase-0.7"])
    sim.add_reaction(3.8e7, ["gp-2", "ecolipol"], ["ecolipol-2"])
    sim.add_reaction(3.8e7, ["gp-2", "ecolipol-p"], ["ecolipol-2-p"])
    sim.add_reaction(1.1, ["ecolipol-2-p"], ["gp-2", "ecolipol-p"])
    sim.add_reaction(1.1, ["ecolipol-2"], ["gp-2", "ecolipol"])
    sim.add_reaction(3.8e9, ["lysozyme-3.5", "rnapol-1"], ["rnapol-3.5"])
    sim.add_reaction(3.5, ["rnapol-3.5"], ["lysozyme-3.5", "rnapol-1"])
    sim.seed(32)
    sim.simulate(time_limit=1200, time_step=5,
                 output="phage_degrade_recoded_01_counts.tsv")
def get_pubmed_esummary(pmid_list):
    handle = Entrez.esummary(db="pubmed", id=pmid_list)
    records = Entrez.read(handle)
    return records
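# Usage sketch (placeholder PMIDs and email): esummary accepts a
# comma-separated string or a list of PubMed IDs.
if __name__ == "__main__":
    Entrez.email = "you@example.org"
    for doc in get_pubmed_esummary("31651376,22347800"):
        print(doc["Id"], doc.get("Title", ""))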
def save(self, update=False, *args, **kwargs):
    if not self.pk or update:
        # This code only runs if the object is not in the database yet;
        # otherwise it would have a pk.
        try:
            # Reference._for_write = True
            if self.pmid or 'pmid' in kwargs:
                print "Did not fail"
                return Reference.objects.get(pmid=self.pmid)  # , False
            elif self.title or 'title' in kwargs:
                # print self.title
                handle = Entrez.esearch(db='pubmed', term=self.title)
                print "Got handle"
                record = Entrez.read(handle)
                print "Got record", record
                print record['Count'], type(record['Count'])
                if record['Count'] == "1":
                    print "Record count is 1"
                    self.pmid = record['IdList'][0]
                    # print self.title, self.pmid
                    Reference.fetch_data(self)
                    print("Saving")
                    super(Reference, self).save(*args, **kwargs)
                    print("Saved")
                else:
                    # This import at the top breaks Denigma for an unknown reason.
                    from denigma.library import Bibliography
                    # print("Trying it differently. %s" % type(self.title))
                    # Google:
                    bib = Bibliography()
                    # print("googling")
                    r = bib.google(self.title)
                    if r:
                        r = r[0]
                        self.pmid = r.pmid
                        # print("Google successful: %s" % self.pmid)
                    else:
                        # print("Google failed.")
                        # r = bib.find(self.title)[0]
                        # self.pmid = r.pmid
                        # print self.pmid
                        # print("Trying it differently.")
                        r = bib.find(unicode(self.title))
                        if len(r) == 1:
                            r = r[0]
                            self.pmid = r.pmid
                            print self.pmid
                        elif len(r) > 1:
                            title = normalize_title(self.title)
                            for areference in r:
                                if normalize_title(areference.title) == title:
                                    r = areference
                    print("datasets.Reference.save()")
                    self.__dict__.update(r.__dict__)
                    print r
                    print vars(r)
                    self.date = normalize_time(r.date)
                    print "# Transforming lists into strings:"
                    self.keywords = "; ".join(self.keywords)
                    self.authors = "; ".join(self.authors)
                    print "calling super"
                    print self.pmid
                    try:
                        # Just save the given information.
                        super(Reference, self).save(*args, **kwargs)
                    except Exception as e:
                        print e
                    print "called super"
                    # Raise an exception stating that the given information
                    # yielded more than one reference.
            else:
                super(Reference, self).save(*args, **kwargs)
        except Reference.DoesNotExist as e:
            print "Error", e
            Reference.fetch_data(self)
            super(Reference, self).save(*args, **kwargs)
    else:
        super(Reference, self).save(*args, **kwargs)
print("Baixar sequencias do Genbank (ate 200 acessos)") print('Adaptado por Tiago Andrade Borges Santos') from Bio import Entrez f = open('sequence.gb', 'w') Entrez.email = "*****@*****.**" # Always tell NCBI who you are print( "Insira sua lista de acessos do Genbank entre aspas, separados por virgula." ) print("Exemplo: 'GU479772', 'GU479773'") record = input("Seqs: ") gb_list = (record) gb_str = ",".join(gb_list) handle = Entrez.efetch(db="nuccore", id=gb_str, rettype="gb", retmode="txt") text = handle.read() f.write(str(text + '\n')) f.close() print("Sequencias com Genbank Number (1) ou sem Genbank Number (2)?") resposta = int(input("resposta: ")) from Bio import SeqIO #importar SeqIO a partir do biopython: sequencias = SeqIO.parse('sequence.gb', 'genbank') nseq = 0 #variavel para contar o numero de sequencias processadas f = open('sequence.fas', 'w') #abre o arquivo onde vai salvar os resultados if resposta == 1: for seq in sequencias: #loop que itera cada uma das sequencias generoespecie = seq.annotations['organism'].split(' ') f.write('>' + generoespecie[0] + '_' + generoespecie[1] + '_' + seq.name + '\n')
from Bio import Entrez

Entrez.email = "*****@*****.**"  # oh right, the email
handle = Entrez.esummary(db="pubmed", id="31651376")
record = Entrez.read(handle)
info = record[0]
print("Journal info\nid: {}\nTitle: {}".format(record[0]["Id"],
                                               info["Title"]))
#!/usr/bin/env python
# Given: A genus name, followed by two dates in YYYY/M/D format.
# Return: The number of Nucleotide GenBank entries for the given genus
#         that were published between the dates specified.
from Bio import Entrez

term = "Nesterenkonia"
start = "2001/03/24"
end = "2011/09/19"
Entrez.email = '*****@*****.**'
handle = Entrez.esearch(db="nucleotide",
                        term='"' + term + '"[Organism] AND ("' + start +
                             '"[PDAT] : "' + end + '"[PDAT])')
record = Entrez.read(handle)
print record["Count"]
def get_pubmed_record_from_xml(pmid_list):
    [pmid2title, pmid2abstract] = get_titles_abstracts(pmid_list)
    handle = Entrez.efetch(db="pubmed", id=pmid_list, rettype='xml')
    record = Entrez.read(handle)
    data = []
    for paper in record['PubmedArticle']:
        entry = {}
        entry['pmid'] = int(paper['MedlineCitation']['PMID'])
        article = paper['MedlineCitation']['Article']
        journal = article['Journal']
        # entry['issn'] = journal.get('ISSN')
        entry['journal_abbrev'] = journal.get('ISOAbbreviation')
        entry['journal_title'] = journal.get('Title')
        if journal.get('JournalIssue'):
            entry['issue'] = journal['JournalIssue'].get('Issue')
            entry['volume'] = journal['JournalIssue'].get('Volume')
            if journal['JournalIssue'].get('PubDate'):
                entry['year'] = journal['JournalIssue']['PubDate'].get('Year')
        entry['title'] = article.get('ArticleTitle')
        ## The titles from the XML format preserve all special characters,
        ## while the titles from the medline format do not, so try our best
        ## to use the titles from XML -- unless there are HTML tags in the
        ## titles; in those cases the XML format will mess up the title parsing.
        titleFromXML = entry['title']
        titleFromTXT = pmid2title.get(entry['pmid'])
        if titleFromTXT is not None:
            wordsFromXML = titleFromXML.split(" ")
            wordsFromTXT = titleFromTXT.split(" ")
            if len(wordsFromTXT) > len(wordsFromXML) + 2:
                entry['title'] = titleFromTXT
        abstract = pmid2abstract.get(entry['pmid'])
        if abstract is not None:
            entry['abstract'] = abstract
        if paper['MedlineCitation'].get('DateRevised'):
            dateRevised = paper['MedlineCitation']['DateRevised']
            entry['date_revised'] = (dateRevised['Year'] + "-" +
                                     dateRevised['Month'] + "-" +
                                     dateRevised['Day'])
        if article.get('Pagination'):
            entry['page'] = article['Pagination'].get('MedlinePgn')
        if article.get('PublicationTypeList'):
            types = []
            for type in article['PublicationTypeList']:
                types.append(str(type))
            entry['pubtypes'] = types
        if article.get('AuthorList'):
            authors = []
            orcid4author = {}
            for author in article['AuthorList']:
                if author.get('LastName') is None or \
                        author.get('Initials') is None:
                    continue
                authorName = author['LastName'] + " " + author['Initials']
                authors.append(authorName)
                ident = author.get('Identifier')
                if len(ident) == 0:
                    continue
                if ident[0].attributes.get('Source') is None:
                    continue
                if ident[0].attributes.get('Source') == 'ORCID':
                    orcid = str(ident[0]).replace(
                        "http://orcid.org/", "").replace(
                        "https://orcid.org/", "")
                    orcid4author[authorName] = orcid
            entry['authors'] = authors
            entry['orcid'] = orcid4author
        if paper['PubmedData'].get('PublicationStatus'):
            entry['publication_status'] = paper['PubmedData'].get(
                'PublicationStatus')
        if paper['PubmedData'].get('ArticleIdList'):
            for item in paper['PubmedData'].get('ArticleIdList'):
                if item.attributes.get('IdType') is not None and \
                        item.attributes.get('IdType') == 'pmc':
                    entry['pmc'] = str(item)
                if item.attributes.get('IdType') is not None and \
                        item.attributes.get('IdType') == 'doi':
                    entry['doi'] = str(item)
        # print entry, "\n"
        data.append(entry)
    return data