Example #1
def search_articles(searchterm,email):
    '''search_articles
    Return list of articles based on search term
    Parameters
    ==========
    searchterm: str
       a search term to search for
    email: str
       e-mail address to identify the requester to NCBI Entrez
    Returns
    =======
    articles: list
        parsed PubMed records that match the search term
    '''
    print("Getting pubmed articles for search term %s" % searchterm)

    Entrez.email = email
    handle = Entrez.esearch(db='pubmed',term=searchterm,retmax=5000)
    record = Entrez.read(handle)

    # If there are papers
    if "IdList" in record and record["Count"] != "0":
        # Fetch the papers
        ids = record['IdList']
        handle = Entrez.efetch(db='pubmed', id=ids, retmode='xml', retmax=5000)
        return Entrez.read(handle)

    # If there are no papers
    print("No papers found for search term %s!" % searchterm)
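A minimal usage sketch (hedged: the import and the placeholder e-mail are assumptions, and the 'PubmedArticle' key reflects current Biopython XML parsing):

from Bio import Entrez  # search_articles relies on a module-level Entrez import

papers = search_articles("orchid", "you@example.com")  # placeholder address
if papers:
    print("Fetched %d PubMed records" % len(papers["PubmedArticle"]))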
Example #2
def store_abstracts_for_query(query,query_tag,maxN=None,preview_only=False):
	# if query_tag=="":
	# 	simpleQuery=" ".join(map(lambda x:x.name,queryTerms))
	# else:
	# 	simpleQuery=query_tag
	# query=pg.build_query(queryTerms)
	print("will search", query)
	Entrez.email = "*****@*****.**"
	search_results = Entrez.read(Entrez.esearch(db="pubmed",
												term=query,
												reldate=10*365, datetype="pdat",
												usehistory="y"))
	count = int(search_results["Count"])
	print("Found %i results" % count)
	if maxN is not None and maxN < count:
		count = maxN
		print("Only keeping first", count, "abstracts")
	if preview_only:
		return
	sys.stdout.flush()
	batch_size = 50
	for start in range(0,count,batch_size):
			end = min(count, start+batch_size)
			print("Going to download record %i to %i" % (start+1, end))
			sys.stdout.flush()
			fetch_handle = Entrez.efetch(db="pubmed",
										 rettype="medline", retmode="text",
										 retstart=start, retmax=batch_size,
										 webenv=search_results["WebEnv"],
										 query_key=search_results["QueryKey"])
			records=Medline.parse(fetch_handle)
			for r in records:
				pubmed_to_pg.store_medline_entry(r,query_tag)
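A hedged call sketch; preview_only=True sidesteps the pubmed_to_pg database dependency (the query and tag are illustrative):

store_abstracts_for_query('"Opuntia"[ORGN]', "opuntia", maxN=100, preview_only=True)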
Example #3
def uniprotIDlist_Lineages(IdList):
	lastset = []
	taxlist = []
	names = []
	Lineages =[]
	genes = {}
	try:
		genes  = uni.map(list(IdList), f='ACC', t='P_ENTREZGENEID')
		# handle = Entrez.efetch(db="nuccore", id=list(lastset), retmode = 'xml')
		if len(genes)>0:
			#values = genes.values()
			#handle = Entrez.elink(dbfrom="nucleotide", id=values, linkname="gene_taxonomy")
			#taxa = Entrez.read(handle, validate = False)
			#for i,entry in enumerate(taxa):
			#	taxlist.append(entry['LinkSetDb'][0]['Link'][0]['Id'])
			for Uniprot in genes:	
				genes[Uniprot] = list(genes[Uniprot])
				#if entry['IdList'][0] in genes[Uniprot]:
				#genes[Uniprot].append(entry['LinkSetDb'][0]['Link'][0]['Id'])
	except: 
		pass
	
	codes = getTax(IdList)
	print(len(codes))
	for code in codes.keys():
		if code not in genes:
			genes[code] = [ 'noGeneID' , codes[code] ]
		else :
			genes[code].append(codes[code])	
		taxlist.append(codes[code])
	print(len(taxlist))

	taxonomydata = Entrez.read(Entrez.efetch(db="taxonomy", id=taxlist))
	for i,record in enumerate(taxonomydata):
		name = record['ScientificName']
		Lineage = record['Lineage']
		print(record)
		for gene in genes:
			if len(genes[gene])>1 and genes[gene][1] == taxlist[i]:
				genes[gene].append(name)
				genes[gene].append(Lineage)
		names.append(name)
		Lineages.append(Lineage)

	#annotate the availability of genome and the reference genomes available
	handle = Entrez.elink(dbfrom="taxonomy", id=taxlist, linkname="taxonomy_genome")
	genomes = Entrez.read(handle, validate = False)
	for i,entry in enumerate(genomes):
		genomefound = False
		for protcode in genes:
			if entry['IdList'][0] in genes[protcode]:
				try:
					genes[protcode].append(entry['LinkSetDb'][0]['Link'][0]['Id'])
					genomefound = True
					break
				except:
					print(entry)
		if not genomefound:
			genes[protcode].append('noGenome')
	return names, Lineages, genes
Example #4
def taxonIDs_to_strainIDs(taxonIDs, hasgenome=True, donotexpand=False, excludewgs=False):
    '''Find all individual strains in a taxonID'''
    strainIDs = []
    for taxonID in taxonIDs:
        #see http://www.ncbi.nlm.nih.gov/books/NBK21100/#A286 for some details about the terms and limits
        query = 'txid%s[subtree] AND "taxonomy genome"[filter] AND terminal[prop]' % taxonID
        if hasgenome:
            query = query + ' AND (("taxonomy assembly"[Filter]) OR ("taxonomy genome"[Filter]) OR ("taxonomy genome2"[Filter]))'
        if excludewgs:
            query = query + ' AND NOT wgs[filter]'
        sys.stderr.write("Querying NCBI with query: %s\n" %(query))
        # The default retmax is 20 which is way too few for some queries...
        NCBIdata = Entrez.read(Entrez.esearch(db="taxonomy", term=query, retmax=10000))
        #there can be multiple records returned
        taxon_strainIDs = NCBIdata['IdList']
        if len(taxon_strainIDs)<1 :
            sys.stderr.write("WARNING: No sequence information of the type you requested for %s\n" % taxonID)
        if donotexpand:
            assert len(taxon_strainIDs)==1, "WARNING: more than one strain ID returned from query of NCBI %s" % taxonID
        else:
            if len(taxon_strainIDs)>1:
                #only need to let user know if there is more than one, otherwise the numbers are the same
                sys.stderr.write("Note: multiple strains associated with taxonID %s: %s\n" % (taxonID, taxon_strainIDs))
        strainIDs = strainIDs + taxon_strainIDs
    return strainIDs
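Illustrative usage, assuming Bio.Entrez is imported as in the surrounding examples (txid 562 is E. coli; the e-mail is a placeholder):

Entrez.email = "you@example.com"  # placeholder
strain_ids = taxonIDs_to_strainIDs(["562"])  # expand E. coli into strain-level taxonomy IDs
print("%d strain IDs found" % len(strain_ids))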
Example #5
def add_new_taxonomy(server, new_taxons, parent_ncbi_tax_id):
    new_taxons = map(lambda x: x.split(":"), new_taxons)
    parent_taxid = None
    parent_left_value = None
    parent_right_value = None
    if parent_ncbi_tax_id:
        try:
            parent_taxid, parent_left_value, parent_right_value = \
                server.adaptor.execute_one(
                        'select taxon_id, left_value, right_value '
                        'from taxon where ncbi_taxon_id = %s',
                        (parent_ncbi_tax_id,))
        except AssertionError:
            # the given ncbi taxonomy id isn't currently in the database
            # download it using the Entrez API
            db_loader = Loader.DatabaseLoader(server.adaptor, list(server.values())[0], True)
            handle = Entrez.efetch( db="taxonomy", id=str(parent_ncbi_tax_id), retmode="XML")
            taxon_record = Entrez.read(handle)
            taxon_record[0]['LineageEx'].append(
                    {'Rank': taxon_record[0]['Rank'],
                        'ScientificName': taxon_record[0]['ScientificName'],
                        'TaxId': taxon_record[0]['TaxId']
                        })
            parent_taxid, parent_left_value, parent_right_value = db_loader._get_taxon_id_from_ncbi_lineage(
                                        taxon_record[0]["LineageEx"])

    for tax_name, tax_rank in new_taxons:
        parent_taxid, parent_left_value, parent_right_value = insert_taxon_rank(server,
                parent_taxid,
                parent_left_value,
                parent_right_value,
                tax_name,
                tax_rank)

    return parent_taxid
Example #6
def history_review():
    """
    Use the Entrez history feature to search for and download review articles
    """
    Entrez.email = "*****@*****.**"
    search_results = Entrez.read(Entrez.esearch(db="pubmed",
                                                term="Opuntia[ORGN]",
                                                reldate=365, datetype="pdat",
                                                usehistory="y"))
    count = int(search_results["Count"])
    print("Found %i results" % count)

    batch_size = 10
    filepath = os.path.join(DATAROOT, "recent_orchid_papers.txt")
    out_handle = open(filepath, "w")
    for start in range(0, count, batch_size):
        end = min(count, start+batch_size)
        print("Going to download record %i to %i" % (start+1, end))
        fetch_handle = Entrez.efetch(db="pubmed",
                                     rettype="medline", retmode="text",
                                     retstart=start, retmax=batch_size,
                                     webenv=search_results["WebEnv"],
                                     query_key=search_results["QueryKey"])
        data = fetch_handle.read()
        fetch_handle.close()
        out_handle.write(data)
    out_handle.close()
Example #7
def get_publications(parameters,hit_ids):
	'''fetch publication details for all PubMed IDs'''

	id_errors = []
	out_file = open(parameters["out_file"],"w")
	out_file.write("PMID\tAuthor\tTitle\tJournal\n")
	counter = 0
	print("4. Getting publication-details for all hits")
	for species,ids in hit_ids.items():
		counter += 1
		progress = int(50.0*counter/len(hit_ids))*"#"
		progress += (50-len(progress))* " "		
		sys.stderr.write("\r"+"0%"+progress+"100%")
		for single_id in ids:
			error_counter = 0
			while error_counter < 3:
				try:
					handle = Entrez.efetch(db=parameters["database"],
								id=single_id,
								retmode="xml")
					result = Entrez.read(handle)
					out_line = single_id+"\t"
					out_line += result[0]["MedlineCitation"]["Article"]["AuthorList"][0]["LastName"]+"\t"
					out_line += result[0]["MedlineCitation"]["Article"]["ArticleTitle"]+"\t"
					out_line += result[0]["MedlineCitation"]["Article"]["Journal"]["Title"]+"\n"
					out_file.write(out_line.encode('UTF-8'))
					error_counter = 5  # sentinel: fetch succeeded, leave the retry loop
					handle.close()
				except:
					error_counter += 1
			if error_counter == 3:
				print("\nThere was a problem fetching the publication " + str(single_id))
				id_errors.append(single_id)
	print("\n5. Got all publications for all hits")
	out_file.close()
	return id_errors
Example #8
def pubmed():
    # Get the count of papers about orchid only in database pubmed
    Entrez.email = "*****@*****.**"     # Always tell NCBI who you are
    handle = Entrez.egquery(term="orchid")
    record = Entrez.read(handle)
    for row in record["eGQueryResult"]:
        if row["DbName"] == "pubmed":
            print("The count of papers about orchid in database pubmed:", row["Count"])

    # Get the list of ids of above
    handle = Entrez.esearch(db="pubmed", term="orchid", retmax=100)
    record = Entrez.read(handle)
    idlist = record["IdList"]
    print("The id list of papers about orchid in database pubmed:", idlist)
    print()

    # Filter the fetched records for papers authored by "Liu ZJ"
    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline",
                           retmode="text")
    records = Medline.parse(handle)
    search_author = "Liu ZJ"
    for record in records:
        if "AU" not in record:
            continue
        if search_author in record["AU"]:
            print("Author %s found." % search_author)
            print("title:", record.get("TI", "?"))
            print("authors:", record.get("AU", "?"))
            print("source:", record.get("SO", "?"))
            print()
Example #9
def history_seq():
    """
    Use the Entrez history feature to search for and download sequences
    """
    Entrez.email = "*****@*****.**"
    search_handle = Entrez.esearch(db="nucleotide", term="Opuntia[orgn] and rpl16",
                                   usehistory="y")
    search_results = Entrez.read(search_handle)
    search_handle.close()

    gi_list = search_results["IdList"]
    count = int(search_results["Count"])
    assert count == len(gi_list)

    # Two extra pieces of information come back: the WebEnv session cookie and the QueryKey
    webenv = search_results["WebEnv"]
    query_key = search_results["QueryKey"]

    batch_size = 3
    filepath = os.path.join(DATAROOT, "orchid_rpl16.fasta")
    out_handle = open(filepath, "w")
    for start in range(0, count, batch_size):
        end = min(count, start+batch_size)
        print("Going to download record %i to %i" % (start+1, end))
        fetch_handle = Entrez.efetch(db="nucleotide", rettype="fasta", retmode="text",
                                     retstart=start, retmax=batch_size,
                                     webenv=webenv, query_key=query_key)
        data = fetch_handle.read()
        fetch_handle.close()
        out_handle.write(data)
    out_handle.close()
Example #10
def get_geo_data(s_terms, email, maxresults=1000):
	"""Get geo data for query term provided"""
	Entrez.email = email
	Entrez.tool = 'GetGeoMetaDataPythonScript'
	hits = []
	s_terms = s_terms + ' NOT GPL'
	handle = Entrez.esearch(db='gds', term=s_terms, retmax=maxresults, usehistory="y")
	results = Entrez.read(handle)
	newhandle = Entrez.esummary(db='gds', retmax = maxresults, webenv=results['WebEnv'], query_key=results['QueryKey'])
	summary = Entrez.read(newhandle)
	for i in range(len(summary)):
		samples = []
		geo_data = {'id':summary[i]['Id'],
					'n_samples':summary[i]['n_samples'],
					'pubdate':summary[i]['PDAT'],
					'platform':summary[i]['PlatformTitle'],
					'suppfile':summary[i]['suppFile'],
					'taxon':summary[i]['taxon'],
					'entry_type':summary[i]['entryType'],
					'gpl':summary[i]['GPL'],
					'gse':summary[i]['GSE'],
					'pubmed_ids':summary[i]['PubMedIds'],
					'title':summary[i]['title'],
					'gds_type':summary[i]['gdsType'],
					'summary':summary[i]['summary'],
					'soft_file':get_soft_url(summary[i]['GSE'], summary[i]['entryType'])
					}
		hits.append(geo_data)
	return hits
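A usage sketch (assumes the get_soft_url helper referenced above is available; the search term is illustrative):

hits = get_geo_data("breast cancer", "you@example.com", maxresults=10)  # placeholder e-mail
for hit in hits:
    print(hit["gse"], hit["title"])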
Example #11
def get_genbank_identifiers(tax_identifier):
    genbank_entries = Entrez.esearch(db="nucleotide",
                                     term="txid"+tax_identifier,
                                     retmode="xml")
    esearch_result = Entrez.read(genbank_entries)
    genbank_identifiers = esearch_result["IdList"]
    return genbank_identifiers
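Illustrative call (9606 is the human taxonomy ID; without retmax, esearch returns at most its default of 20 IDs):

ids = get_genbank_identifiers("9606")
print(len(ids), ids[:5])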
Example #12
    def get_articles_in_range_by_author(self, author,
                                        range_start, range_end='',
                                        terms=''):
        """
        This function returns all of the article IDs
        of the articles a given 'author' published
        between 'range_start' and 'range_end'
        Optionally, additional terms may be provided
        """
        # if there's no range_end set, we'll set it to the current year/month
        if range_end == '':
            range_end = "%s/%s" % (str(datetime.today().year),
                                   str(datetime.today().month))

        E.email = self.email
        term = "%s [AU] %s:%s[DP] %s" % (author, range_start, range_end, terms)

        handle = E.esearch(db="pubmed", term=term)
        record = E.read(handle)
        handle.close()

        articles = []
        for article_id in record['IdList']:
            if article_id not in articles:
                articles.append(article_id)
        print("got %d articles" % len(articles))
        return articles
Example #13
def get_many_prot_seqrec_by_gis(gi_list):
    """
    Download a dictionary of fasta SeqsRec from NCBI given a list of GIs.
    """

    print("Downloading FASTA SeqRecords by GIs from NCBI")
    num=len(gi_list)
    fasta_seqrec=dict()

    for i in range(int(num/1000) + 1):
        print("Fetching batch %d of %d GIs" % (i, num))

        while True:
            try:
                strn = ",".join(map(str, gi_list[i*1000:(i+1)*1000]))
                request = Entrez.epost(db="protein", id=strn)
                result = Entrez.read(request)
                webEnv = result["WebEnv"]
                queryKey = result["QueryKey"]
                handle = Entrez.efetch(db="protein", rettype='fasta', retmode='text',
                                       webenv=webEnv, query_key=queryKey)
                for r in SeqIO.parse(handle, 'fasta'):
                    fasta_seqrec[r.id.split('|')[1]] = r
            except:
                continue  # retry this batch on any network or parsing error
            if (len(fasta_seqrec) == (i+1)*1000) or (len(fasta_seqrec) == num):
                break
            else:
                print("Mismatch:", num, len(fasta_seqrec))
    print("FASTA Records downloaded:")
    print(len(fasta_seqrec))
    return(fasta_seqrec)
Example #14
def pubsearch(jids):
    Entrez.email = "*****@*****.**"
    # always let Entrez know who is calling

    pubterm = " OR ".join(i + "[JID]" for i in jids)

    IDhandle = Entrez.esearch(
        db="pubmed", term="peptide AND (" + pubterm + ")", mindate="2011", maxdate="2014", retmax=2500
    )
    # for documentation on esearch, see
    # http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
    # max number for retmax is 100k. Use retstart to get more than this.
    # Date ranges limit a search result by the date specified by datetype. The two
    # parameters mindate and maxdate must be used together to specify an arbitrary
    # date range. The general date format is YYYY/MM/DD; the variants YYYY and
    # YYYY/MM are also allowed.

    record = Entrez.read(IDhandle)
    # record is returned as a dictionary. Lists search terms, all ID numbers etc.

    idlist = record["IdList"]
    # return a list of ID numbers from the record dictionary

    recordHandle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
    # search pubmed for records with idlist as input

    records = Medline.parse(recordHandle)
    # create dictionary from recordHandle

    return records
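The comments above mention the 100k retmax cap and retstart paging; a paging sketch under those assumptions (the term, batch size, and 2000-ID cutoff are illustrative):

from Bio import Entrez

Entrez.email = "you@example.com"  # placeholder
all_ids, batch = [], 500
while True:
    handle = Entrez.esearch(db="pubmed", term="peptide", retstart=len(all_ids), retmax=batch)
    chunk = Entrez.read(handle)["IdList"]
    handle.close()
    all_ids.extend(chunk)
    if not chunk or len(all_ids) >= 2000:  # stop early for the sketch
        break
print("collected %d IDs" % len(all_ids))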
Example #15
def get_abstract(query,file_name,previewOnly=False):
	Entrez.email = "*****@*****.**"
	search_results = Entrez.read(Entrez.esearch(db="pubmed",
												term=query,
#												reldate=1996, datetype="pdat", #reldate is in number of days!
												reldate=20*365, datetype="pdat", #reldate is in number of days!
												usehistory="y"))
	count = int(search_results["Count"])
	print("Found %i results" % count)
	sys.stdout.flush()
	if previewOnly:
		return
	batch_size = 10
	out_handle = open(file_name+".txt", "w")
	for start in range(0,count,batch_size):
			end = min(count, start+batch_size)
			print("Going to download record %i to %i" % (start+1, end))
			sys.stdout.flush()
			fetch_handle = Entrez.efetch(db="pubmed",
										 rettype="medline", retmode="text",
										 retstart=start, retmax=batch_size,
										 webenv=search_results["WebEnv"],
										 query_key=search_results["QueryKey"])
			data = fetch_handle.read()
			fetch_handle.close()
			out_handle.write(data)
	out_handle.close()
Example #16
 def test_efetch_biosystems_xml(self):
     """Test Entrez parser with XML from biosystems"""
     handle = Entrez.efetch(id="1134002", db="biosystems", retmode="xml")
     records = list(Entrez.parse(handle))
     handle.close()
     self.assertEqual(len(records), 1)
     self.assertEqual(records[0]['System_sysid']['Sys-id']['Sys-id_bsid'], '1134002')
Example #17
    def test_webenv_search(self):
        """Test Entrez.search from link webenv history"""
        handle = Entrez.elink(db='nucleotide', dbfrom='protein',
                              id='22347800,48526535', webenv=None, query_key=None,
                              cmd='neighbor_history')
        self.assertTrue(handle.url.startswith(URL_HEAD + "elink.fcgi?"), handle.url)
        self.assertIn(URL_TOOL, handle.url)
        self.assertIn(URL_EMAIL, handle.url)
        self.assertIn(URL_API_KEY, handle.url)
        self.assertIn("id=22347800%2C48526535", handle.url)
        recs = Entrez.read(handle)
        handle.close()
        record = recs.pop()

        webenv = record['WebEnv']
        query_key = record['LinkSetDbHistory'][0]['QueryKey']
        handle = Entrez.esearch(db='nucleotide', term=None,
                                retstart=0, retmax=10,
                                webenv=webenv, query_key=query_key,
                                usehistory='y')
        self.assertTrue(handle.url.startswith(URL_HEAD + "esearch.fcgi?"), handle.url)
        self.assertIn(URL_TOOL, handle.url)
        self.assertIn(URL_EMAIL, handle.url)
        self.assertIn(URL_API_KEY, handle.url)
        search_record = Entrez.read(handle)
        handle.close()
        self.assertEqual(2, len(search_record['IdList']))
Example #18
File: genbank.py Project: rhr/ivy
def fetch_DNA_seqs(terms, maxn=10000, batchsize=1000):
    """
    terms: sequence of search terms, quoted appropriately, with Entrez
           specifiers, e.g. ['"Mus musculus"[organism]']
    maxn: maximum number of sequences to return
    returns list of SeqRecord objects
    """
    global email
    assert email, "set email!"
    Entrez.email = email
    h = Entrez.esearch(db="nucleotide", term=" OR ".join(terms), usehistory="y")
    d = Entrez.read(h)
    env = d['WebEnv']; key = d['QueryKey']
    N = int(d['Count'])
    if maxn: N = min(N, maxn)
    logging.info('fetching %s sequences', N)
    retstart = 0
    seqs = []
    n = 0
    while n < N:
        h = Entrez.efetch(
            db="nucleotide", rettype='gb', webenv=env, query_key=key,
            retstart=retstart, retmax=batchsize
            )
        v = list(SeqIO.parse(h, "genbank"))
        n += len(v)
        logging.info('...fetched %s', n)
        seqs.extend(v)
        retstart += batchsize
    logging.info('...done')
    return seqs
Example #19
File: genbank.py Project: rhr/ivy
def gi2webenv(gilist):
    h = Entrez.esearch(
        db="nucleotide", term=" OR ".join(gilist), usehistory="y",
        retmax=len(gilist)
        )
    d = Entrez.read(h)
    return d["WebEnv"], d["QueryKey"]
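A sketch pairing the returned WebEnv/QueryKey with efetch (the GI numbers are borrowed from the epost example elsewhere on this page and are purely illustrative):

webenv, query_key = gi2webenv(["186972394", "160418"])
h = Entrez.efetch(db="nucleotide", rettype="gb", retmode="text",
                  webenv=webenv, query_key=query_key)
print(h.read()[:200])
h.close()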
Example #20
File: genbank.py Project: rhr/ivy
def fetch_aclist(aclist, batchsize=1000):
    global email
    assert email, "set email!"
    Entrez.email = email
    results = {}
    n = 0
    for v in batch(aclist, batchsize):
        v = list(v)
        h = Entrez.esearch(
            db="nucleotide",
            term=" OR ".join([ "%s[ACCN]" % x for x in v ]),
            usehistory="y"
            )
        d = Entrez.read(h)
        h.close()
        h = Entrez.efetch(db="nucleotide", rettype="gb", retmax=len(v),
                          webenv=d["WebEnv"], query_key=d["QueryKey"])
        seqs = SeqIO.parse(h, "genbank")
        for s in seqs:
            try:
                ac = s.annotations["accessions"][0]
                if ac in aclist:
                    results[ac] = s
            except (KeyError, IndexError):
                pass  # skip records without accession annotations
        h.close()
        n += len(v)
        logging.info('fetched %s sequences', n)
    return results
Example #21
File: genbank.py Project: rhr/ivy
def fetchtax(taxid):
    global email
    assert email, "set email!"
    Entrez.email = email
    n = 1
    if not isinstance(taxid, int):
        # string, possibly with multiple values?
        try:
            taxid = taxid.strip()
            n = taxid.count(',') + 1
        except AttributeError:
            # iterable of values?
            try:
                n = len(taxid)
                taxid = ','.join(map(str, taxid))
            except TypeError:
                pass
    else:
        taxid = str(taxid)
    h = Entrez.efetch(db='taxonomy', id=taxid, retmode='xml', retmax=n)
    if n == 1:
        r = Entrez.read(h)[0]
    else:
        # a list of taxonomy results in same order of taxids
        r = Entrez.read(h)
    return r
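Illustrative usage, assuming this runs in the same module as fetchtax (9606 is the human taxonomy ID; the module-level email variable checked by the assert must be set first):

email = "you@example.com"  # placeholder
Entrez.email = email
rec = fetchtax(9606)
print(rec['ScientificName'], '->', rec['Lineage'])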
Example #22
def main(Substance, Organism, Gene):
    zoekterm1 = "Cocaine"
    zoekterm2 = "Elegans"
    MAX_COUNT = 50
    dic = {}
    titels = []
    TERM = ''
    TERMS = []
    count = 1
    if zoekterm2 == "":
        TERM = zoekterm1
    if zoekterm1 == "":
        print("enter a search term")
        sys.exit()
    elif zoekterm2 != "":
        TERM = zoekterm1+" and "+zoekterm2
    TERMS.append(TERM)
    print(TERM)
    handle = Entrez.esearch(db="pubmed", term= TERM, retmax=MAX_COUNT)
    record = Entrez.read(handle)
    idlist = record["IdList"]
    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline",
                           retmode="text")
    records = Medline.parse(handle)
    records = list(records)
    for record in records:
        titel = record.get("PMID","?")
        titels.append(titel)
        pubSet = set(titels)        
    dic[TERM] = pubSet
    print(dic)
    return "Jay"
Example #23
 def __enter__(self):
     
     try:
         
         self.file = tempfile.NamedTemporaryFile(mode='w+')  # text mode: efetch yields str lines in Python 3
         
         for i in range(int(math.ceil(float(len(self.nuc_seq_ids)) / 5000))):
             
             start = i * 5000
             remaining = len(self.nuc_seq_ids) - start
             end = start + remaining if remaining < 5000 else start + 5000
             
             fetch_handle = Entrez.efetch(db="nucleotide", id=self.nuc_seq_ids[start:end], rettype="fasta", retmode="text")
             
             for line in fetch_handle:
                 self.file.write(line)
             
             fetch_handle.close()
         
         self.file.seek(0)
         
     except IOError:
         raise TaskError("Could not download sequence from NCBI")
     
     self.filepath = self.file.name
     
     return self
Example #24
def searchgb(stermList, mindate):
	usedID = []
	#try:
	out = open('seq_searchdb.fas','a')
	print('searching genbank')
	for sterm in stermList:
		searchTerm = '(' + sterm + '[Organism] OR '  + sterm +  '[All Fields]) AND ("' + str(mindate) + ' "[MDAT] : "' + str(date.today())+ ' "[MDAT])'
		handle = Entrez.esearch(db="nucleotide", retmax=10000, term=searchTerm)  # use the date-limited query built above
	
		record = Entrez.read(handle)
		print(record["Count"])
		for ID in  record["IdList"]:
			if ID not in usedID:
				usedID.append(ID)
			
				handle = Entrez.efetch(db="nucleotide", id=ID, rettype="fasta" )
				record = SeqIO.read(handle,"fasta")
				if re.search('18S', record.description) or re.search('small subunit', record.description):
					if re.search('5.8S', record.description):
						print("5.8S: %s" % record.description)
					else:
						out.write('>' + record.id.split('|')[3] + '\n')
						out.write(str(record.seq) + '\n')

	
	out.close()
Example #25
def main():
    args = parser_get_args()
    with open(args.filepath, 'r') as species_names:
        species_list = species_names.read().splitlines()
        for species in species_list:
            time.sleep(1)
            Entrez.email = "*****@*****.**"
            search_query = Entrez.esearch(db="taxonomy", term=species, retmode="xml")
            result = Entrez.read(search_query)
            species_id = result['IdList']
            species_id = "txid{}".format(species_id[0])
            taxonomy_data = Entrez.efetch(db="taxonomy", id=species_id, retmode="xml")
            data = Entrez.read(taxonomy_data)
            lineageex = data[0]["LineageEx"]
            species_dict = {}
            species_dict['Species'] = species
            superclass_dict = get_superclass(lineageex)
            class_dict = get_class(lineageex)
            subclass_dict = get_subclass(lineageex)
            infraclass_dict = get_infraclass(lineageex)
            superorder_dict = get_superorder(lineageex)
            order_dict = get_order(lineageex)
            superfamily_dict = get_superfamily(lineageex)
            family_dict = get_family(lineageex)
            genus_dict = get_genus(lineageex)
            with open(args.outputfile, 'a') as output_file:
                output_file.write(str((species_dict, superclass_dict, class_dict, subclass_dict, infraclass_dict, superorder_dict, order_dict, superfamily_dict, family_dict, genus_dict)))
                output_file.write('\n\n')
Example #26
def pmhits (TERM):
    # Returns the number of hits returned by searching pubmed for *TERM*
    Entrez.email = '*****@*****.**'
    Entrez.tool = 'pm_impacts'
    h = Entrez.esearch(db='pubmed', retmax=1000000, term=TERM)
    result = Entrez.read(h)
    return result['Count']
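Illustrative usage (terms borrowed from other examples on this page):

for term in ("orchid", "Opuntia"):
    print(term, pmhits(term))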
Example #27
def gen_paper(identifier,num_to_gen,author):
    '''
    Creates objects from a paper's metadata
    '''
    
    
    #import modules
    from Bio import Entrez
    Entrez.email = "*****@*****.**"
    
    from classes.paper import paper

    #Search PubMed for a given author, return a maximum of 20 results
    handle = Entrez.esearch(db="pubmed", retmax=num_to_gen, term=author+" [AU]")
    results = Entrez.read(handle)
    IDs = results['IdList']
    handle.close()
    handle = Entrez.esummary(db="pubmed", id=IDs[identifier])
    record = Entrez.read(handle)
    Title = record[0]['Title']
    PubDate = record[0]['PubDate']
    AuthorList = [str(z) for z in record[0]['AuthorList']]
    LastAuthor = AuthorList[-1]
    handle.close()
    reference = paper(Title,LastAuthor,PubDate,AuthorList)
    return reference
Example #28
def _search(query):
    handle = Entrez.esearch(db="pubmed", sort="relevance", retmax="20", retmode="xml", term=query, usehistory="y")
    results = Entrez.read(handle)
    webenv = results["WebEnv"]  # ID for session
    query_key = results["QueryKey"]  # ID for query within session

    return results
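Because _search requests usehistory="y", the WebEnv/QueryKey noted above can drive a follow-up efetch; a sketch that is not part of the original function (the query is illustrative):

results = _search("orchid")
handle = Entrez.efetch(db="pubmed", rettype="medline", retmode="text",
                       retstart=0, retmax=20,
                       webenv=results["WebEnv"], query_key=results["QueryKey"])
print(handle.read()[:200])
handle.close()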
Example #29
 def test_epost(self):
     handle = Entrez.epost("nuccore", id="186972394,160418")
     self.assertEqual(URL_HEAD + "epost.fcgi", handle.url)
     handle.close()
     handle = Entrez.epost("nuccore", id=["160418", "160351"])
     self.assertEqual(URL_HEAD + "epost.fcgi", handle.url)
     handle.close()
Example #30
File: genbank.py Project: rhr/ivy
def acsum(aclist, batchsize=100):
    """
    fetch esummary info for list of accession numbers -- useful for
    getting gi and taxids
    """
    global email
    assert email, "set email!"
    Entrez.email = email
    results = {}
    for v in batch(aclist, batchsize):
        v = list(v)
        h = Entrez.esearch(
            db="nucleotide", retmax=len(v),
            term=" OR ".join([ "%s[ACCN]" % x for x in v ]),
            usehistory="y"
            )
        d = Entrez.read(h)
        h.close()
        # gis, but not in order of aclist
        gis = d['IdList']
        d = Entrez.read(Entrez.esummary(db='nucleotide', id=','.join(gis)),
                        validate=False)
        for x in d:
            ac = x['Caption']
            if ac in aclist:
                results[ac] = x
    return results
Example #31
from Bio import Entrez
from Bio import SeqIO

Entrez.email = "*****@*****.**"

with Entrez.efetch(db="nucleotide",
                   rettype="fasta",
                   retmode="text",
                   id="13788565") as handle:
    seq_record = SeqIO.read(handle, "fasta")

print(seq_record.seq[0:40])

records = list(SeqIO.parse("sample.fasta", "fasta"))
first_record = records[0]
first_sequence = str(first_record.seq)
print(first_sequence)
Example #32
import collections
import csv
import tempfile
import time
# (the argparse setup that defines `args` and the Bio imports are elided in this fragment)

hits = collections.defaultdict(list)
seq_files = {}
with open(args.infile, 'rt') as ih:
    rdr = csv.reader(ih, delimiter='\t')
    for i, row in enumerate(rdr):
        gid = row[0].split('|')[-2]
        pid = float(row[1])
        s = int(row[2])
        e = int(row[3])
        n = row[4]

        ## Check if we have already encountered this id, if not,
        ## download the sequence
        if gid not in hits:
            seq = Entrez.efetch(db='nucleotide',
                                id=gid,
                                rettype='gb',
                                retmode='text').read()
            tmpfile = tempfile.mkstemp(dir='.')
            with open(tmpfile[0], 'wt') as th:
                th.write(seq)
            seq_files[gid] = tmpfile[1]
            time.sleep(60)

        ## Store information
        hits[gid].append((s, e, pid, n, gid))

## For each hit, find the closest gene
with open(args.output, 'wt') as oh:
    for gid, hit_info in hits.items():
        features = [
            f for f in SeqIO.read(seq_files[gid], 'genbank').features
Example #33
"""
Created on Tue Sep 06 18:45:27 2016


USAGE

python getPUBMEDabstractsFROMxml.py <INPUT.xml>

@author: Jahir Gutierrez
"""

from Bio import Entrez
import sys

filename = str(sys.argv[1])
handle = open(filename)
records = Entrez.parse(handle)
out_filename = filename.replace('.xml', '.tsv')
f = open(out_filename, 'w')
for record in records:
    try:
        pmid = record['MedlineCitation']['PMID'].encode('ascii', 'ignore')
        title = record['MedlineCitation']['Article']['ArticleTitle'].encode(
            'ascii', 'ignore')
        abstract_list = record['MedlineCitation']['Article']['Abstract'][
            'AbstractText']
        abstract = ''
    except:
        continue
    for element in abstract_list:
        abstract = abstract + element.encode('ascii', 'ignore') + ' '
    abstract = abstract[0:len(abstract) - 1]
    f.write(pmid + '\t' + title + '\t' + abstract + '\n')  # assumed TSV row, matching the .tsv output filename
f.close()
Example #34
f = open(sys.argv[1], "r")

list_records = []
Entrez.email = "*****@*****.**"
out = open(sys.argv[2], "w")
count = 0
for l in f:
    count += 1
    if count % 50 == 0:
        print(str(count) + " sequences processed...")
    accession = l.rstrip()
    try:
        handle_search = Entrez.esearch(db="nucleotide", term=accession)
        record_search = Entrez.read(handle_search)
        records_fasta = [
            Entrez.efetch(id=i,
                          db="nucleotide",
                          rettype="fasta",
                          retmode="text") for i in record_search["IdList"]
        ]
        records_seqio = [
            SeqIO.read(record, "fasta") for record in records_fasta
        ]
    except:
        continue
    for rec in records_seqio:
        if accession in rec.id:
            SeqIO.write(rec, out, "fasta")
Example #35
        'IMG_ID', 'N_Reads', 'N_Contigs', 'N50', 'Habitat', 'Location'
    ])

    samplescsv = csv.reader(infh, delimiter="\t")
    headerrow = next(samplescsv)
    for row in samplescsv:
        outrow = row

        BIOPROJECTID = row[3]
        print(row)
        SRARUN = []
        NREADS = 0
        NBASES = 0
        if BIOPROJECTID:
            SRP = row[4]
            handle = Entrez.esearch(db="sra", retmax=10, term=SRP)
            record = Entrez.read(handle)
            for id in record["IdList"]:
                SRP = id
            handle.close()

            #SRP=row[4]
            handle = Entrez.efetch(db="sra", id=SRP)
            tree = ET.parse(handle)
            root = tree.getroot()
            for runs in root.iter('RUN_SET'):
                for run in runs.iter('RUN'):
                    SRARUN.append(run.attrib['accession'])
                    #indent(run)
                    #print(ET.tostring(run))
                for dbs in runs.iter('Databases'):
Example #36
from Bio import Entrez
my_em = '*****@*****.**'
db = "pubmed"
# Search the Entrez website using esearch from eUtils
# esearch returns a handle (called h_search)
h_search = Entrez.esearch(db=db, email=my_em, term='python and bioinformatics')
# Parse the result with Entrez.read()
record = Entrez.read(h_search)
# Get the list of Ids returned by previous search
res_ids = record["IdList"]
# For each id in the list
for r_id in res_ids:
    # Get summary information for each id
    h_summ = Entrez.esummary(db=db, id=r_id, email=my_em)
    # Parse the result with Entrez.read()
    summ = Entrez.read(h_summ)
    print(summ[0]['Title'])
    print(summ[0]['DOI'])
    print('==============================================')
Example #37
    organismDict[row[0]] = str(row[1]) + ' ' + str(row[2]) + ' ' + str(row[0])
# pick out the unique genomes
accessions = df['NCBI accession'].tolist()
acc_set = set(accessions)  # set of unique genome accession identifiers
# Query NCBI and write the results to files on disk
for acc in acc_set:
    directory = DIR + '\\output\\' + organismDict[acc]
    if not os.path.exists(
            directory
    ):  # this check prevents overwriting folders that already exist
        gotIt = False
        print("Start processing " + organismDict[acc])
        while (gotIt == False):
            try:
                gb_acc = Entrez.efetch(db='nuccore',
                                       id=acc,
                                       rettype='gb',
                                       retmode='text')
                gotIt = True
            except (urllib.error.HTTPError, urllib.error.URLError):
                time.sleep(3)
        rec = SeqIO.read(gb_acc, 'genbank')
        os.makedirs(
            directory
        )  # no extra checks needed: if we got here, the directory does not exist
        SeqIO.write(rec, directory + '\\' + organismDict[acc] + '.gbk', 'gb')
        print(organismDict[acc])
        time.sleep(3)
    else:
        print(acc)  # printed so it is visible that this identifier was passed over

#print(dfAntismashDB['NCBI accession', 'From', 'To'])
Example #38
def get_raw_summary(id, db="assembly"):
    handle = Entrez.esummary(db=db, id=id, report="full")
    record = Entrez.read(handle)
    #return(record['DocumentSummarySet']['DocumentSummary'][0]['AssemblyName']) #This will return the Assembly name
    return (record)
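Illustrative usage; the UID is hypothetical, and the key path mirrors the commented line above:

record = get_raw_summary("2731342")  # hypothetical assembly UID
print(record['DocumentSummarySet']['DocumentSummary'][0]['AssemblyName'])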
Example #39
from os.path import join
from Bio import Entrez, SeqIO
Entrez.email = '*****@*****.**'

import logging
log = logging.getLogger('orthofinder')

handle = Entrez.esearch(db="genome", term="Klebsiella pneumoniae")
record = Entrez.read(handle)
print(record.keys())
if record['IdList']:
    handle = Entrez.efetch(db='genome', id=record['IdList'])
    rec = Entrez.read(handle)
    print(rec)
    print(rec.keys())

#for id in record['IdList']:
#    print(id)
#    handle = Entrez.efetch(db="genome", id=id)
#    print(handle.read())
#
#for i, id in enumerate(record['IdList']):
#    print '   Fetching %s...' % id
#
#    fetch_handle = Entrez.efetch(db='nuccore', id=id, retmode='text', rettype='gb')
#    gb_fpath = join(str(i) + '_test.gb')
#    with open(gb_fpath, 'w') as file:
#        file.write(fetch_handle.read())
#
#    rec = SeqIO.read(gb_fpath, 'gb')
#    org_name = rec.annotations['organism']
Example #40
def get_ids(query, db="assembly"):
    handle = Entrez.esearch(db=db, term=query)
    record = Entrez.read(handle)
    return record["IdList"]
Example #41
                    accessions.append([])
                    records.append([])
        else:
            taxa.append(row[0])
            for i, accession in enumerate(row):
                if i != 0:
                    accessions[i - 1].append(row[i])

Entrez.email = '*****@*****.**'

for i, gene in enumerate(genes):
    print('Downloading accessions for ' + gene + '...')
    for j, accession in enumerate(accessions[i]):
        if accession.strip() != '':
            handle = Entrez.efetch(db='nucleotide',
                                   rettype='fasta',
                                   retmode='text',
                                   id=accession)
            record = SeqIO.read(handle, 'fasta')
            records[i].append(
                SeqRecord(Seq(str(record.seq), IUPAC.ambiguous_dna),
                          id=taxa[j],
                          description=""))
            handle.close()
            sleep(0.02)
    SeqIO.write(records[i], "sequences_unaligned/" + genes[i] + ".fasta",
                "fasta")

for i, gene in enumerate(genes):
    print("Aligning " + gene + " with MAFFT...")
    mafft_cline = MafftCommandline(input="sequences_unaligned/" + genes[i] +
                                   ".fasta")
Example #42
def geneAnalysis(i, genes=None):
    #Start excel FILE
    workbook = xlsxwriter.Workbook('GeneGBInfo/genesGenbank.xlsx')
    worksheet = workbook.add_worksheet("Genebank")
    bold = workbook.add_format({'bold': True})
    worksheet.write("A1", "GeneID", bold)
    worksheet.write("B1", "Location", bold)
    worksheet.write("C1", "Function", bold)
    worksheet.write("D1", "Gene", bold)
    worksheet.write("E1", "Locus tag", bold)
    worksheet.write("F1", "Note", bold)
    worksheet.write("G1", "Product", bold)
    worksheet.write("H1", "Protein_id", bold)
    worksheet.write("I1", "Sequence", bold)
    row = 1
    col = 0
    #End excel file

    #parse GenBank data
    handle = Entrez.efetch(db="nucleotide",
                           rettype="gbwithparts",
                           retmode="text",
                           id=i,
                           retmax=10**9,
                           batchSize=1000)
    for seq_record in SeqIO.parse(handle, "gb"):
        g = []
        protids = []
        for feat in (seq_record.features):
            if feat.type == "CDS" and feat.qualifiers['locus_tag'][0] in genes:
                gene = feat.qualifiers['db_xref'][0].split(
                    ':')  # ex. GeneID:19834053
                g.append(gene[1].strip())
                worksheet.write(row, col, gene[1].strip())
                col += 1
                worksheet.write(row, col, str(feat.location))
                col += 1
                worksheet.write(row, col, feat.qualifiers['function'][0])
                col += 1
                if ('gene' in feat.qualifiers):
                    worksheet.write(row, col, feat.qualifiers['gene'][0])
                    col += 1
                else:
                    col += 1
                worksheet.write(row, col, feat.qualifiers['locus_tag'][0])
                col += 1
                if ('note' in feat.qualifiers):
                    worksheet.write(row, col, feat.qualifiers['note'][0])
                    col += 1
                else:
                    col += 1
                worksheet.write(row, col, feat.qualifiers['product'][0])
                col += 1
                worksheet.write(row, col, feat.qualifiers['protein_id'][0])
                protids.append(feat.qualifiers['protein_id'][0])
                col += 1
                worksheet.write(row, col, str(seq_record.seq))
                col += 1

                row += 1
                col = 0

        u = getUniProtIds(g)
        getUniProtInfo(u)

    f = open('protids.txt', 'w')
    for protid in protids:
        print(protid, file=f)
    f.close()
    workbook.close()
Example #43
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 23 14:52:24 2017

@author: atmiyashita
"""
# %%  <<<<<<<<   NCBI Search and downloading sequence file in a fasta format >>>>>>>
from Bio import Entrez  # import Entrez
Entrez.email = "*****@*****.**"  # always tell NCBI who you are
search_handle = Entrez.esearch(
    db="nucleotide",
    term=
    '"elongation factor"[PROTEINFULLNAME] AND "arthropods"[porgn] AND 0:10000[Sequence Length]',
    usehistory="y",
    idtype="acc",
    RetMax=1000
)  # open search and search target genes in arthropods (limit sequence length to less than 10 kbp)   !<<----Check!
search_record = Entrez.read(search_handle)  #store search record
search_handle.close()  #close search
print(search_record["Count"])
print(search_record["IdList"])
IDs = search_record["IdList"]  #NCBI IDs (nucleotide) to fetch later
print(search_record.keys())
print(IDs)
out_handle = open(
    "EF-arth.txt", "w"
)  # open file where you save the result to                                 !<<----Check!
for seq_id in IDs:
    fetch_handle = Entrez.efetch(
        db="nucleotide",
Example #44
                      usecols=[0, 1],
                      header=False)
snp_tab.columns = colnames
snplist = snp_tab.VAR.values.tolist()

start_time = datetime.datetime.now()
snp_flank_left = []
snp_flank_right = []
for i, snp in enumerate(snplist):
    snp_split = snp.split(':')
    if len(snp_split[2]) == 3:
        chrom = snp_split[0]
        pos = snp_split[1]
        handle = Entrez.efetch(db="nucleotide",
                               id=hg19_chrom[chrom],
                               rettype="fasta",
                               strand=1,
                               seq_start=int(pos) - 1,
                               seq_stop=int(pos) + 1)
        record = SeqIO.read(handle, "fasta")
        handle.close()
        if record.seq[1] == snp_split[2].split('>')[0]:  #compare to ref base
            print("record:", i, "chrom:", chrom, "pos:", pos, "trimer:", record.seq)
            snp_flank_left.append(str(record.seq[0]))
            snp_flank_right.append(str(record.seq[2]))
        else:
            print(i, "NA")
            snp_flank_left.append("NA")
            snp_flank_right.append("NA")
    else:
        print(i, "NA")
        snp_flank_left.append("NA")
Example #45
import flask
from flask import jsonify
import Bio
from Bio import Entrez
from Bio.Align.Applications import ClustalwCommandline
from Bio import AlignIO
from Bio import Phylo

Entrez.email = "*****@*****.**"
extracao = Entrez.einfo()
extracao_lida = Entrez.read(extracao)
bancos = extracao_lida["DbList"]

# for each element in the list of databases,
# print an index next to its name.

bcs = ""

for b in range(1, len(bancos) + 1):
    print(b, bancos[b - 1])
    bcs += str(b)
    bcs += " - "
    bcs += bancos[b - 1]
    bcs += "  |  "

# start the application
app = flask.Flask("GenBridgePY")
app.config["JSON_AS_ASCII"] = False


# generate url
Example #46
######################################################################################

import time
from Bio import Entrez

Entrez.email = '*****@*****.**'  #input your email

in_file = '/Users/lindsayhopson/Documents/mouse_CensuScope_outputs/newAccList1.2.txt'  #use your appropriate file path

with open('orgNames_newAccList2.2.txt', 'w') as output_File:

    with open(in_file, 'r') as input_File:
        id_list = input_File.readlines()
        count = 0
        for i in id_list:
            i = i.strip()  # readlines() keeps the trailing newline on each accession
            handle = Entrez.esummary(db="nucleotide", id=i)
            record = Entrez.read(handle)
            a = str(record[0]['Title'])
            b = str(record[0]['TaxId'])
            a = a.replace(',', ' ')
            handle = Entrez.efetch(db="taxonomy", id=b)
            record = Entrez.read(handle)
            c = record[0]['Lineage']

            output_File.write(str('\n' + i + '\t' + a + '\t' + b + '\n'))
            count = count + 1
            print(i + '***' + a + '***' + b + '***' + c)
            time.sleep(1)
Example #47
from Bio import Entrez
from Bio import Medline
import pandas as pd

MAX_COUNT = 300 
#TERM = 'Tuberculosis'
TERM = 'precursor bioink  gelatin'  # PubMed has only 2 papers on these keywords; Google Scholar has many more

print('Getting {0} publications containing {1}...'.format(MAX_COUNT, TERM))
Entrez.email = '*****@*****.**'
h = Entrez.esearch(db='pubmed', retmax=MAX_COUNT, term=TERM)
#h = Entrez.esearch(db='pubmed', term=TERM)

#print (h)
#print (type(h))
result = Entrez.read(h)
#print (result)
#print (type(result))
#print (dir(result))
print('Total number of publications containing {0}: {1}'.format(TERM, result['Count']))
ids = result['IdList']
print (ids)
h = Entrez.efetch(db='pubmed', id=ids, rettype='medline', retmode='text')#handle
#h = Entrez.efetch('pubmed', id=ids, retmode='xml')
#records = Entrez.parse(h)
#print (type(h))
#print (dir(h))
#print (h)
records = Medline.parse(h)
#print (records)
#print (dir(records))
Example #48
for seq_record in SeqIO.parse(handle, 'fasta'):
    print(seq_record.id)
    print(len(seq_record.seq))
    print(repr(seq_record.seq))
handle.close()

print("Clustal W Files")
handle = open("clustalw.clustal_num")
for seq_record in SeqIO.parse(handle, "clustal"):
    print(seq_record.id)
handle.close()

#--------------------------------------
print("Entrez information")
handle = Entrez.einfo()
result = handle.read()
print(result)

handle = Entrez.efetch(db="nucleotide", id="57240072", rettype="gb", retmode="text")
for i in handle.readlines():
    print(i.strip())
print(handle.readline().strip())
handle.close()

#-----------

print("PDB Files")

pdb1 = PDBList()
pdbFile = pdb1.retrieve_pdb_file('1FAT')
Example #49
def retrieve_wgsmaster_contigs(uid):
    """Munges a download URL from the passed UID and downloads the
    corresponding archive from NCBI, extracting it to the output
    directory.
    """
    logger.info("Processing wgsmaster UID: %s" % uid)
    summary = Entrez.read(
        Entrez.esummary(db='nuccore', id=uid, rettype='text', validate=False))
    # Assume that the 'Extra' field is present and is well-formatted.
    # Which means that the first six characters of the last part of
    # the 'Extra' string correspond to the download archive filestem.
    dlstem = summary[0]['Extra'].split('|')[-1][:6]
    dlver = summary[0]['Extra'].split('|')[3].split('.')[-1]
    # Download archive to output directory
    # Establish download size; if version number not in sync with
    # download, try again with version number decremented by 1. This
    # may be necessary because genome sequence version and genome/
    # assembly version numbers are not synchronised.
    fsize = None
    while str(dlver) != '0' and not fsize:
        try:
            fname = "%s.%s.fsa_nt.gz" % (dlstem, dlver)
            outfname = os.path.join(args.outdirname, fname)
            url = "http://www.ncbi.nlm.nih.gov/Traces/wgs/?download=%s" % \
                fname
            logger.info("Trying URL: %s" % url)
            response = urlopen(url)
            meta = response.info()
            fsize = int(meta.getheaders("Content-length")[0])
            logger.info("Downloading: %s Bytes: %s" % (fname, fsize))
            fsize_dl = 0
            bsize = 1048576
        except:  # Download didn't work. Assuming it's because of version
            fsize = None
            logger.error("Download failed for (%s)" % url)
            if str(dlver) != '0':
                dlver = int(dlver) - 1
                logger.info("Retrying download with version = %s" % dlver)
            else:
                logger.error("No more versions to try (exiting)")
                sys.exit(1)
    # Download data
    try:
        with open(outfname, 'wb') as fh:
            while True:
                buffer = response.read(bsize)
                if not buffer:
                    break
                fsize_dl += len(buffer)
                fh.write(buffer)
                status = r"%10d  [%3.2f%%]" % (fsize_dl,
                                               fsize_dl * 100. / fsize)
                logger.info(status)
    except:
        logger.error("Download failed for %s (exiting)" % fname)
        logger.error(last_exception())
        sys.exit(1)
    # Extract archive
    asm_summary = entrez_retry(Entrez.esummary,
                               db='assembly',
                               id=asm_uid,
                               rettype='text')
    asm_record = Entrez.read(asm_summary, validate=False)
    gname = asm_record['DocumentSummarySet']['DocumentSummary']\
            [0]['AssemblyAccession']
    extractfname = os.path.join(args.outdirname, '.'.join([gname, 'fasta']))
    try:
        logger.info("Extracting archive %s to %s" % (outfname, extractfname))
        with open(extractfname, 'w') as efh:
            subprocess.call(['gunzip', '-c', outfname],
                            stdout=efh)  # can be subprocess.run in Py3.5
        logger.info("Archive extracted to %s" % extractfname)
    except:
        logger.error("Extracting archive %s failed (exiting)" % outfname)
        logger.error(last_exception())
        sys.exit(1)
    # Get contig_uids
    contig_uids = [s.description for s in SeqIO.parse(extractfname, 'fasta')]
    return contig_uids, extractfname
Example #50
#!/usr/bin/python

"""Download information on 'water pathogen' search query in NCBI

Command-line application that does a search and return (Adina Howe, 2014).
"""

from Bio import Entrez

Entrez.email = '*****@*****.**'
handle = Entrez.esearch(db="genome", term="water pathogen", retmax=200)
records = Entrez.read(handle)


for i in records['IdList']:
    fp1 = open(i + '.summary', 'w')
    fp2 = open(i + '.links', 'w')

    handle1 = Entrez.esummary(db="genome", id=i)
    records1 = Entrez.read(handle1)
    print(i)
    fp1.write('%s\t%s\t%s\n' % (i, records1[0]['Organism_Name'], records1[0]['DefLine']))
    handle2 = Entrez.elink(dbfrom="genome", db="nucleotide", id=i)
    records2 = Entrez.read(handle2)
    fp2.write('%s\t' % i)
    list_of_dict_ids = records2[0]['LinkSetDb'][0]['Link']
    for link_id in list_of_dict_ids:
        fp2.write('%s\t' % link_id['Id'])
    fp2.write('\n')
Example #51
def fetch_accession_range(acc: str, start: int, stop: int):
    with Entrez.efetch(db='nucleotide', id=acc, rettype='fasta', retmode='text', seq_start=start, seq_stop=stop) as h, \
            TemporaryFile(mode='w+') as temp:
        temp.write(h.read())
        temp.seek(0)
        return SeqIO.read(temp, format='fasta')
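Illustrative usage (NC_001604, the T7 phage record used in a later example, stands in for any nucleotide accession):

rec = fetch_accession_range("NC_001604", 1, 500)
print(rec.id, len(rec.seq))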
Example #52
def write_contigs(asm_uid, contig_uids):
    """Writes assembly contigs out to a single FASTA file in the script's
    designated output directory.

    FASTA records are returned, as GenBank and even GenBankWithParts format
    records don't reliably give correct sequence in all cases.

    The script returns two strings for each assembly, a 'class' and a 'label'
    string - this is for use with, e.g. pyani.
    """
    # Has duplicate code with get_class_label_info() - needs refactoring
    logger.info("Collecting contig data for %s" % asm_uid)
    # Assembly record - get binomial and strain names
    asm_summary = entrez_retry(Entrez.esummary,
                               db='assembly',
                               id=asm_uid,
                               rettype='text')
    asm_record = Entrez.read(asm_summary, validate=False)
    asm_organism = asm_record['DocumentSummarySet']['DocumentSummary']\
                   [0]['SpeciesName']
    try:
        asm_strain = asm_record['DocumentSummarySet']['DocumentSummary']\
                     [0]['Biosource']['InfraspeciesList'][0]['Sub_value']
    except:
        asm_strain = ""
    # Assembly UID (long form) for the output filename
    gname = asm_record['DocumentSummarySet']['DocumentSummary']\
            [0]['AssemblyAccession']
    outfilename = "%s.fasta" % os.path.join(args.outdirname, gname)

    # Create label and class strings
    genus, species = asm_organism.split(' ', 1)
    ginit = genus[0] + '.'
    labeltxt = "%s\t%s %s %s" % (gname, ginit, species, asm_strain)
    classtxt = "%s\t%s" % (gname, asm_organism)

    # Get FASTA records for contigs
    logger.info("Downloading FASTA records for assembly %s (%s)" %
                (asm_uid, ' '.join([ginit, species, asm_strain])))
    # We're doing an explicit retry loop here because we want to confirm we
    # have the correct data, as well as test for Entrez connection errors,
    # which is all the entrez_retry function does.
    tries, success = 0, False
    while not success and tries < args.retries:
        try:
            records = []  # Holds all return records
            # We may need to batch contigs
            query_uids = ','.join(contig_uids)
            batch_size = 10000
            for start in range(0, len(contig_uids), batch_size):
                logger.info("Batch: %d-%d" % (start, start + batch_size))
                seqdata = entrez_retry(Entrez.efetch,
                                       db='nucleotide',
                                       id=query_uids,
                                       rettype='fasta',
                                       retmode='text',
                                       retstart=start,
                                       retmax=batch_size)
                records.extend(list(SeqIO.parse(seqdata, 'fasta')))
            tries += 1
            # Check only that correct number of records returned.
            if len(records) == len(contig_uids):
                success = True
            else:
                logger.warning("%d contigs expected, %d contigs returned" %
                               (len(contig_uids), len(records)))
                logger.warning("FASTA download for assembly %s failed" %
                               asm_uid)
                logger.warning("try %d/20" % tries)
            # Could also check expected assembly sequence length?
            totlen = sum([len(r) for r in records])
            logger.info("Downloaded genome size: %d" % totlen)
        except:
            logger.warning("FASTA download for assembly %s failed" % asm_uid)
            logger.warning(last_exception())
            logger.warning("try %d/20" % tries)
    if not success:
        # Could place option on command-line to stop or continue here.
        #logger.error("Failed to download records for %s (exiting)" % asm_uid)
        #sys.exit(1)
        logger.error("Failed to download records for %s (continuing)" %
                     asm_uid)

    # Write contigs to file
    retval = SeqIO.write(records, outfilename, 'fasta')
    logger.info("Wrote %d contigs to %s" % (retval, outfilename))
Example #53
# <codecell>

import os

from Bio import Entrez, SeqIO
Entrez.email = ""  # Always tell NCBI who you are

# <codecell>

# 'data' is assumed to be a pandas DataFrame loaded in an earlier cell,
# with 'Chromosomes/RefSeq' and '#Organism/Name' columns.
for index, row in data.iterrows():
    for refseq in row['Chromosomes/RefSeq'].split(','):

        filename = "fasta/%s.fasta" % (refseq, )
        if os.path.exists(filename):
            continue
        print("%i/%i" % (index, len(data)), refseq, row['#Organism/Name'])
        handle = Entrez.efetch(db="nucleotide",
                               id=refseq,
                               rettype="fasta",
                               retmode="text")
        seq = SeqIO.read(handle, 'fasta')
        with open(filename, "w") as output_handle:
            SeqIO.write(seq, output_handle, "fasta")

# <codecell>


def skew_increments(s):
    # Per-base GC-skew increments; the standard convention is assumed:
    # C decrements, G increments, A and T contribute 0.
    return [{
        'C': -1,
        'A': 0,
        'G': 1,
        'T': 0,
    }[base] for base in s]
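A quick usage sketch for the function above (the sequence is an arbitrary example): the running sum of the increments is the skew curve, and the index of its minimum is the classic estimate of a bacterial replication origin.

# <codecell>

from itertools import accumulate

seq = "CATGGGCATCGGCCATACGCC"  # arbitrary example sequence
skew = list(accumulate(skew_increments(seq), initial=0))
print(min(range(len(skew)), key=skew.__getitem__))  # index of the skew minimum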
Example #54
0
# Assumes pinetree is installed, and that the constants (CELL_VOLUME,
# IGNORE_REGULATORY, IGNORE_GENES, RELABEL_GENES) and helper functions
# (get_promoter_interactions, get_terminator_interactions,
# compute_cds_weights, normalize_weights) used below are defined
# elsewhere in the original script.
import pinetree as pt
from Bio import Entrez, SeqIO


def main():
    sim = pt.Model(cell_volume=CELL_VOLUME)

    # Download T7 wild-type genbank records
    Entrez.email = "*****@*****.**"
    handle = Entrez.efetch(db="nuccore",
                           id=["NC_001604"],
                           rettype="gb",
                           retmode="text")

    record = SeqIO.read(handle, "genbank")
    genome_length = len(record.seq)
    phage = pt.Genome(name="phage",
                      length=genome_length,
                      transcript_degradation_rate=1e-2,
                      transcript_degradation_rate_ext=1e-5,
                      rnase_speed=20,
                      rnase_footprint=10)

    # phage = pt.Genome(name="phage", length=genome_length)

    # Per-base translation weights, accumulated from CDS features below
    weights = [0.0] * len(record.seq)
    for feature in record.features:
        # Convert to inclusive genomic coordinates
        start = int(feature.location.start) + 1
        stop = int(feature.location.end)
        name = ''
        if "note" in feature.qualifiers:
            name = feature.qualifiers["note"][0]
        # Grab promoters and terminators
        if feature.type == "regulatory":
            if name in IGNORE_REGULATORY:
                continue
            # Construct promoter
            if "promoter" in feature.qualifiers["regulatory_class"]:
                length = stop - start
                if length < 35:
                    start = start - 35
                interactions = get_promoter_interactions(name)
                phage.add_promoter(name, start, stop, interactions)
            # Construct terminator params
            if "terminator" in feature.qualifiers["regulatory_class"]:
                interactions = get_terminator_interactions(name)
                phage.add_terminator(name, start, stop, interactions)
        # Grab genes/CDSes
        if feature.type == "gene":
            if name in IGNORE_GENES:
                continue
            if name in RELABEL_GENES:
                name = RELABEL_GENES[name]
            # Construct CDS parameters for this gene
            phage.add_gene(name=name,
                           start=start,
                           stop=stop,
                           rbs_start=start - 30,
                           rbs_stop=start,
                           rbs_strength=1e7)
            # Recode gene 10A
            if name == "gene 10A":
                gene10_start = start
                gene10_stop = stop
        if feature.type == "CDS":
            weights = compute_cds_weights(record, feature, 1.0, weights)
        if feature.type == "misc_structure":
            print(feature.qualifiers)
            phage.add_rnase_site(start=start, stop=start + 10)
        print(start, stop, name)

    weights[gene10_start:gene10_stop] = [0.1] * (gene10_stop - gene10_start)

    mask_interactions = [
        "rnapol-1", "rnapol-3.5", "ecolipol", "ecolipol-p", "ecolipol-2",
        "ecolipol-2-p"
    ]
    phage.add_mask(500, mask_interactions)

    norm_weights = normalize_weights(weights)
    phage.add_weights(norm_weights)

    sim.register_genome(phage)

    sim.add_polymerase("rnapol-1", 35, 230, 0)
    sim.add_polymerase("rnapol-3.5", 35, 230, 0)
    sim.add_polymerase("ecolipol", 35, 45, 0)
    sim.add_polymerase("ecolipol-p", 35, 45, 0)
    sim.add_polymerase("ecolipol-2", 35, 45, 0)
    sim.add_polymerase("ecolipol-2-p", 35, 45, 0)

    sim.add_ribosome(30, 30, 0)

    sim.add_species("bound_ribosome", 10000)

    sim.add_species("bound_ecolipol", 1800)
    sim.add_species("bound_ecolipol_p", 0)
    sim.add_species("ecoli_genome", 0)
    sim.add_species("ecoli_transcript", 0)

    sim.add_reaction(1e6, ["ecoli_transcript", "__ribosome"],
                     ["bound_ribosome"])

    sim.add_reaction(0.04, ["bound_ribosome"],
                     ["__ribosome", "ecoli_transcript"])

    sim.add_reaction(0.001925, ["ecoli_transcript"], ["degraded_transcript"])

    sim.add_reaction(1e7, ["ecolipol", "ecoli_genome"], ["bound_ecolipol"])

    sim.add_reaction(0.3e7, ["ecolipol-p", "ecoli_genome"],
                     ["bound_ecolipol_p"])

    sim.add_reaction(0.04, ["bound_ecolipol"],
                     ["ecolipol", "ecoli_genome", "ecoli_transcript"])

    sim.add_reaction(0.04, ["bound_ecolipol_p"],
                     ["ecolipol-p", "ecoli_genome", "ecoli_transcript"])

    sim.add_reaction(3.8e7, ["protein_kinase-0.7", "ecolipol"],
                     ["ecolipol-p", "protein_kinase-0.7"])

    sim.add_reaction(3.8e7, ["protein_kinase-0.7", "ecolipol-2"],
                     ["ecolipol-2-p", "protein_kinase-0.7"])

    sim.add_reaction(3.8e7, ["gp-2", "ecolipol"], ["ecolipol-2"])

    sim.add_reaction(3.8e7, ["gp-2", "ecolipol-p"], ["ecolipol-2-p"])

    sim.add_reaction(1.1, ["ecolipol-2-p"], ["gp-2", "ecolipol-p"])

    sim.add_reaction(1.1, ["ecolipol-2"], ["gp-2", "ecolipol"])

    sim.add_reaction(3.8e9, ["lysozyme-3.5", "rnapol-1"], ["rnapol-3.5"])

    sim.add_reaction(3.5, ["rnapol-3.5"], ["lysozyme-3.5", "rnapol-1"])

    sim.seed(32)

    sim.simulate(time_limit=1200,
                 time_step=5,
                 output="phage_degrade_recoded_01_counts.tsv")
Example #55
0
def get_pubmed_esummary(pmid_list):
    """Return Entrez esummary records for a list of PubMed IDs."""
    handle = Entrez.esummary(db="pubmed", id=pmid_list)
    records = Entrez.read(handle)
    return records
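A minimal usage sketch, assuming Entrez has been imported and Entrez.email set (the address below is a placeholder; the PMID is borrowed from Example #58 below):

from Bio import Entrez

Entrez.email = "your.name@example.org"  # placeholder; use your own address
records = get_pubmed_esummary(["31651376"])
print(records[0]["Title"])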
Example #56
0
    def save(self, update=False, *args, **kwargs):
        if not self.pk or update:
            # This code only runs if the object is not yet in the database.
            # Otherwise it would already have a pk.
            try:
                #Reference._for_write = True
                if self.pmid or 'pmid' in kwargs:
                    print("Did not fail")
                    return Reference.objects.get(pmid=self.pmid)  #, False
                elif self.title or 'title' in kwargs:
                    #print(self.title)
                    handle = Entrez.esearch(db='pubmed', term=self.title)
                    print("Got handle")
                    record = Entrez.read(handle)
                    print("Got record", record)
                    print(record['Count'], type(record['Count']))
                    if record['Count'] == "1":
                        print("Record count is 1")
                        self.pmid = record['IdList'][0]
                        #print(self.title, self.pmid)
                        Reference.fetch_data(self)
                        print("Saving")
                        super(Reference, self).save(*args, **kwargs)
                        print("Saved")
                    else:
                        from denigma.library import Bibliography  # Importing this at the top breaks Denigma for an unknown reason.
                        #print("Trying it differently. %s" % type(self.title))
                        # Google:
                        bib = Bibliography()
                        #print("googling")
                        r = bib.google(self.title)
                        if r:
                            r = r[0]
                            self.pmid = r.pmid
                            #print("Google successful: %s" % self.pmid)
                        else:
                            #print("Google failed.")
                            #r = bib.find(self.title)[0]
                            #self.pmid = r.pmid
                            #print(self.pmid)
                            #print("Trying it differently.")
                            r = bib.find(str(self.title))
                            if len(r) == 1:
                                r = r[0]
                                self.pmid = r.pmid
                                print(self.pmid)
                            elif len(r) > 1:
                                title = normalize_title(self.title)
                                for areference in r:
                                    if normalize_title(
                                            areference.title) == title:
                                        r = areference
                        print("datasets.Reference.save()")
                        self.__dict__.update(r.__dict__)
                        print(r)
                        print(vars(r))
                        self.date = normalize_time(r.date)

                        # Transform lists into strings:
                        self.keywords = "; ".join(self.keywords)
                        self.authors = "; ".join(self.authors)
                        print("calling super")
                        print(self.pmid)
                        try:
                            super(Reference, self).save(
                                *args,
                                **kwargs)  # Just save the given information.
                        except Exception as e:
                            print(e)
                        print("called super")
                        # TODO: raise an exception stating that the given
                        # information yielded more than one reference.
                else:
                    super(Reference, self).save(*args, **kwargs)
            except Reference.DoesNotExist as e:
                print("Error", e)
                Reference.fetch_data(self)
                super(Reference, self).save(*args, **kwargs)
        else:
            super(Reference, self).save(*args, **kwargs)
Example #57
0
print("Baixar sequencias do Genbank (ate 200 acessos)")
print('Adaptado por Tiago Andrade Borges Santos')

from Bio import Entrez
f = open('sequence.gb', 'w')
Entrez.email = "*****@*****.**"  # Always tell NCBI who you are
print(
    "Insira sua lista de acessos do Genbank entre aspas, separados por virgula."
)
print("Exemplo: 'GU479772', 'GU479773'")
record = input("Seqs: ")
gb_list = (record)
gb_str = ",".join(gb_list)
handle = Entrez.efetch(db="nuccore", id=gb_str, rettype="gb", retmode="txt")
text = handle.read()
f.write(str(text + '\n'))
f.close()

print("Sequencias com Genbank Number (1) ou sem Genbank Number (2)?")
resposta = int(input("resposta: "))

from Bio import SeqIO  #importar SeqIO a partir do biopython:
sequencias = SeqIO.parse('sequence.gb', 'genbank')
nseq = 0  #variavel para contar o numero de sequencias processadas
f = open('sequence.fas', 'w')  #abre o arquivo onde vai salvar os resultados

if resposta == 1:
    for seq in sequencias:  #loop que itera cada uma das sequencias
        generoespecie = seq.annotations['organism'].split(' ')
        f.write('>' + generoespecie[0] + '_' + generoespecie[1] + '_' +
                seq.name + '\n')
Example #58
0
from Bio import Entrez
Entrez.email = "*****@*****.**"
# Ah, right - the email
handle = Entrez.esummary(db="pubmed", id="31651376")
record = Entrez.read(handle)
info = record[0]
print("Journal info\nid: {}\nTitle: {}".format(record[0]["Id"], info["Title"]))
Example #59
0
#!/usr/bin/env python

# Given: A genus name, followed by two dates in YYYY/M/D format.
# Return: The number of Nucleotide GenBank entries for the given genus that were published between the dates specified.

from Bio import Entrez

term = "Nesterenkonia"
start = "2001/03/24"
end = "2011/09/19"

Entrez.email = '*****@*****.**'
handle = Entrez.esearch(db="nucleotide",
                        term='"' + term + '"[Organism] AND ("' + start +
                        '"[PDAT] : "' + end + '"[PDAT])"')
record = Entrez.read(handle)
print record["Count"]
Example #60
0
def get_pubmed_record_from_xml(pmid_list):
    """Fetch PubMed XML records and return a list of structured entries."""
    [pmid2title, pmid2abstract] = get_titles_abstracts(pmid_list)

    handle = Entrez.efetch(db="pubmed", id=pmid_list, rettype='xml')
    record = Entrez.read(handle)

    data = []
    for paper in record['PubmedArticle']:
        entry = {}
        entry['pmid'] = int(paper['MedlineCitation']['PMID'])
        article = paper['MedlineCitation']['Article']
        journal = article['Journal']
        # entry['issn'] = journal.get('ISSN')
        entry['journal_abbrev'] = journal.get('ISOAbbreviation')
        entry['journal_title'] = journal.get('Title')
        if journal.get('JournalIssue'):
            entry['issue'] = journal['JournalIssue'].get('Issue')
            entry['volume'] = journal['JournalIssue'].get('Volume')
            if journal['JournalIssue'].get('PubDate'):
                entry['year'] = journal['JournalIssue']['PubDate'].get('Year')

        entry['title'] = article.get('ArticleTitle')

        ## titles from the XML format preserve all special characters,
        ## while titles from the medline format do not, so prefer the
        ## XML titles unless they contain HTML tags - in those cases,
        ## the XML format will mess up the title parsing
        titleFromXML = entry['title']
        titleFromTXT = pmid2title.get(entry['pmid'])
        if titleFromTXT is not None:
            wordsFromXML = titleFromXML.split(" ")
            wordsFromTXT = titleFromTXT.split(" ")
            if len(wordsFromTXT) > len(wordsFromXML) + 2:
                entry['title'] = titleFromTXT

        abstract = pmid2abstract.get(entry['pmid'])
        if abstract is not None:
            entry['abstract'] = abstract

        if paper['MedlineCitation'].get('DateRevised'):
            dateRevised = paper['MedlineCitation']['DateRevised']
            entry['date_revised'] = dateRevised['Year'] + "-" + dateRevised[
                'Month'] + "-" + dateRevised['Day']

        if article.get('Pagination'):
            entry['page'] = article['Pagination'].get('MedlinePgn')

        if article.get('PublicationTypeList'):
            types = []
            for pubtype in article['PublicationTypeList']:
                types.append(str(pubtype))
            entry['pubtypes'] = types

        if article.get('AuthorList'):
            authors = []
            orcid4author = {}
            for author in article['AuthorList']:
                if author.get('LastName') is None or author.get(
                        'Initials') is None:
                    continue
                authorName = author['LastName'] + " " + author['Initials']
                authors.append(authorName)
                ident = author.get('Identifier')
                if not ident:  # may be None or an empty list
                    continue
                if ident[0].attributes.get('Source') is None:
                    continue
                if ident[0].attributes.get('Source') == 'ORCID':
                    orcid = str(ident[0]).replace("http://orcid.org/",
                                                  "").replace(
                                                      "https://orcid.org/", "")
                    orcid4author[authorName] = orcid
            entry['authors'] = authors
            entry['orcid'] = orcid4author

        if paper['PubmedData'].get('PublicationStatus'):
            entry['publication_status'] = paper['PubmedData'].get(
                'PublicationStatus')

        if paper['PubmedData'].get('ArticleIdList'):
            for item in paper['PubmedData'].get('ArticleIdList'):
                id_type = item.attributes.get('IdType')
                if id_type == 'pmc':
                    entry['pmc'] = str(item)
                elif id_type == 'doi':
                    entry['doi'] = str(item)

        # print(entry, "\n")

        data.append(entry)

    return data
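A usage sketch for the function above, assuming Entrez.email is set and that the get_titles_abstracts helper it depends on (not shown here) is importable from the same module:

from Bio import Entrez

Entrez.email = "your.name@example.org"  # placeholder; use your own address
for paper in get_pubmed_record_from_xml(["31651376"]):
    print(paper["pmid"], paper.get("title"))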