def add_paper(pmid, nex_session=None):
    """Fetch the PubMed record for *pmid*, load it into the database, and
    return the new reference_id.

    Creates the reference entity plus its authors, publication types, URLs
    and relations via the project's insert_* helpers.
    """
    if nex_session is None:
        nex_session = get_session()
    records = get_pubmed_record(str(pmid))
    medline_record = Medline.read(StringIO(records[0]))
    source_id = get_source_id(nex_session, 'NCBI')
    # insert into DBENTITY/REFERENCEDBENTITY/REFERENCEDOCUMENT
    reference_id, authors, doi_url, pmc_url = insert_referencedbentity(
        nex_session, pmid, source_id, medline_record)
    insert_authors(nex_session, reference_id, authors, source_id)
    insert_pubtypes(nex_session, pmid, reference_id,
                    medline_record.get('PT', []), source_id)
    insert_urls(nex_session, pmid, reference_id, doi_url, pmc_url, source_id)
    insert_relations(nex_session, pmid, reference_id, medline_record)
    return reference_id
def test_medline_from_url(self):
    """Test Entrez into Medline.read from URL"""
    handle = Entrez.efetch(db="pubmed", id="19304878",
                           rettype="medline", retmode="text")
    record = Medline.read(handle)
    # Medline.read yields a dict-like Record keyed by Medline field tags.
    self.assertTrue(isinstance(record, dict))
    self.assertEqual("19304878", record["PMID"])
    self.assertEqual("10.1093/bioinformatics/btp163 [doi]", record["LID"])
def processInput(k):
    """Fetch the Medline record for PMID *k* and store its MeSH headings.

    Inserts one row per MeSH term into MeSH002 through the module-level
    ``cur`` database cursor.
    """
    print("Querying PMID: " + str(k) + ".")
    record = Medline.read(
        Entrez.efetch(db="pubmed", id=k, rettype="medline", retmode="text"))
    # records with no MeSH headings would make len() fail on None
    mesh_terms = record.get("MH") or []
    entrez_date = record.get("EDAT")
    # strip characters the original sanitized away; py3-compatible form of
    # the py2-only str.translate(None, "'*&")
    strip_table = str.maketrans("", "", "'*&")
    for term in mesh_terms:
        # BUG FIX: the original built the INSERT by string concatenation,
        # which is SQL-injectable and breaks on quoted terms; use a
        # parameterized query instead.
        # NOTE(review): '?' placeholders assume a sqlite3-style cursor —
        # confirm against the driver behind `cur`.
        cur.execute(
            "INSERT INTO MeSH002(PMID, MeSH, Dates) VALUES(?, ?, ?)",
            (str(k), term[0:24].translate(strip_table), str(entrez_date[0:10])),
        )
def test_read(self):
    """Parse a single saved Medline record and check every field."""
    expected = {
        "PMID": "12230038",
        "OWN": "NLM",
        "STAT": "MEDLINE",
        "DA": "20020916",
        "DCOM": "20030606",
        "LR": "20041117",
        "PUBM": "Print",
        "IS": "1467-5463 (Print)",
        "VI": "3",
        "IP": "3",
        "DP": "2002 Sep",
        "TI": "The Bio* toolkits--a brief overview.",
        "PG": "296-302",
        "AB": "Bioinformatics research is often difficult to do with commercial software. The Open Source BioPerl, BioPython and Biojava projects provide toolkits with multiple functionality that make it easier to create customised pipelines or analysis. This review briefly compares the quirks of the underlying languages and the functionality, documentation, utility and relative advantages of the Bio counterparts, particularly from the point of view of the beginning biologist programmer.",
        "AD": "tacg Informatics, Irvine, CA 92612, USA. \[email protected]",
        "FAU": ["Mangalam, Harry"],
        "AU": ["Mangalam H"],
        "LA": ["eng"],
        "PT": ["Journal Article"],
        "PL": "England",
        "TA": "Brief Bioinform",
        "JT": "Briefings in bioinformatics",
        "JID": "100912837",
        "SB": "IM",
        "MH": ["*Computational Biology", "Computer Systems", "Humans",
               "Internet", "*Programming Languages", "*Software",
               "User-Computer Interface"],
        "EDAT": "2002/09/17 10:00",
        "MHDA": "2003/06/07 05:00",
        "PST": "ppublish",
        "SO": "Brief Bioinform. 2002 Sep;3(3):296-302.",
    }
    with open("Medline/pubmed_result1.txt") as handle:
        record = Medline.read(handle)
    for key, value in expected.items():
        self.assertEqual(record[key], value)
def medline_to_csv():
    """ Create one record per text using XML abstracts scraped from PubMed """
    pubmed_ids = pickle.load(open('pickles/pubmed_records.p', 'rb'))
    paths = ['pubmed/' + name + '.txt' for name in pubmed_ids]
    splitter = set_up_tokenizer()
    with open('csv/sentences_pubmed.csv', 'wb') as csv_out:
        writer = csv.DictWriter(csv_out, ['id', 'sent_num', 'text'], delimiter=',')
        writer.writeheader()
        for path in paths:
            # use medline parser to extract relevant data from the file
            with open(path, 'rb') as f_in:
                record = Medline.read(f_in)
            pid = record['PMID']
            full_text = record['TI'] + ' ' + record['AB']
            for i, sentence in enumerate(splitter.tokenize(full_text)):
                # encode here to hack the unicode into the csv writer
                writer.writerow({'id': pid,
                                 'sent_num': i,
                                 'text': sentence.encode('utf-8')})
def update_database_batch(nex_session, fw, records, pmid_to_reference, reference_id_to_urls, source_id):
    """Refresh the PMC and DOI URLs for every Medline record in *records*.

    Records whose PMID is missing or not present in *pmid_to_reference*
    are skipped.
    """
    for rec in records:
        rec_file = StringIO(rec)
        record = Medline.read(rec_file)
        pmid = record.get('PMID')
        if pmid is None:
            continue
        x = pmid_to_reference.get(int(pmid))
        if x is None:
            continue
        pmc_url = None
        if record.get('PMC'):
            pmc_url = PMC_ROOT + record['PMC'] + '/'
        doi, doi_url = get_doi(record)
        # BUG FIX: the original chained .replace("<", "<").replace(">", ">"),
        # which is a no-op; the intent was to decode HTML-escaped angle
        # brackets in the DOI URL.
        doi_url = doi_url.replace("&lt;", "<").replace("&gt;", ">")
        update_urls(nex_session, fw, pmid, x.dbentity_id, pmc_url, doi_url,
                    reference_id_to_urls[x.dbentity_id], source_id)
def medline_to_db():
    """ Create one record per text using medline abstracts scraped from PubMed """
    sentence_splitter = set_up_tokenizer()
    files = set(pickle.load(open('pickles/pubmed_records_new.p', 'rb')))
    with sqlite3.connect(db_path) as db:
        cursor = db.cursor()
        # don't want to add the same abstract multiple times, so get existing ones first
        cursor.execute('SELECT DISTINCT pubmed_id FROM sentences')
        # using sets to hopefully speed things up
        existing = {p[0] for p in cursor}
        files = {f for f in files if f not in existing}
        files = ['pubmed/' + str(f) + '.txt' for f in files]
        for f in files:
            with open(f, 'rb') as f_in:
                # use medline parser to extract relevant data from the file
                record = Medline.read(f_in)
                pid = record['PMID']
                try:
                    text = record['TI'] + ' ' + record['AB']
                # BUG FIX: narrowed the bare `except:` — only a missing 'TI'
                # key should trigger the fallback; any other error (e.g.
                # KeyboardInterrupt) must propagate.
                except KeyError:
                    # BTI is for books; the value is a list for some reason
                    # so just take the first element
                    text = record['BTI'][0] + ' ' + record['AB']
                sentences = sentence_splitter.tokenize(text)
                for i, s in enumerate(sentences):
                    cursor.execute(
                        '''INSERT INTO sentences VALUES (NULL, ?, ?, ?, ?);''',
                        (pid, i, s, 'pubmed'))
def handle_one_record(db_session, records, gene_list, alias_to_name):
    """Parse each raw Medline record and insert a formatted reference row."""
    for rec in records:
        record = Medline.read(StringIO(rec))
        pmid = record.get('PMID')
        pubmed_url = 'http://www.ncbi.nlm.nih.gov/pubmed/' + str(pmid)
        # AID example:
        # ['S0167-7012(17)30042-8 [pii]', '10.1016/j.mimet.2017.02.002 [doi]']
        doi = None
        for article_id in record.get('AID') or []:
            if article_id.endswith('[doi]'):
                doi = article_id.replace(' [doi]', '')
                break
        doi_url = "/".join(['http://dx.doi.org', doi]) if doi else ""
        title = record.get('TI', '')
        authors = record.get('AU', [])
        pubdate = record.get('DP', '')  # 'PubDate': '2012 Mar 20'
        year = pubdate.split(' ')[0]
        journal = record.get('TA', '')
        volume = record.get('VI', '')
        issue = record.get('IP', '')
        pages = record.get('PG', '')
        citation = set_cite(title, authors, year, journal, volume, issue, pages)
        abstract = record.get('AB', '')
        gene_names = extract_gene_names(abstract, gene_list, alias_to_name)
        # insert formatted data to DB
        insert_reference(db_session, pmid, citation, doi_url, abstract,
                         " ".join(gene_names))
def test_read(self):
    """Parse one saved Medline record and verify each expected field."""
    expected = {
        "PMID": "12230038",
        "OWN": "NLM",
        "STAT": "MEDLINE",
        "DA": "20020916",
        "DCOM": "20030606",
        "LR": "20041117",
        "PUBM": "Print",
        "IS": "1467-5463 (Print)",
        "VI": "3",
        "IP": "3",
        "DP": "2002 Sep",
        "TI": "The Bio* toolkits--a brief overview.",
        "PG": "296-302",
        "AB": "Bioinformatics research is often difficult to do with commercial software. The Open Source BioPerl, BioPython and Biojava projects provide toolkits with multiple functionality that make it easier to create customised pipelines or analysis. This review briefly compares the quirks of the underlying languages and the functionality, documentation, utility and relative advantages of the Bio counterparts, particularly from the point of view of the beginning biologist programmer.",
        "AD": "tacg Informatics, Irvine, CA 92612, USA. \[email protected]",
        "FAU": ["Mangalam, Harry"],
        "AU": ["Mangalam H"],
        "LA": ["eng"],
        "PT": ["Journal Article"],
        "PL": "England",
        "TA": "Brief Bioinform",
        "JT": "Briefings in bioinformatics",
        "JID": "100912837",
        "SB": "IM",
        "MH": ["*Computational Biology", "Computer Systems", "Humans",
               "Internet", "*Programming Languages", "*Software",
               "User-Computer Interface"],
        "EDAT": "2002/09/17 10:00",
        "MHDA": "2003/06/07 05:00",
        "PST": "ppublish",
        "SO": "Brief Bioinform. 2002 Sep;3(3):296-302.",
    }
    handle = open("Medline/pubmed_result1.txt")
    record = Medline.read(handle)
    handle.close()
    for key, value in expected.items():
        self.assertEqual(record[key], value)
def handle_query(query: str, records_to_retrieve: set, retrieve_max: int = 100) -> List[Dict]:
    '''
    Takes a query, searches for the relevant papers and returns the abstracts
    Abbreviations: AB = Abstract, AID = Article Identifier, (F)AU = (Full) Author,
    DP = Date of Publication, JT = Journal Title, OT = Other Term,
    PMID = PubMed Unique Identifier, TI = Title
    https://biopython.org/docs/1.75/api/Bio.Medline.html#Bio.Medline.Record
    '''
    rec_handler = search_medline(query=query, retmax=retrieve_max)
    records = []
    for rec_id in rec_handler['IdList']:
        raw = fetch_details(rec_id=rec_id, entrez_handle=rec_handler)
        medline_rec = Medline.read(StringIO(raw))
        # keep only the fields the caller asked for
        records.append({key: value for key, value in medline_rec.items()
                        if key in records_to_retrieve})
    return records
def main(query, email):
    """Search PubMed for *query* and print the abstract of each hit."""
    handler = search_medline(query, email)
    for rec_id in handler['IdList']:
        raw = fetch_rec(rec_id, handler)
        medline_rec = Medline.read(StringIO(raw))
        if 'AB' in medline_rec:
            print(medline_rec['AB'])
def test_medline_from_url(self):
    """Test Entrez into Medline.read from URL"""
    handle = Entrez.efetch(db="pubmed", id='19304878',
                           rettype="medline", retmode="text")
    record = Medline.read(handle)
    self.assertTrue(isinstance(record, dict))
    self.assertEqual('19304878', record['PMID'])
    self.assertEqual('10.1093/bioinformatics/btp163 [doi]', record['LID'])
def parse_medline(self, text):
    """Parse raw Medline *text* and cache the commonly used fields on self."""
    # Medline.read accepts any iterable of lines
    self.medline = Medline.read(text.split('\n'))
    self.title = self.medline['TI']
    self.journal = self.medline['JT']
    self.citation = self.medline['SO']
    self.date_pub = self.medline['DP']
    # some records list investigators (IR) instead of authors (AU)
    if 'AU' in self.medline:
        self.authors = self.medline['AU']
    else:
        self.authors = self.medline['IR']
def test_multiline_mesh(self):
    """MeSH terms wrapped over several lines must be joined correctly."""
    expected_mesh = [
        "Blood Circulation",
        "High-Intensity Focused Ultrasound Ablation/adverse effects/instrumentation/*methods",
        "Humans",
        "Models, Biological",
        "Sonication",
        "Temperature",
        "Time Factors",
        "Transducers",
    ]
    with open("Medline/pubmed_result3.txt") as handle:
        record = Medline.read(handle)
    self.assertEqual(record["PMID"], "23039619")
    self.assertEqual(record["MH"], expected_mesh)
def retrieve_abstract(PMID):
    """Return the abstract text for *PMID*, or '' if the record has none."""
    Entrez.email = app.config['EMAIL']
    handle = Entrez.efetch(db="pubmed", rettype="medline",
                           retmode="text", id=PMID)
    record = Medline.read(handle)
    handle.close()
    # BUG FIX: the original used a bare try/except that swallowed every
    # error; only a missing 'AB' field should yield an empty abstract.
    return record.get('AB', '')
def test_medline_from_url(self):
    """Test Entrez into Medline.read from URL."""
    efetch_handle = Entrez.efetch(db="pubmed", id="19304878",
                                  rettype="medline", retmode="text")
    record = Medline.read(efetch_handle)
    efetch_handle.close()
    self.assertIsInstance(record, dict)
    self.assertEqual("19304878", record["PMID"])
    self.assertEqual("10.1093/bioinformatics/btp163 [doi]", record["LID"])
def test_multiline_mesh(self):
    """Wrapped MeSH headings should come back as single list entries."""
    with open("Medline/pubmed_result3.txt") as handle:
        record = Medline.read(handle)
    self.assertEqual(record["PMID"], "23039619")
    self.assertEqual(
        record["MH"],
        ["Blood Circulation",
         "High-Intensity Focused Ultrasound Ablation/adverse effects/instrumentation/*methods",
         "Humans",
         "Models, Biological",
         "Sonication",
         "Temperature",
         "Time Factors",
         "Transducers"])
def test_medline_from_url(self):
    """Test Entrez into Medline.read from URL"""
    handle = Entrez.efetch(db="pubmed", id="19304878",
                           rettype="medline", retmode="text")
    # verify the constructed efetch URL before parsing the payload
    self.assertTrue(handle.url.startswith(URL_HEAD + "efetch.fcgi?"), handle.url)
    self.assertTrue(URL_TOOL in handle.url)
    self.assertTrue(URL_EMAIL in handle.url)
    self.assertTrue("id=19304878" in handle.url)
    record = Medline.read(handle)
    handle.close()
    self.assertTrue(isinstance(record, dict))
    self.assertEqual("19304878", record["PMID"])
    self.assertEqual("10.1093/bioinformatics/btp163 [doi]", record["LID"])
def test_pubmed_16381885(self):
    """Bio.TogoWS.entry("pubmed", "16381885")"""
    # TogoWS returns Medline plain text for pubmed entries
    handle = TogoWS.entry("pubmed", "16381885")
    data = Medline.read(handle)
    handle.close()
    self.assertEqual(data["TI"],
                     'From genomics to chemical genomics: new developments in KEGG.')
    self.assertEqual(data["AU"],
                     ['Kanehisa M', 'Goto S', 'Hattori M', 'Aoki-Kinoshita KF',
                      'Itoh M', 'Kawashima S', 'Katayama T', 'Araki M',
                      'Hirakawa M'])
def test_pubmed_16381885(self):
    """Bio.TogoWS.entry("pubmed", "16381885")"""
    # Gives Medline plain text
    handle = TogoWS.entry("pubmed", "16381885")
    data = Medline.read(handle)
    handle.close()
    expected_authors = ['Kanehisa M', 'Goto S', 'Hattori M',
                        'Aoki-Kinoshita KF', 'Itoh M', 'Kawashima S',
                        'Katayama T', 'Araki M', 'Hirakawa M']
    self.assertEqual(data["TI"],
                     'From genomics to chemical genomics: new developments in KEGG.')
    self.assertEqual(data["AU"], expected_authors)
def fetchMetadata2(lala, search):
    """Fetch Medline metadata for all PMIDs in *lala* with one efetch call.

    Returns the result of getdata1() over the parsed records.
    """
    pmid_string = ",".join(lala)
    print(pmid_string)
    efetch = Entrez.efetch(db, id=pmid_string, rettype="medline", retmode="text")
    # BUG FIX: the original called Medline.read once per PMID on the same
    # handle; Medline.read consumes the whole stream, so the loop could not
    # yield one record per ID. Medline.parse is the multi-record API.
    records = list(Medline.parse(efetch))
    print("receiving data")
    print("This is records", records)
    alldata = getdata1(records, search)
    return alldata
def processInput(k):
    """Fetch the Medline record for PMID *k* and store its MeSH headings.

    Inserts one row per MeSH term into MeSH002 through the module-level
    ``cur`` database cursor.
    """
    print("Querying PMID: " + str(k) + ".")
    record = Medline.read(
        Entrez.efetch(db="pubmed", id=k, rettype="medline", retmode="text"))
    # guard against records with no MeSH headings (MH missing -> None)
    mesh_terms = record.get("MH") or []
    entrez_date = record.get("EDAT")
    # py3-compatible replacement for the py2-only str.translate(None, "'*&")
    strip_table = str.maketrans("", "", "'*&")
    for term in mesh_terms:
        # BUG FIX: parameterized query replaces string concatenation, which
        # was SQL-injectable and broke on terms containing quotes.
        # NOTE(review): '?' placeholders assume a sqlite3-style cursor —
        # confirm against the driver behind `cur`.
        cur.execute(
            "INSERT INTO MeSH002(PMID, MeSH, Dates) VALUES(?, ?, ?)",
            (str(k), term[0:24].translate(strip_table), str(entrez_date[0:10])),
        )
def test_pubmed_16381885(self):
    """Bio.TogoWS.entry("pubmed", "16381885")."""
    # TogoWS serves pubmed entries as Medline plain text
    handle = TogoWS.entry("pubmed", "16381885")
    data = Medline.read(handle)
    handle.close()
    self.assertEqual(
        data["TI"],
        "From genomics to chemical genomics: new developments in KEGG.")
    self.assertEqual(
        data["AU"],
        ["Kanehisa M", "Goto S", "Hattori M", "Aoki-Kinoshita KF", "Itoh M",
         "Kawashima S", "Katayama T", "Araki M", "Hirakawa M"])
def test_medline_from_url(self):
    """Test Entrez into Medline.read from URL"""
    handle = Entrez.efetch(db="pubmed", id='19304878',
                           rettype="medline", retmode="text")
    # the request URL must carry the tool, email and id parameters
    self.assertTrue(handle.url.startswith(URL_HEAD + "efetch.fcgi?"), handle.url)
    self.assertIn(URL_TOOL, handle.url)
    self.assertIn(URL_EMAIL, handle.url)
    self.assertIn("id=19304878", handle.url)
    record = Medline.read(handle)
    handle.close()
    self.assertTrue(isinstance(record, dict))
    self.assertEqual('19304878', record['PMID'])
    self.assertEqual('10.1093/bioinformatics/btp163 [doi]', record['LID'])
def fetchMetadata(pmid_list, search):
    """Fetch Medline metadata for each PMID in *pmid_list*, one request at a
    time (sleeping 1s between requests to stay polite to NCBI)."""
    all_metadata = []
    for pmid in pmid_list:
        time.sleep(1)
        print("fetching")
        handle = Entrez.efetch(db, id=pmid, rettype="medline", retmode="text")
        record = Medline.read(handle)
        all_metadata.append(getdata(record, [pmid], search))
    print(all_metadata)
    return all_metadata
def get_abstracts(file_name):
    """Read PubMed IDs (one per line) from *file_name* and return a list of
    their abstracts, in the same order.

    Records without an abstract contribute an empty string.
    """
    pubmed_ids = []
    with open(file_name) as f:
        for line in f:
            pubmed_ids.append(int(line.rstrip('\n')))
    abstracts = []
    for pubmed_id in pubmed_ids:
        fetch_handler = Entrez.efetch(db='pubmed', rettype='medline',
                                      retmode='text', id=str(pubmed_id))
        record = Medline.read(fetch_handler)
        # BUG FIX: record['AB'] raised KeyError for abstract-less records
        # (not every PubMed entry has an AB field); default to ''.
        abstracts.append(record.get('AB', ''))
    return abstracts
def fetch_all_dates(doi):
    '''
    :param doi: DOI of paper to use for searching
    :return: the date of publication when applicable
    '''
    search_result = search_pubmed(doi)
    if not search_result['IdList']:
        return None
    record_id = search_result['IdList'][0]
    raw = fetch_paper_date(record_id)
    medline_rec = Medline.read(StringIO(raw))
    # PHST entries look like "YYYY/MM/DD HH:MM [stage]"; keep the date part
    # of the last (most recent) entry
    return medline_rec['PHST'][-1].partition(' ')[0]
def update_database_batch(nex_session, fw, records, pmid_to_reference, key_to_type, source_to_id):
    """Apply comment/erratum updates for each Medline record in *records*.

    Records whose PMID is missing or unknown to *pmid_to_reference* are
    skipped.
    """
    for rec in records:
        record = Medline.read(StringIO(rec))
        pmid = record.get('PMID')
        if pmid is None:
            continue
        reference = pmid_to_reference.get(int(pmid))
        if reference is None:
            continue
        update_comment_erratum(nex_session, fw, record, int(pmid),
                               pmid_to_reference, key_to_type, source_to_id)
def get_new_reference_info(request):
    """Validate a batch of PMIDs from the request body and return, for each,
    a confirmation entry (title, pmid, deletion warnings) without inserting
    anything; any validation failure rolls back and returns HTTP 400."""
    MAX_PUBS_ADDED = 10
    try:
        params = request.json_body
        if not params:
            raise ValueError('Please enter at least 1 PMID.')
        pmids = params['pmids']
        int_pmids = convert_space_separated_pmids_to_list(pmids)
        if len(int_pmids) > MAX_PUBS_ADDED:
            raise ValueError('Only ' + str(MAX_PUBS_ADDED) + ' may be added at once.')
        # avoid repeat PMIDs
        repeat_pmids = [ x for x, count in collections.Counter(int_pmids).items() if count > 1 ]
        if len(repeat_pmids):
            str_pmids = [str(x) for x in repeat_pmids]
            str_pmids = ', '.join(str_pmids)
            msg = 'A PMID was repeated: ' + str_pmids
            raise ValueError(msg)
        confirmation_list = []
        for x in int_pmids:
            # reject PMIDs already present in the database
            is_in_db = DBSession.query(Referencedbentity).filter(
                Referencedbentity.pmid == x).one_or_none()
            if is_in_db:
                raise ValueError(
                    'At least 1 PMID is already in the database: ' + str(x))
            # fetch the Medline record straight from Entrez
            record = Medline.read(
                Entrez.efetch(db='pubmed', id=str(x), rettype='medline'))
            warning = Referencedbentity.get_deletion_warnings(x)
            # a blank journal title indicates an unimportable record
            journal_title = record.get('JT', '')
            if len(journal_title) <= 1:
                raise ValueError('Cannot import PMID ' + str(x) +
                                 ' because journal title is blank.')
            confirmation_item = {
                'name': record.get('TI') + ' PMID: ' + str(x),
                'pmid': x,
                'warning': warning
            }
            confirmation_list.append(confirmation_item)
        return {'references': confirmation_list}
    except Exception as e:
        # any failure: log, roll back the session, and report the message
        traceback.print_exc()
        log.error(e)
        DBSession.rollback()
        return HTTPBadRequest(body=json.dumps({'message': str(e)}),
                              content_type='text/json')
def update_database_batch(nex_session, fw, records, pmid_to_reference, journal_id_to_abbrev, source_id):
    """Run update_database() for every parsable Medline record whose PMID is
    known to *pmid_to_reference*."""
    for rec in records:
        record = Medline.read(StringIO(rec))
        pmid = record.get('PMID')
        if pmid is None:
            continue
        reference = pmid_to_reference.get(int(pmid))
        if reference is None:
            continue
        update_database(nex_session, fw, record, int(pmid), pmid_to_reference,
                        journal_id_to_abbrev, source_id)
def pmid2abstract_info(pmid):
    """Fetch the Medline record for *pmid* and return a dict with its title,
    authors, source, abstract and pmid ('?' for missing fields)."""
    from Bio import Medline
    handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
    record = Medline.read(handle)
    # BUG FIX: `print record` is Python 2-only syntax and is a SyntaxError
    # on Python 3; the parenthesized form behaves identically on both.
    print(record)
    pmid_data = {}
    pmid_data["title"] = record.get("TI", "?")
    pmid_data["authors"] = record.get("AU", "?")
    pmid_data["source"] = record.get("SO", "?")
    pmid_data["abstract"] = record.get("AB", "?")
    pmid_data["pmid"] = pmid
    return pmid_data
def test_medline_from_url(self):
    """Test Entrez into Medline.read from URL."""
    handle = Entrez.efetch(db="pubmed", id="19304878",
                           rettype="medline", retmode="text")
    request_url = handle.url
    # the efetch URL must carry tool, email, api key and id parameters
    self.assertTrue(request_url.startswith(URL_HEAD + "efetch.fcgi?"), request_url)
    self.assertIn(URL_TOOL, request_url)
    self.assertIn(URL_EMAIL, request_url)
    self.assertIn(URL_API_KEY, request_url)
    self.assertIn("id=19304878", request_url)
    record = Medline.read(handle)
    handle.close()
    self.assertIsInstance(record, dict)
    self.assertEqual("19304878", record["PMID"])
    self.assertEqual("10.1093/bioinformatics/btp163 [doi]", record["LID"])
def update_database_batch(nex_session, fw, records, pmid_to_reference, reference_id_to_authors, source_id):
    """Refresh the author list for every Medline record in *records* whose
    PMID is known to *pmid_to_reference*."""
    for rec in records:
        rec_file = StringIO(rec)
        record = Medline.read(rec_file)
        pmid = record.get('PMID')
        if pmid is None:
            continue
        x = pmid_to_reference.get(int(pmid))
        if x is None:
            continue
        # FIX: default to an empty list, not '' — AU is a list field, and the
        # sibling batch updaters use list defaults (e.g. record.get('PT', [])).
        authors = record.get('AU', [])
        update_authors(nex_session, fw, pmid, x.dbentity_id, authors,
                       reference_id_to_authors.get(x.dbentity_id), source_id)
def get_first_last_authors(paper_id):
    """Given a paper, returns the first and last authors of the paper.

    Arguments:
        paper_id - str; paper ids

    Returns:
        authors - list of strs; list of full names of the first and last
        authors of the provided paper id
    """
    handle = Entrez.efetch(db='pubmed', id=paper_id, rettype='medline',
                           retmode="text", retmax=200)
    record = Medline.read(handle)
    # FAU holds the full author names in publication order
    authors = record.get("FAU", "?")
    return [authors[0], authors[-1]]
def update_database_batch(nex_session, fw, records, pmid_to_reference, reference_id_to_pubtypes, source_id):
    """Refresh the publication types for each known Medline record."""
    for rec in records:
        record = Medline.read(StringIO(rec))
        pmid = record.get('PMID')
        if pmid is None:
            continue
        reference = pmid_to_reference.get(int(pmid))
        if reference is None:
            continue
        # PT is a list of publication types
        pubtypes = record.get('PT', [])
        update_reftypes(nex_session, fw, pmid, reference.dbentity_id, pubtypes,
                        reference_id_to_pubtypes.get(reference.dbentity_id),
                        source_id)
def downloadBibliography():
    """For the first five titles in title.txt, search PubMed; when a title
    matches exactly one record and its PMID is not already in
    SNP_pubmed_result.txt, append its bibliography via writeBibliography()."""
    from Bio import Entrez
    import re
    from Bio import Medline
    Entrez.email = "*****@*****.**"
    fp1 = open("SNP_pubmed_result.txt", "a")  # existing results (append)
    fp2 = open("new_pubmed_result.txt", "w")  # freshly fetched results
    '''
    all_text_tatol = fp1.read()
    all_text_new = fp2.read()
    '''
    # parse the existing result file once to know which PMIDs we already have
    # NOTE(review): Medline.read merges the whole file into one record — this
    # presumably yields a combined "PMID" field; confirm it holds all IDs.
    input = open("SNP_pubmed_result.txt")
    medline_exist = Medline.read(input)
    input.close()
    for line in open("title.txt", "r").readlines()[:5]:
        title = line.replace("\n", "")
        handle = Entrez.esearch(db="pubmed", term=title)
        records = Entrez.read(handle)
        # only proceed when the title resolves to exactly one PubMed hit
        if int(records["Count"]) == 1:
            # build the human-readable medline-format URL for this title
            term = re.sub(" ", "+", title)
            url = "https://www.ncbi.nlm.nih.gov/pubmed/?term="
            url += ''.join(term)
            url += ''.join("&report=medline&format=text")
            medline_handle = Entrez.efetch(db="pubmed", id=records["IdList"],
                                           rettype="medline", retmode="text")
            medline_res = Medline.parse(medline_handle)
            medline_res_list = list(medline_res)
            PMID = -1
            # keep the PMID of the last parsed record
            for record in medline_res_list:
                PMID = record.get("PMID", "?")
            '''
            if len(all_text_tatol) == 0 :
                writeBibliography(url, fp1, fp2)
            else :
            '''
            # only download when this PMID is not already recorded
            if PMID not in medline_exist["PMID"]:
                writeBibliography(url, fp1, fp2)
    fp1.close()
    fp2.close()
def test_pubmed_16381885(self):
    """Bio.TogoWS.entry("pubmed", "16381885")"""
    # Gives Medline plain text
    handle = TogoWS.entry("pubmed", "16381885")
    data = Medline.read(handle)
    handle.close()
    expected_title = ("From genomics to chemical genomics: "
                      "new developments in KEGG.")
    expected_authors = ["Kanehisa M", "Goto S", "Hattori M",
                        "Aoki-Kinoshita KF", "Itoh M", "Kawashima S",
                        "Katayama T", "Araki M", "Hirakawa M"]
    self.assertEqual(data["TI"], expected_title)
    self.assertEqual(data["AU"], expected_authors)
def fetch_from_entrez(index, cache_dir=False):
    """Fetch the Medline record for *index* from PubMed, consulting and
    populating a filesystem cache when *cache_dir* is provided.

    Returns the parsed record dict, or None when all retries fail.
    """
    logger = logging.getLogger('build')
    # slugify the index for the cache filename (some indices have symbols
    # not allowed in file names (e.g. /))
    index_slug = slugify(index)
    # BUG FIX: the cache path was built unconditionally, so the default
    # cache_dir=False crashed on '/'.join(False) before doing any work.
    cache_file_path = None
    if cache_dir:
        cache_file_path = '{}/{}'.format('/'.join(cache_dir), index_slug)
        # try fetching from cache first
        d = fetch_from_cache(cache_dir, index_slug)
        if d:
            logger.info('Fetched {} from cache'.format(cache_file_path))
            return d
    # if nothing is found in the cache, use the web API
    logger.info('Fetching {} from Entrez'.format(index))
    tries = 0
    max_tries = 5
    while tries < max_tries:
        if tries > 0:
            # BUG FIX: the original logged the undefined name `full_url`,
            # raising NameError on the first retry; log the index instead.
            logger.warning('Failed fetching {}, retrying'.format(index))
        try:
            Entrez.email = '*****@*****.**'
            handle = Entrez.efetch(
                db="pubmed", id=str(index), rettype="medline", retmode="text"
            )
        except Exception:  # was a bare except, which also caught SystemExit
            tries += 1
            time.sleep(2)
        else:
            d = Medline.read(handle)
            # save to cache
            save_to_cache(cache_dir, index_slug, d)
            logger.info('Saved entry for {} in cache'.format(cache_file_path))
            return d
    # all retries exhausted
    return None
def fetcher(self):
    """Fetch and parse this entity's Medline record from PubMed."""
    handle = Entrez.efetch(db='pubmed', id=self.name,
                           retmode='text', rettype='medline')
    record = Medline.read(handle)
    return record