def id_lookup(paper_id, idtype):
    """Take an ID of type PMID, PMCID, or DOI and look up the other IDs.

    If the DOI is not found in Pubmed, try to obtain the DOI by doing a
    reverse-lookup of the DOI in CrossRef using article metadata.

    Parameters
    ----------
    paper_id : str
        ID of the article.
    idtype : str
        Type of the ID: 'pmid', 'pmcid', or 'doi'.

    Returns
    -------
    ids : dict
        A dictionary with the following keys: pmid, pmcid and doi. A value
        is None when the corresponding ID was not returned by the lookups.

    Raises
    ------
    ValueError
        If `idtype` is not one of 'pmid', 'pmcid' or 'doi'.
    """
    if idtype not in ('pmid', 'pmcid', 'doi'):
        raise ValueError("Invalid idtype %s; must be 'pmid', 'pmcid', "
                         "or 'doi'." % idtype)

    # Start with the results of the PMC lookup and then override with the
    # provided ID.
    pmc_id_results = pmc_client.id_lookup(paper_id, idtype)
    ids = {key: pmc_id_results.get(key) for key in ('doi', 'pmid', 'pmcid')}
    ids[idtype] = paper_id

    # If we gave a DOI, our work is done after looking for PMID and PMCID.
    # If we gave a PMID or PMCID and got a DOI back, we are done as well.
    if idtype == 'doi' or ids['doi']:
        return ids

    # If we get here, we were given a PMID or PMCID and don't have a DOI yet.
    # Without a PMID the reverse lookup using CrossRef cannot be done, so we
    # bail here and return what we have (PMCID only) with a warning. The
    # check is a plain falsy test so that an empty value is treated the same
    # way as a missing one, rather than tripping an assertion further down.
    if not ids['pmid']:
        logger.warning('%s: PMCID without PMID or DOI', ids.get('pmcid'))
        return ids

    # As a last resort, we try to get the DOI from CrossRef (which internally
    # tries to get the DOI from Pubmed in the process of collecting the
    # necessary metadata for the lookup).
    ids['doi'] = crossref_client.doi_query(ids['pmid'])
    # It may still be None, but at this point there's nothing we can do...
    return ids
def id_lookup(paper_id, idtype):
    """Take an ID of type PMID, PMCID, or DOI and look up the other IDs.

    If the DOI is not found in Pubmed, try to obtain the DOI by doing a
    reverse-lookup of the DOI in CrossRef using article metadata.

    Parameters
    ----------
    paper_id : string
        ID of the article.
    idtype : 'pmid', 'pmcid', or 'doi'
        Type of the ID.

    Returns
    -------
    ids : dict
        A dictionary with the following keys: pmid, pmcid and doi. A value
        is None when the corresponding ID was not returned by the lookups.

    Raises
    ------
    ValueError
        If `idtype` is not one of 'pmid', 'pmcid' or 'doi'.
    """
    if idtype not in ('pmid', 'pmcid', 'doi'):
        raise ValueError("Invalid idtype %s; must be 'pmid', 'pmcid', "
                         "or 'doi'." % idtype)

    # Start with the results of the PMC lookup and then override with the
    # provided ID.
    pmc_id_results = pmc_client.id_lookup(paper_id, idtype)
    ids = {key: pmc_id_results.get(key) for key in ('doi', 'pmid', 'pmcid')}
    ids[idtype] = paper_id

    # A DOI was either given directly or came back from the PMC lookup:
    # nothing more to resolve.
    if idtype == 'doi' or ids['doi']:
        return ids

    # From here on we were given a PMID or PMCID and still lack a DOI. The
    # CrossRef reverse lookup below needs a PMID, so without one we return
    # what we have (PMCID only) with a warning. A falsy test is used so an
    # empty value is handled like a missing one instead of failing an
    # assertion (assertions are also removed when Python runs with -O).
    if not ids['pmid']:
        logger.warning('%s: PMCID without PMID or DOI', ids.get('pmcid'))
        return ids

    # As a last resort, we try to get the DOI from CrossRef (which internally
    # tries to get the DOI from Pubmed in the process of collecting the
    # necessary metadata for the lookup).
    ids['doi'] = crossref_client.doi_query(ids['pmid'])
    # It may still be None, but at this point there's nothing we can do...
    return ids
def get_missing_pmids(self, tr_data):
    """Try to get missing pmids using the pmc client.

    Every entry of `tr_data` whose 'pmid' value is None is looked up by its
    'pmcid' value; when the lookup yields a PMID, the entry is updated in
    place.

    Parameters
    ----------
    tr_data : iterable of dict
        Entries that are indexed by the 'pmid' and 'pmcid' keys.

    Returns
    -------
    None
        The entries of `tr_data` are modified in place.
    """
    num_missing = 0
    num_found = 0
    logger.debug("Getting missing pmids.")

    # TODO: This is very slow...should find a way to speed it up.
    # NOTE: a thread-pool variant was tried here previously, but the web api
    # does not support that much access, so the lookups stay sequential.
    for tr_entry in tr_data:
        if tr_entry['pmid'] is not None:
            continue
        num_missing += 1

        pmcid = tr_entry['pmcid']
        if pmcid is None:
            # Nothing to look the PMID up with.
            continue

        # State the ID type explicitly so the lookup does not have to guess
        # it from the form of the ID.
        ret = id_lookup(pmcid, idtype='pmcid')

        # Only count (and store) a PMID that was actually returned; the key
        # may be present with a None value.
        pmid = ret.get('pmid')
        if pmid is not None:
            tr_entry['pmid'] = pmid
            num_found += 1

    logger.debug("Found %d/%d new pmids.", num_found, num_missing)
    return
def get_text_content_for_pmids(pmids):
    """Get text content for articles given a list of their pmids.

    PMIDs that ``pmc_client.filter_pmids`` reports for the 'fulltext' source
    type, and for which a PMCID can be looked up, have their XML fetched
    with ``pmc_client.get_xml``. For every other PMID, and for those whose
    XML fetch returned None, the abstract is fetched with
    ``pubmed_client.get_abstract`` instead.

    Parameters
    ----------
    pmids : list of str

    Returns
    -------
    text_content : list of str
        The fetched PMC XMLs followed by the fetched abstracts, with None
        results left out.
    """
    fulltext_pmids = set(pmc_client.filter_pmids(pmids,
                                                 source_type='fulltext'))

    # Map each full-text PMID to its PMCID. A PMID without a PMCID is simply
    # not recorded (removing it from the set while iterating over that set
    # would raise RuntimeError), so it is picked up by the abstract lookup.
    pmid_to_pmcid = {}
    for pmid in fulltext_pmids:
        pmc_id = pmc_client.id_lookup(pmid, idtype='pmid')['pmcid']
        if pmc_id:
            pmid_to_pmcid[pmid] = pmc_id

    pmc_xmls = []
    failed = set()
    for pmid, pmc_id in pmid_to_pmcid.items():
        xml = pmc_client.get_xml(pmc_id)
        if xml is not None:
            pmc_xmls.append(xml)
        else:
            # No full text could be obtained: fall back to the abstract.
            failed.add(pmid)
        # Pause between requests.
        time.sleep(0.5)

    remaining_pmids = (set(pmids) - set(pmid_to_pmcid)) | failed
    abstracts = []
    for pmid in remaining_pmids:
        abstracts.append(pubmed_client.get_abstract(pmid))
        time.sleep(0.5)

    return [text_content for text_content in pmc_xmls + abstracts
            if text_content is not None]
def get_text_content_for_pmids(pmids):
    """Get text content for articles given a list of their pmids.

    PMIDs that ``pmc_client.filter_pmids`` reports for the 'fulltext' source
    type, and for which a PMCID can be looked up, have their XML fetched
    with ``pmc_client.get_xml``. For every other PMID, and for those whose
    XML fetch returned None, the abstract is fetched with
    ``pubmed_client.get_abstract`` instead.

    Parameters
    ----------
    pmids : list of str

    Returns
    -------
    text_content : list of str
        The fetched PMC XMLs followed by the fetched abstracts, with None
        results left out.
    """
    fulltext_pmids = set(pmc_client.filter_pmids(pmids,
                                                 source_type='fulltext'))

    # Pair every full-text PMID with its PMCID. PMIDs lacking a PMCID are
    # left out of the mapping instead of being discarded from the set under
    # iteration (which raises RuntimeError); they get an abstract lookup.
    pmid_to_pmcid = {}
    for pmid in fulltext_pmids:
        pmc_id = pmc_client.id_lookup(pmid, idtype='pmid')['pmcid']
        if pmc_id:
            pmid_to_pmcid[pmid] = pmc_id

    pmc_xmls = []
    # A set, so failures are recorded with add() and can be merged into the
    # remaining PMIDs with a set union below.
    failed = set()
    for pmid, pmc_id in pmid_to_pmcid.items():
        xml = pmc_client.get_xml(pmc_id)
        if xml is not None:
            pmc_xmls.append(xml)
        else:
            # No full text could be obtained: fall back to the abstract.
            failed.add(pmid)
        # Pause between requests.
        time.sleep(0.5)

    remaining_pmids = (set(pmids) - set(pmid_to_pmcid)) | failed
    abstracts = []
    for pmid in remaining_pmids:
        abstracts.append(pubmed_client.get_abstract(pmid))
        time.sleep(0.5)

    return [text_content for text_content in pmc_xmls + abstracts
            if text_content is not None]
def test_id_lookup_pmcid_no_prefix_idtype():
    """Look up a PMCID given without the 'PMC' prefix, with idtype='pmcid'.

    The returned doi, pmid and pmcid must equal the entries of
    `example_ids`, and `unicode_strs` must accept the result.
    """
    found = pmc_client.id_lookup('4322985', idtype='pmcid')
    for key in ('doi', 'pmid', 'pmcid'):
        assert found[key] == example_ids[key]
    assert unicode_strs(found)
def test_id_lookup_pmcid_idtype():
    """Look up a 'PMC'-prefixed PMCID with idtype='pmcid'.

    The returned doi, pmid and pmcid must equal the entries of
    `example_ids`, and `unicode_strs` must accept the result.
    """
    found = pmc_client.id_lookup('PMC4322985', idtype='pmcid')
    for key in ('doi', 'pmid', 'pmcid'):
        assert found[key] == example_ids[key]
    assert unicode_strs(found)
def test_invalid_idtype():
    """Call the PMC ID lookup with an unsupported idtype ('foo').

    NOTE(review): presumably the call is expected to raise; whatever checks
    that (e.g. a decorator) is not part of this block -- confirm.
    """
    pmc_client.id_lookup('DOI10.18632/oncotarget.2555', idtype='foo')
def test_id_lookup_doi_prefix_no_idtype():
    """Look up a 'DOI'-prefixed DOI without passing an idtype.

    The returned doi, pmid and pmcid must equal the entries of
    `example_ids`, and `unicode_strs` must accept the result.
    """
    found = pmc_client.id_lookup('DOI10.18632/oncotarget.2555')
    for key in ('doi', 'pmid', 'pmcid'):
        assert found[key] == example_ids[key]
    assert unicode_strs(found)
def test_id_lookup_pmid_with_prefix_no_idtype():
    """Look up a 'PMID'-prefixed PMID without passing an idtype.

    The returned doi, pmid and pmcid must equal the entries of
    `example_ids`, and `unicode_strs` must accept the result.
    """
    found = pmc_client.id_lookup('PMID25361007')
    for key in ('doi', 'pmid', 'pmcid'):
        assert found[key] == example_ids[key]
    assert unicode_strs(found)
def test_id_lookup_pmid_no_prefix_no_idtype():
    """Look up a bare PMID (no prefix) without passing an idtype.

    The returned doi, pmid and pmcid must equal the entries of
    `example_ids`, and `unicode_strs` must accept the result.
    """
    result = pmc_client.id_lookup("25361007")
    for field in ("doi", "pmid", "pmcid"):
        assert result[field] == example_ids[field]
    assert unicode_strs(result)
def test_invalid_idtype():
    """Invoke the PMC ID lookup with idtype="foo", which is not a known type.

    NOTE(review): presumably this should raise; the expectation itself is
    not expressed inside this block (possibly a decorator) -- confirm.
    """
    bad_idtype = "foo"
    pmc_client.id_lookup("DOI10.18632/oncotarget.2555", idtype=bad_idtype)
def test_id_lookup_doi_prefix_no_idtype():
    """Look up a "DOI"-prefixed DOI without passing an idtype.

    The returned doi, pmid and pmcid must equal the entries of
    `example_ids`, and `unicode_strs` must accept the result.
    """
    result = pmc_client.id_lookup("DOI10.18632/oncotarget.2555")
    for field in ("doi", "pmid", "pmcid"):
        assert result[field] == example_ids[field]
    assert unicode_strs(result)
def test_id_lookup_pmcid_no_prefix_idtype():
    """Look up a PMCID given without the "PMC" prefix, with idtype="pmcid".

    The returned doi, pmid and pmcid must equal the entries of
    `example_ids`, and `unicode_strs` must accept the result.
    """
    result = pmc_client.id_lookup("4322985", idtype="pmcid")
    for field in ("doi", "pmid", "pmcid"):
        assert result[field] == example_ids[field]
    assert unicode_strs(result)