def id_lookup(paper_id, idtype):
    """Resolve a PMID, PMCID, or DOI into the full set of article IDs.

    The PMC ID lookup is consulted first. If a PMID is known afterwards
    but no DOI, the DOI is requested from CrossRef as a last resort.

    Parameters
    ----------
    paper_id : str
        ID of the article.
    idtype : str
        Type of the ID: 'pmid', 'pmcid', or 'doi'.

    Returns
    -------
    ids : dict
        A dictionary with the keys 'pmid', 'pmcid' and 'doi'. Values that
        could not be resolved are None.

    Raises
    ------
    ValueError
        If `idtype` is not one of 'pmid', 'pmcid', or 'doi'.
    """
    if idtype not in ('pmid', 'pmcid', 'doi'):
        raise ValueError("Invalid idtype %s; must be 'pmid', 'pmcid', "
                         "or 'doi'." % idtype)

    ids = dict.fromkeys(('doi', 'pmid', 'pmcid'))
    pmc_hits = pmc_client.id_lookup(paper_id, idtype)
    # Seed with whatever PMC returned, then let the caller-supplied ID win.
    for key in ('pmid', 'pmcid', 'doi'):
        ids[key] = pmc_hits.get(key)
    ids[idtype] = paper_id

    # Nothing more to do when the DOI was given by the caller or was
    # returned by the PMC lookup.
    if idtype == 'doi' or ids.get('doi'):
        return ids

    pmid_missing = ids.get('pmid') is None
    doi_missing = ids.get('doi') is None

    # A PMCID with neither PMID nor DOI cannot be used for the CrossRef
    # reverse lookup below, so return what we have with a warning.
    if ids.get('pmcid') and doi_missing and pmid_missing:
        logger.warning('%s: PMCID without PMID or DOI' % ids.get('pmcid'))
        return ids

    # Invariants at this point: a PMID is known and the DOI is not.
    assert not pmid_missing
    assert doi_missing

    # Last resort: ask CrossRef for the DOI. The result may still be None.
    ids['doi'] = crossref_client.doi_query(ids['pmid'])
    return ids
def id_lookup(paper_id, idtype):
    """Look up the companion IDs of an article given one of its IDs.

    Starting from a PMID, PMCID, or DOI, query the PMC ID lookup for the
    other two identifiers. When the DOI is still unknown but a PMID is
    available, fall back to a CrossRef query for the DOI.

    Parameters
    ----------
    paper_id : string
        ID of the article.
    idtype : 'pmid', 'pmcid', or 'doi'
        Type of the ID.

    Returns
    -------
    ids : dict
        A dictionary with the following keys: pmid, pmcid and doi.

    Raises
    ------
    ValueError
        If `idtype` is not 'pmid', 'pmcid', or 'doi'.
    """
    known_types = ('pmid', 'pmcid', 'doi')
    if idtype not in known_types:
        raise ValueError("Invalid idtype %s; must be 'pmid', 'pmcid', "
                         "or 'doi'." % idtype)

    ids = {'doi': None, 'pmid': None, 'pmcid': None}
    lookup = pmc_client.id_lookup(paper_id, idtype)

    # PMC results form the baseline; the ID we were handed overrides them.
    ids.update(
        pmid=lookup.get('pmid'),
        pmcid=lookup.get('pmcid'),
        doi=lookup.get('doi'),
    )
    ids[idtype] = paper_id

    # Either the caller supplied the DOI or PMC found one: we are done.
    doi_resolved = idtype == 'doi' or bool(ids.get('doi'))
    if doi_resolved:
        return ids

    # From here on we were given a PMID or PMCID and have no DOI yet.
    # With only a PMCID (no PMID, no DOI) the CrossRef lookup below cannot
    # be attempted, so warn and return the partial result.
    pmcid_only = (ids.get('pmcid')
                  and ids.get('doi') is None
                  and ids.get('pmid') is None)
    if pmcid_only:
        logger.warning('%s: PMCID without PMID or DOI' % ids.get('pmcid'))
        return ids

    # State check before the fallback query.
    assert ids.get('pmid') is not None
    assert ids.get('doi') is None

    # Final attempt via CrossRef; the DOI may legitimately remain None.
    crossref_doi = crossref_client.doi_query(ids['pmid'])
    ids['doi'] = crossref_doi
    return ids
def get_citation_count_for_pmid(pmid: str) -> Union[int, None]:
    """Return the citation count for a given PMID.

    The PMID is first mapped to a DOI with `doi_query` (CrossRef), and the
    citation count for that DOI is then fetched with
    `get_citation_count_for_doi` (COCI). Note that the COCI API returns a
    count of 0 for DOIs that are not indexed.

    Parameters
    ----------
    pmid :
        The PMID to get the citation count for.

    Returns
    -------
    :
        The citation count for the PMID, or None if no DOI could be
        found for it.
    """
    resolved_doi = doi_query(pmid)
    # Without a DOI there is nothing to ask the citation index about.
    return get_citation_count_for_doi(resolved_doi) if resolved_doi else None
def test_doi_query():
    """Check that the example PMID maps to the example DOI via CrossRef."""
    query_pmid = example_ids['pmid']
    mapped_doi = crossref_client.doi_query(query_pmid)
    # The DOI must match the reference value and pass the unicode check.
    assert mapped_doi == example_ids['doi']
    assert unicode_strs(mapped_doi)
# Spot-check CrossRef DOI lookups against the DOIs recorded in a local
# mapping file, on a random sample of references without a cached DOI.

# Map the first column of each row to a (second column, DOI) tuple.
# Presumably the columns are PMID, PMCID, DOI (going by the file name) --
# TODO confirm. An empty DOI column is stored as None.
pmid_map = {}
with open('pmid_pmcid_doi_map.txt') as f:
    csvreader = csv.reader(f, delimiter='\t')
    for row in csvreader:
        doi = None if row[2] == '' else row[2]
        pmid_map[row[0]] = (row[1], doi)

# One reference per line.
with open('no_cached_doi.pkl') as f:
    no_cached_doi = [line.strip('\n') for line in f.readlines()]

# Get a random sample of up to 100 non-cached DOIs. The size is capped at
# the population size because sampling without replacement raises a
# ValueError when more items are requested than are available.
sample_size = min(100, len(no_cached_doi))
sample_indices = np.random.choice(len(no_cached_doi), size=sample_size,
                                  replace=False)

total_tests = 0
num_passes = 0
for sample_ix in sample_indices:
    ref = no_cached_doi[sample_ix]
    pm_doi = pmid_map[ref][1]
    xref_doi = crossref_client.doi_query(ref)
    if not xref_doi:
        print("No DOI found, skipping")
        continue
    total_tests += 1
    # pm_doi is None when the mapping file had no DOI for this reference;
    # that counts as a mismatch rather than crashing on None.lower().
    if pm_doi is not None and xref_doi.lower() == pm_doi.lower():
        num_passes += 1
        # Single pre-formatted argument: prints identically whether
        # print is a statement (Python 2) or a function (Python 3).
        print("Pass: %s / %s" % (num_passes, total_tests))
    else:
        print("Fail: pm: %s, xref: %s" % (pm_doi, xref_doi))
# Fill in a DOI cache for references listed in missing_dois.txt by looking
# up each reference's title and querying CrossRef with it.
# NOTE(review): `doi_cache` is not defined in this part of the script; it
# is presumably created earlier -- confirm.

# One reference per line.
with open('missing_dois.txt') as f:
    missing_dois = [line.strip('\n') for line in f.readlines()]


def save(doi_cache, counter):
    """Write the DOI cache to a tab-separated checkpoint file.

    Parameters
    ----------
    doi_cache : dict
        Mapping whose key/value pairs are written one per row.
    counter : int
        Number used, zero-padded to five digits, in the output file name
        ``doi_cache_<counter>.txt``.
    """
    with open('doi_cache_%.5d.txt' % counter, 'w') as f:
        print("Writing to doi cache")
        csvwriter = csv.writer(f, delimiter='\t')
        # items() exists on both Python 2 and 3 (iteritems() is 2-only).
        for k, v in doi_cache.items():
            csvwriter.writerow((k, v))


# Pre-set so the final save() below does not raise NameError when
# missing_dois is empty and the loop never binds `counter`.
counter = 0
for counter, ref in enumerate(missing_dois):
    # Skip references that already have a cached DOI.
    if doi_cache.get(ref):
        continue
    title = pubmed_client.get_title(ref)
    if not title:
        # Each print gets a single pre-formatted argument so the output
        # is the same under the Python 2 statement and Python 3 function.
        print("No title, skipping %s" % ref)
        continue
    doi = crossref_client.doi_query(title)
    if not doi:
        print("No DOI for %s: %s" % (ref, title))
        continue
    doi_cache[ref] = doi
    print("%d: %s --> %s" % (counter, ref, doi))
    # NOTE(review): this checkpoint is only reached when a DOI was just
    # found, so a multiple of 100 that was skipped above writes no
    # checkpoint. Kept as in the original -- confirm this is intended.
    if counter % 100 == 0:
        save(doi_cache, counter)

# Final write once every reference has been processed.
save(doi_cache, counter)