Esempio n. 1
0
def id_lookup(paper_id, idtype):
    """Take an ID of type PMID, PMCID, or DOI and look up the other IDs.

    The PMC ID lookup is consulted first. If it does not yield a DOI, a
    reverse lookup of the DOI is attempted in CrossRef using the PMID.

    Parameters
    ----------
    paper_id : str
        ID of the article.
    idtype : str
        Type of the ID: 'pmid', 'pmcid', or 'doi'.

    Returns
    -------
    ids : dict
        A dictionary with the following keys: pmid, pmcid and doi. A value
        may be None if the corresponding ID could not be determined.

    Raises
    ------
    ValueError
        If `idtype` is not one of 'pmid', 'pmcid' or 'doi'.
    """
    if idtype not in ('pmid', 'pmcid', 'doi'):
        raise ValueError("Invalid idtype %s; must be 'pmid', 'pmcid', "
                         "or 'doi'." % idtype)

    ids = {'doi': None, 'pmid': None, 'pmcid': None}
    pmc_id_results = pmc_client.id_lookup(paper_id, idtype)
    # Start with the results of the PMC lookup and then override with the
    # provided ID
    ids['pmid'] = pmc_id_results.get('pmid')
    ids['pmcid'] = pmc_id_results.get('pmcid')
    ids['doi'] = pmc_id_results.get('doi')
    ids[idtype] = paper_id
    # If we gave a DOI, our work is done after looking for PMID and PMCID.
    # If we gave a PMID or PMCID and got a DOI back, we're done as well.
    if idtype == 'doi' or ids.get('doi'):
        return ids
    # If we get here, then we've given PMID or PMCID and don't have a DOI
    # yet. The CrossRef reverse lookup below is keyed on the PMID, so without
    # one we bail and return what we have with a warning. This is an explicit
    # check rather than an assert so that it still holds under ``python -O``.
    if not ids.get('pmid'):
        if ids.get('pmcid'):
            logger.warning('%s: PMCID without PMID or DOI', ids.get('pmcid'))
        else:
            logger.warning('%s: no PMID available for CrossRef DOI lookup',
                           paper_id)
        return ids
    # As a last resort, we try to get the DOI from CrossRef (which internally
    # tries to get the DOI from Pubmed in the process of collecting the
    # necessary metadata for the lookup):
    ids['doi'] = crossref_client.doi_query(ids['pmid'])
    # It may still be None, but at this point there's nothing we can do...
    return ids
Esempio n. 2
0
def id_lookup(paper_id, idtype):
    """Take an ID of type PMID, PMCID, or DOI and look up the other IDs.

    If the PMC ID lookup does not return a DOI, try to obtain the DOI by
    doing a reverse lookup in CrossRef based on the PMID.

    Parameters
    ----------
    paper_id : string
        ID of the article.
    idtype : 'pmid', 'pmcid', or 'doi'
        Type of the ID.

    Returns
    -------
    ids : dict
        A dictionary with the following keys: pmid, pmcid and doi. Any of
        the values may be None when that ID could not be determined.

    Raises
    ------
    ValueError
        If `idtype` is not 'pmid', 'pmcid' or 'doi'.
    """
    if idtype not in ('pmid', 'pmcid', 'doi'):
        raise ValueError("Invalid idtype %s; must be 'pmid', 'pmcid', "
                         "or 'doi'." % idtype)

    pmc_id_results = pmc_client.id_lookup(paper_id, idtype)
    # Start with the results of the PMC lookup and then override with the
    # provided ID
    ids = {
        'doi': pmc_id_results.get('doi'),
        'pmid': pmc_id_results.get('pmid'),
        'pmcid': pmc_id_results.get('pmcid'),
    }
    ids[idtype] = paper_id

    # Nothing more to do if a DOI was given, or if a PMID/PMCID was given
    # and the PMC lookup already produced a DOI.
    if idtype == 'doi' or ids.get('doi'):
        return ids

    # From here on we were given a PMID or PMCID and still have no DOI.
    # The CrossRef reverse lookup needs a PMID; if there is none, return
    # what we have with a warning. An explicit check is used instead of an
    # assert because asserts are stripped when Python runs with -O.
    if not ids.get('pmid'):
        if ids.get('pmcid'):
            logger.warning('%s: PMCID without PMID or DOI', ids.get('pmcid'))
        else:
            logger.warning('%s: no PMID available for CrossRef DOI lookup',
                           paper_id)
        return ids

    # As a last resort, we try to get the DOI from CrossRef (which internally
    # tries to get the DOI from Pubmed in the process of collecting the
    # necessary metadata for the lookup). The result may still be None, but
    # at this point there's nothing else we can do.
    ids['doi'] = crossref_client.doi_query(ids['pmid'])
    return ids
Esempio n. 3
0
def get_citation_count_for_pmid(pmid: str) -> Union[int, None]:
    """Look up the number of citations recorded for a PMID.

    The PMID is first resolved to a DOI through the CrossRef API, and the
    DOI is then passed to the COCI API to obtain the citation count.

    None is returned when no DOI could be found for the PMID. Note that
    the COCI API reports a count of 0 for DOIs that it has not indexed.

    Parameters
    ----------
    pmid :
        The PMID whose citation count is requested.

    Returns
    -------
    :
        The citation count for the PMID, or None if the DOI lookup failed.
    """
    resolved_doi = doi_query(pmid)
    # A falsy DOI means the lookup failed, so there is nothing to count
    return get_citation_count_for_doi(resolved_doi) if resolved_doi else None
Esempio n. 4
0
def test_doi_query():
    """Check that the example PMID is mapped to the example DOI."""
    pmid = example_ids['pmid']
    result = crossref_client.doi_query(pmid)
    expected = example_ids['doi']
    assert result == expected
    # The returned DOI must also pass the unicode_strs check
    assert unicode_strs(result)
Esempio n. 5
0
# Build a lookup from the first column of the tab-separated map file to a
# (second column, DOI) pair; an empty DOI column is stored as None.
# NOTE(review): judging by the file name the columns are presumably
# PMID, PMCID and DOI -- confirm against the file that produced it.
pmid_map = {}
with open('pmid_pmcid_doi_map.txt') as f:
    csvreader = csv.reader(f, delimiter='\t')
    for row in csvreader:
        doi = row[2] or None
        pmid_map[row[0]] = (row[1], doi)

# NOTE(review): despite the .pkl extension this file is read as plain text,
# one reference ID per line.
with open('no_cached_doi.pkl') as f:
    no_cached_doi = [line.strip('\n') for line in f]

# Get a random sample of (up to) 100 non-cached DOIs. The sample size is
# capped at the number of available entries because sampling without
# replacement cannot draw more items than exist.
row_indices = range(len(no_cached_doi))
sample_size = min(100, len(no_cached_doi))
sample_indices = np.random.choice(row_indices, size=sample_size,
                                  replace=False)

# Compare the DOI returned by CrossRef with the one from the map file.
# The print calls take a single pre-formatted string so that the output is
# the same under Python 2 and Python 3.
total_tests = 0
num_passes = 0
for sample_ix in sample_indices:
    ref = no_cached_doi[sample_ix]
    pm_doi = pmid_map[ref][1]
    xref_doi = crossref_client.doi_query(ref)
    if not xref_doi:
        print("No DOI found, skipping")
        continue
    total_tests += 1
    # pm_doi is None when the map file had no DOI for this reference; that
    # counts as a mismatch instead of crashing on None.lower().
    if pm_doi is not None and xref_doi.lower() == pm_doi.lower():
        num_passes += 1
        print("Pass: %s / %s" % (num_passes, total_tests))
    else:
        print("Fail: pm: %s, xref: %s" % (pm_doi, xref_doi))

Esempio n. 6
0
def test_doi_query():
    """Verify the DOI that CrossRef returns for the example PMID."""
    doi_from_crossref = crossref_client.doi_query(example_ids['pmid'])
    # First the value itself, then the unicode_strs check on it
    assert doi_from_crossref == example_ids['doi']
    assert unicode_strs(doi_from_crossref)
Esempio n. 7
0
# Read the references to process, one per line, dropping the newline
# characters at the ends of each line.
with open('missing_dois.txt') as f:
    missing_dois = [entry.strip('\n') for entry in f]

def save(doi_cache, counter):
    """Write the DOI cache to a tab-separated checkpoint file.

    The file is created in the current working directory and is named
    ``doi_cache_<counter>.txt``, with the counter zero-padded to at least
    five digits.

    Parameters
    ----------
    doi_cache : dict
        Mapping whose key/value pairs are written one pair per row.
    counter : int
        Number used to build the name of the output file.
    """
    with open('doi_cache_%.5d.txt' % counter, 'w') as f:
        print("Writing to doi cache")
        csvwriter = csv.writer(f, delimiter='\t')
        # items() is available on both Python 2 and Python 3 dicts, whereas
        # iteritems() exists only on Python 2.
        csvwriter.writerows(doi_cache.items())

# Pre-bind the counter so that the final save below also works when there
# are no references to process (enumerate() would never assign it).
counter = 0
for counter, ref in enumerate(missing_dois):
    # Only look up references that do not have a cached DOI yet
    if not doi_cache.get(ref):
        title = pubmed_client.get_title(ref)
        if not title:
            print("No title, skipping %s" % ref)
        else:
            doi = crossref_client.doi_query(title)
            if doi:
                doi_cache[ref] = doi
                print("%d: %s --> %s" % (counter, ref, doi))
            else:
                print("No DOI for %s: %s" % (ref, title))

    # Checkpoint on every 100th reference regardless of how the lookup went,
    # so that a skipped or failed entry cannot cause a checkpoint to be
    # missed.
    if counter % 100 == 0:
        save(doi_cache, counter)

# Final save once all references have been processed
save(doi_cache, counter)