def process_pmc(pmc_id, offline=False):
    """Return a ReachProcessor by processing a paper with a given PMC id.

    The full text is fetched with the PMC client, written to the file
    ``<pmc_id>.nxml`` and then handed to process_nxml_file. If the full
    text is not available, None is returned.

    Parameters
    ----------
    pmc_id : str
        The ID of a PubmedCentral article. The string may start with PMC
        but passing just the ID also works.
        Examples: 3717945, PMC3717945
        https://www.ncbi.nlm.nih.gov/pmc/
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by
        default) the web service is called. Default: False

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements, or None if no full text could be obtained.
    """
    nxml_text = pmc_client.get_xml(pmc_id)
    if nxml_text is None:
        return None

    # Store the article next to the caller so REACH can read it from disk
    nxml_path = pmc_id + '.nxml'
    with open(nxml_path, 'wb') as out:
        out.write(nxml_text.encode('utf-8'))

    # The PMID (None when it cannot be resolved) is passed on as citation
    citation_pmid = id_lookup(pmc_id, 'pmcid').get('pmid')
    return process_nxml_file(nxml_path, citation=citation_pmid,
                             offline=offline)
def process_pmc(pmc_id, offline=False, output_fname=default_output_fname):
    """Return a ReachProcessor by processing a paper with a given PMC id.

    The full text is fetched with the PMC client, written to the file
    ``<pmc_id>.nxml`` and then handed to process_nxml_file. If the full
    text is not available, None is returned.

    Parameters
    ----------
    pmc_id : str
        The ID of a PubmedCentral article. The string may start with PMC
        but passing just the ID also works.
        Examples: 3717945, PMC3717945
        https://www.ncbi.nlm.nih.gov/pmc/
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by
        default) the web service is called. Default: False
    output_fname : Optional[str]
        Forwarded unchanged to process_nxml_file as its output_fname
        argument.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements, or None if no full text could be obtained.
    """
    article_xml = pmc_client.get_xml(pmc_id)
    if article_xml is None:
        return None

    local_nxml = pmc_id + '.nxml'
    with open(local_nxml, 'wb') as nxml_file:
        nxml_file.write(article_xml.encode('utf-8'))

    # Resolve the PMID so it can be passed on as the citation
    id_map = id_lookup(pmc_id, 'pmcid')
    return process_nxml_file(local_nxml, citation=id_map.get('pmid'),
                             offline=offline, output_fname=output_fname)
def test_extract_text():
    """Text extracted from the XML of article 4322985 has a known heading."""
    nxml = pmc_client.get_xml('4322985')
    extracted = pmc_client.extract_text(nxml)
    assert extracted is not None
    assert 'RAS VS BRAF ONCOGENES AND TARGETED THERAPIES' in extracted
    assert unicode_strs(extracted)
def process_pmc(pmc_id, offline=False, url=None,
                output_fname=default_output_fname):
    """Return a ReachProcessor by processing a paper with a given PMC id.

    Uses the PMC client to obtain the full text. If it's not available,
    None is returned.

    Parameters
    ----------
    pmc_id : str
        The ID of a PubmedCentral article. The string may start with PMC
        but passing just the ID also works.
        Examples: 3717945, PMC3717945
        https://www.ncbi.nlm.nih.gov/pmc/
    offline : Optional[bool]
        If set to True, the REACH system is run offline via a JAR file.
        Otherwise (by default) the web service is called. Default: False
    url : Optional[str]
        URL for a REACH web service instance, which is used for reading
        if provided. If not provided but offline is set to False (its
        default value), the Arizona REACH web service is called
        (http://agathon.sista.arizona.edu:8080/odinweb/api/help).
        Default: None
    output_fname : Optional[str]
        The file to output the REACH JSON output to.
        Defaults to reach_output.json in current working directory.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    """
    # Step 1: fetch the article from PMC; give up if there is no full text
    logger.info('Loading %s from PMC' % pmc_id)
    nxml_text = pmc_client.get_xml(pmc_id)
    if nxml_text is None:
        return None

    # Step 2: persist the NXML as <pmc_id>.nxml in the working folder
    nxml_path = pmc_id + '.nxml'
    with open(nxml_path, 'wb') as nxml_file:
        nxml_file.write(nxml_text.encode('utf-8'))

    # Step 3: look up the PMID so that the pmid attribute of the evidence
    # can be set correctly
    logger.info('Looking up PMID for %s' % pmc_id)
    citation_pmid = id_lookup(pmc_id, 'pmcid').get('pmid')

    # Step 4: read the stored file with REACH using the caller's settings
    logger.info('Processing %s with REACH' % pmc_id)
    return process_nxml_file(nxml_path, citation=citation_pmid,
                             offline=offline, url=url,
                             output_fname=output_fname)
def get_sample(pmids, k, fname):
    """Download PMC XML for a random sample of the given PMIDs.

    The PMIDs are shuffled in place and then visited in that order. Every
    PMC ID that can be resolved is written to ``fname`` (one per line) and
    its XML, when one is returned, is saved as
    ``docs/pmc_xmls/<pmcid>.nxml``. The loop stops once ``k`` XML files
    have been saved.

    Parameters
    ----------
    pmids : list
        PMIDs to sample from. Note that the list is reordered in place.
    k : int
        Number of XML files to save before stopping.
    fname : str
        Path of the file that receives the resolved PMC IDs.
    """
    # NOTE(review): block nesting was reconstructed from a source whose
    # indentation was lost - confirm against the original file.
    random.shuffle(pmids)
    n_saved = 0
    with open(fname, 'w') as id_file:
        for pmid in pmids:
            pmcid = id_lookup(pmid, 'pmid').get('pmcid')
            if pmcid:
                id_file.write('%s\n' % pmcid)
                print('Downloading %s' % pmcid)
                xml = pmc_client.get_xml(pmcid)
                if xml:
                    xml_path = 'docs/pmc_xmls/%s.nxml' % pmcid
                    with open(xml_path, 'w') as xml_file:
                        xml_file.write(xml)
                    n_saved += 1
            if n_saved == k:
                break
def get_text_content_for_pmids(pmids):
    """Get text content for articles given a list of their pmids

    Full text XML from PMC is used for the PMIDs that
    pmc_client.filter_pmids reports as having full text. For every other
    PMID, and for any PMID whose PMC ID or XML could not be obtained, the
    abstract is fetched through pubmed_client instead.

    Parameters
    ----------
    pmids : list of str

    Returns
    -------
    text_content : list of str
        The PMC XML strings followed by the abstracts. Articles for which
        no content could be retrieved are left out.
    """
    fulltext_pmids = set(pmc_client.filter_pmids(pmids,
                                                 source_type='fulltext'))

    pmc_xmls = []
    # PMIDs whose full text was actually obtained. Tracking these in a
    # separate set avoids mutating fulltext_pmids while iterating over it
    # (which raises RuntimeError) and lets failed downloads fall back to
    # the abstract.
    covered_pmids = set()
    for pmid in fulltext_pmids:
        pmc_id = pmc_client.id_lookup(pmid, idtype='pmid').get('pmcid')
        if not pmc_id:
            continue
        xml = pmc_client.get_xml(pmc_id)
        # Keep the original 0.5 s pause after each XML request
        time.sleep(0.5)
        if xml is not None:
            pmc_xmls.append(xml)
            covered_pmids.add(pmid)

    abstracts = []
    for pmid in set(pmids) - covered_pmids:
        abstract = pubmed_client.get_abstract(pmid)
        if abstract is not None:
            abstracts.append(abstract)
        time.sleep(0.5)

    return pmc_xmls + abstracts
def get_text_content_for_pmids(pmids):
    """Get text content for articles given a list of their pmids

    PMIDs that pmc_client.filter_pmids reports as having full text are
    resolved to PMC IDs and their XML is downloaded. All remaining PMIDs,
    including those whose PMC ID lookup or XML download failed, are
    served from their abstract via pubmed_client.

    Parameters
    ----------
    pmids : list of str

    Returns
    -------
    text_content : list of str
        The PMC XML strings followed by the abstracts. Articles for which
        no content could be retrieved are left out.
    """
    candidates = set(pmc_client.filter_pmids(pmids, source_type='fulltext'))

    pmc_xmls = []
    # Collect the PMIDs that still need an abstract rather than editing
    # the candidate set during iteration (a RuntimeError in Python) or
    # calling list methods on a set.
    needs_abstract = set(pmids) - candidates
    for pmid in candidates:
        pmc_id = pmc_client.id_lookup(pmid, idtype='pmid').get('pmcid')
        if not pmc_id:
            needs_abstract.add(pmid)
            continue
        xml = pmc_client.get_xml(pmc_id)
        # Keep the original 0.5 s pause after each XML request
        time.sleep(0.5)
        if xml is None:
            # The download failed, so fall back to the abstract
            needs_abstract.add(pmid)
        else:
            pmc_xmls.append(xml)

    abstracts = []
    for pmid in needs_abstract:
        abstracts.append(pubmed_client.get_abstract(pmid))
        time.sleep(0.5)

    return pmc_xmls + [text for text in abstracts if text is not None]
def test_universal_extract_paragraphs_pmc():
    """More than one paragraph is extracted from the XML of PMC3262597."""
    nxml = pmc_client.get_xml('PMC3262597')
    extracted = universal_extract_paragraphs(nxml)
    # The extracted paragraphs double as the failure message
    assert len(extracted) > 1, extracted
def test_get_xml_invalid():
    """get_xml is expected to return None for the ID 9999999."""
    result = pmc_client.get_xml('9999999')
    assert result is None
def test_get_xml_PMC():
    """get_xml accepts an ID carrying the PMC prefix and returns unicode."""
    article_id = 'PMC4322985'
    nxml = pmc_client.get_xml(article_id)
    assert nxml is not None
    # Both the ID and the returned XML must pass the unicode check
    assert unicode_strs((article_id, nxml))
def test_universal_extract_paragraphs_pmc():
    """universal_extract_paragraphs yields several paragraphs for PMC XML."""
    article_xml = pmc_client.get_xml('PMC3262597')
    n_paragraphs = len(universal_extract_paragraphs(article_xml))
    assert n_paragraphs > 1
def test_extract_text():
    """extract_text returns unicode text for the XML of article 4322985."""
    article_xml = pmc_client.get_xml('4322985')
    plain_text = pmc_client.extract_text(article_xml)
    assert plain_text is not None
    assert unicode_strs(plain_text)
def get_full_text(paper_id, idtype, preferred_content_type='text/xml'):
    """Return the content and the content type of an article.

    This function retrieves the content of an article by its PubMed ID,
    PubMed Central ID, or DOI. It prioritizes full text content when
    available and returns an abstract from PubMed as a fallback.

    Parameters
    ----------
    paper_id : string
        ID of the article.
    idtype : 'pmid', 'pmcid', or 'doi'
        Type of the ID.
    preferred_content_type : Optional[str]
        Preference for full-text format, if available. Can be one of
        'text/xml', 'text/plain', 'application/pdf'. Default: 'text/xml'
        The value is validated, but it is not otherwise used while the
        CrossRef click-through download is disabled (see FIXME below).

    Returns
    -------
    content : str
        The content of the article, or None if nothing could be obtained.
    content_type : str
        One of 'pmc_oa_xml', 'elsevier_xml' or 'abstract', or None if no
        content could be obtained.

    Raises
    ------
    ValueError
        If preferred_content_type is not one of the allowed values.
    """
    if preferred_content_type not in \
            ('text/xml', 'text/plain', 'application/pdf'):
        raise ValueError("preferred_content_type must be one of 'text/xml', "
                         "'text/plain', or 'application/pdf'.")
    ids = id_lookup(paper_id, idtype)
    pmcid = ids.get('pmcid')
    pmid = ids.get('pmid')
    doi = ids.get('doi')

    # First try to find paper via PMC
    if pmcid:
        nxml = pmc_client.get_xml(pmcid)
        if nxml:
            return nxml, 'pmc_oa_xml'

    # If we got here, it means we didn't find the full text in PMC, so we'll
    # need either the DOI (for lookup in CrossRef) and/or the PMID (so we
    # can fall back on the abstract). If by some strange turn we have
    # neither, give up now.
    if not doi and not pmid:
        return (None, None)

    if doi:
        publisher = crossref_client.get_publisher(doi)
        # For Elsevier, use the Elsevier client directly, because the
        # Clickthrough API key seems unreliable. Return full XML.
        if publisher == 'Elsevier BV':
            logger.info('Elsevier: %s' % pmid)
            try:
                article_xml = elsevier_client.download_article(doi)
            except Exception as e:
                # Best effort: a failed download must not prevent the
                # abstract fallback below
                logger.error("Error downloading Elsevier article: %s" % e)
                article_xml = None
            if article_xml is not None:
                return (article_xml, 'elsevier_xml')
        # FIXME: There is not yet a way to process non-Elsevier content
        # obtained from CrossRef, which includes both XML of unknown format
        # and PDFs. The download of CrossRef full text links (using the
        # CR-Clickthrough-Client-Token header) and the ASBMB-specific
        # retrieval were therefore disabled here.

    # No usable full text: fall back to the abstract, which requires a
    # PMID. Without a DOI we are guaranteed to have one at this point; with
    # a useless DOI and no PMID we give up.
    if not pmid:
        return (None, None)
    abstract = pubmed_client.get_abstract(pmid)
    if abstract is None:
        return (None, None)
    return abstract, 'abstract'
def test_get_xml():
    """get_xml returns unicode XML for an ID given without the PMC prefix."""
    bare_id = '4322985'
    nxml = pmc_client.get_xml(bare_id)
    assert nxml is not None
    # Both the ID and the returned XML must pass the unicode check
    assert unicode_strs((bare_id, nxml))
def test_get_xml_invalid():
    """No XML is expected for the ID 9999999, so get_xml must give None."""
    missing_id = '9999999'
    assert pmc_client.get_xml(missing_id) is None
def test_get_xml_PMC():
    """XML for a PMC-prefixed ID is returned and is unicode."""
    prefixed_id = 'PMC4322985'
    content = pmc_client.get_xml(prefixed_id)
    assert content is not None
    checked = (prefixed_id, content)
    assert unicode_strs(checked)
def get_full_text(paper_id, idtype, preferred_content_type='text/xml'):
    """Return the content and the content type of an article.

    This function retrieves the content of an article by its PubMed ID,
    PubMed Central ID, or DOI. It prioritizes full text content when
    available and returns an abstract from PubMed as a fallback.

    Parameters
    ----------
    paper_id : string
        ID of the article.
    idtype : 'pmid', 'pmcid', or 'doi'
        Type of the ID.
    preferred_content_type : Optional[str]
        Preference for full-text format, if available. Can be one of
        'text/xml', 'text/plain', 'application/pdf'. Default: 'text/xml'
        The value is validated, but it is not otherwise used while the
        CrossRef click-through download is disabled (see FIXME below).

    Returns
    -------
    content : str
        The content of the article, or None if nothing could be obtained.
    content_type : str
        One of 'pmc_oa_xml', 'elsevier_xml' or 'abstract', or None if no
        content could be obtained.

    Raises
    ------
    ValueError
        If preferred_content_type is not one of the allowed values.
    """
    allowed_types = ('text/xml', 'text/plain', 'application/pdf')
    if preferred_content_type not in allowed_types:
        raise ValueError("preferred_content_type must be one of 'text/xml', "
                         "'text/plain', or 'application/pdf'.")
    ids = id_lookup(paper_id, idtype)
    pmcid = ids.get('pmcid')
    pmid = ids.get('pmid')
    doi = ids.get('doi')

    # First choice: the NXML from PMC
    if pmcid:
        nxml = pmc_client.get_xml(pmcid)
        if nxml:
            return nxml, 'pmc_oa_xml'

    # Without PMC full text we need the DOI (for the publisher lookup in
    # CrossRef) and/or the PMID (for the abstract). With neither there is
    # nothing left to try.
    if not doi and not pmid:
        return (None, None)

    # Second choice: publisher full text, currently Elsevier only. The
    # Elsevier client is used directly because the Clickthrough API key
    # seems unreliable.
    if doi and crossref_client.get_publisher(doi) == 'Elsevier BV':
        logger.info('Elsevier: %s' % pmid)
        try:
            article_xml = elsevier_client.download_article(doi)
        except Exception as e:
            # Best effort: a failed download must not prevent the abstract
            # fallback below
            logger.error("Error downloading Elsevier article: %s" % e)
            article_xml = None
        if article_xml is not None:
            return (article_xml, 'elsevier_xml')
    # FIXME: There is not yet a way to process non-Elsevier content obtained
    # from CrossRef, which includes both XML of unknown format and PDFs. The
    # download of CrossRef full text links (using the
    # CR-Clickthrough-Client-Token header) and the ASBMB-specific retrieval
    # were therefore disabled here.

    # Last resort: the abstract, which requires a PMID. A DOI alone is of
    # no use at this point.
    if not pmid:
        return (None, None)
    abstract = pubmed_client.get_abstract(pmid)
    if abstract is None:
        return (None, None)
    return abstract, 'abstract'
def test_get_xml_invalid():
    """The ID 9999999 is expected to have no XML available."""
    xml_or_none = pmc_client.get_xml('9999999')
    assert xml_or_none is None
def get_nxml(pmc_id):
    """Download the XML of a PMC article into the file ``<pmc_id>.nxml``.

    Parameters
    ----------
    pmc_id : str
        The PMC ID of the article; it is also used as the stem of the
        output file name.

    Returns
    -------
    fname : str
        The name of the file that was written, or None if the PMC client
        returned no XML (in which case no file is created).
    """
    xml_str = pmc_client.get_xml(pmc_id)
    # Check before opening the file: otherwise an empty file is left
    # behind and encoding None raises an AttributeError
    if xml_str is None:
        return None
    fname = pmc_id + '.nxml'
    with open(fname, 'wb') as fh:
        fh.write(xml_str.encode('utf-8'))
    return fname
def test_get_xml_PMC():
    """get_xml returns content for an ID carrying the PMC prefix."""
    content = pmc_client.get_xml('PMC4322985')
    assert content is not None