Exemple #1
0
def process_pmc(pmc_id, offline=False):
    """Fetch the full text of a PMC article and process it with REACH.

    The NXML content is obtained through the PMC client and saved as
    ``<pmc_id>.nxml`` (a relative path) before being passed on to
    process_nxml_file. The PMID found by looking up the PMC ID is passed
    along as the citation; it is None if the lookup has no 'pmid' entry.

    Parameters
    ----------
    pmc_id : str
        The ID of a PubmedCentral article. The string may start with PMC but
        passing just the ID also works.
        Examples: 3717945, PMC3717945
        https://www.ncbi.nlm.nih.gov/pmc/
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by
        default) the web service is called. Default: False

    Returns
    -------
    rp : ReachProcessor or None
        The result of process_nxml_file for the downloaded article, or None
        if the PMC client returned no content for the given ID.
    """
    nxml_content = pmc_client.get_xml(pmc_id)
    if nxml_content is None:
        # No full text available, nothing to process
        return None
    # Save the content to disk since processing works on a file
    nxml_path = pmc_id + '.nxml'
    encoded = nxml_content.encode('utf-8')
    with open(nxml_path, 'wb') as out_file:
        out_file.write(encoded)
    # The PMID (if any) is used as the citation
    citation_pmid = id_lookup(pmc_id, 'pmcid').get('pmid')
    return process_nxml_file(nxml_path, citation=citation_pmid,
                             offline=offline)
Exemple #2
0
def process_pmc(pmc_id, offline=False, output_fname=default_output_fname):
    """Fetch the full text of a PMC article and process it with REACH.

    The NXML content is obtained through the PMC client and saved as
    ``<pmc_id>.nxml`` (a relative path) before being passed on to
    process_nxml_file. The PMID found by looking up the PMC ID is passed
    along as the citation; it is None if the lookup has no 'pmid' entry.

    Parameters
    ----------
    pmc_id : str
        The ID of a PubmedCentral article. The string may start with PMC but
        passing just the ID also works.
        Examples: 3717945, PMC3717945
        https://www.ncbi.nlm.nih.gov/pmc/
    offline : Optional[bool]
        If set to True, the REACH system is run offline. Otherwise (by
        default) the web service is called. Default: False
    output_fname : Optional[str]
        Forwarded unchanged to process_nxml_file as its output_fname
        argument. Default: default_output_fname

    Returns
    -------
    rp : ReachProcessor or None
        The result of process_nxml_file for the downloaded article, or None
        if the PMC client returned no content for the given ID.
    """
    nxml_content = pmc_client.get_xml(pmc_id)
    if nxml_content is None:
        # No full text available, nothing to process
        return None
    # Save the content to disk since processing works on a file
    nxml_path = pmc_id + '.nxml'
    encoded = nxml_content.encode('utf-8')
    with open(nxml_path, 'wb') as out_file:
        out_file.write(encoded)
    # The PMID (if any) is used as the citation
    citation_pmid = id_lookup(pmc_id, 'pmcid').get('pmid')
    return process_nxml_file(nxml_path, citation=citation_pmid,
                             offline=offline, output_fname=output_fname)
Exemple #3
0
def test_extract_text():
    """Extract text from the XML of PMC article 4322985 and check it."""
    nxml = pmc_client.get_xml('4322985')
    extracted = pmc_client.extract_text(nxml)
    assert extracted is not None
    # A heading expected to appear in the extracted text
    assert 'RAS VS BRAF ONCOGENES AND TARGETED THERAPIES' in extracted
    assert unicode_strs(extracted)
Exemple #4
0
def process_pmc(pmc_id,
                offline=False,
                url=None,
                output_fname=default_output_fname):
    """Fetch the full text of a PMC article and process it with REACH.

    The NXML content is obtained through the PMC client and saved as
    ``<pmc_id>.nxml`` in the working folder before being passed on to
    process_nxml_file. The PMID found by looking up the PMC ID is passed
    along as the citation; it is None if the lookup has no 'pmid' entry.

    Parameters
    ----------
    pmc_id : str
        The ID of a PubmedCentral article. The string may start with PMC but
        passing just the ID also works.
        Examples: 3717945, PMC3717945
        https://www.ncbi.nlm.nih.gov/pmc/
    offline : Optional[bool]
        If set to True, the REACH system is run offline via a JAR file.
        Otherwise (by default) the web service is called. Default: False
    url : Optional[str]
        URL for a REACH web service instance, which is used for reading if
        provided. If not provided but offline is set to False (its default
        value), the Arizona REACH web service is called
        (http://agathon.sista.arizona.edu:8080/odinweb/api/help).
        Default: None
    output_fname : Optional[str]
        The file to output the REACH JSON output to.
        Defaults to reach_output.json in current working directory.

    Returns
    -------
    rp : ReachProcessor or None
        The result of process_nxml_file for the downloaded article, or None
        if the PMC client returned no content for the given ID.
    """
    # Step 1: download the article content from PMC
    logger.info('Loading %s from PMC' % pmc_id)
    nxml_content = pmc_client.get_xml(pmc_id)
    if nxml_content is None:
        return None
    # Step 2: save it to a file since processing works on a file
    nxml_path = pmc_id + '.nxml'
    encoded = nxml_content.encode('utf-8')
    with open(nxml_path, 'wb') as out_file:
        out_file.write(encoded)
    # Step 3: find the PMID so that it can be used as the citation
    logger.info('Looking up PMID for %s' % pmc_id)
    citation_pmid = id_lookup(pmc_id, 'pmcid').get('pmid')
    # Step 4: run REACH on the saved file with the caller's settings
    logger.info('Processing %s with REACH' % pmc_id)
    reach_kwargs = dict(citation=citation_pmid,
                        offline=offline,
                        url=url,
                        output_fname=output_fname)
    return process_nxml_file(nxml_path, **reach_kwargs)
Exemple #5
0
def get_sample(pmids, k, fname):
    """Download PMC full texts for a random sample of up to k articles.

    The PMIDs are visited in random order. For each one that has a PMC ID,
    the PMC ID is written as a line to `fname` and a download is attempted;
    successfully downloaded XML is saved as docs/pmc_xmls/<pmcid>.nxml.
    The process stops once k articles have been downloaded or the PMIDs
    run out.

    Parameters
    ----------
    pmids : iterable of str
        The PMIDs to sample from. The caller's object is not modified.
    k : int
        The number of articles to download. If k <= 0, nothing is looked
        up or downloaded and `fname` is left empty.
    fname : str
        Path of the file listing every PMC ID for which a download was
        attempted (including attempts that yielded no XML).
    """
    # Shuffle a private copy so the caller's sequence keeps its order
    # (this also allows tuples and other non-list iterables)
    shuffled = list(pmids)
    random.shuffle(shuffled)
    done = 0
    with open(fname, 'w') as fh:
        for pmid in shuffled:
            # Checked before doing any work so that k <= 0 downloads
            # nothing instead of never reaching the stop condition
            if done >= k:
                break
            pmcid = id_lookup(pmid, 'pmid').get('pmcid')
            if not pmcid:
                continue
            fh.write('%s\n' % pmcid)
            print('Downloading %s' % pmcid)
            xml = pmc_client.get_xml(pmcid)
            if not xml:
                continue
            with open('docs/pmc_xmls/%s.nxml' % pmcid, 'w') as xfh:
                xfh.write(xml)
            done += 1
Exemple #6
0
def get_text_content_for_pmids(pmids):
    """Get text content for articles given a list of their pmids

    For the PMIDs that pmc_client.filter_pmids reports as having full
    text, the PMC XML is fetched. Every other PMID, as well as any PMID for
    which no PMC ID or no XML could be obtained, falls back to the abstract
    from PubMed. Articles for which nothing is found are left out.

    Parameters
    ----------
    pmids : list of str
        The PMIDs of the articles to get content for.

    Returns
    -------
    text_content : list of str
        The PMC XML strings followed by the abstracts. The order within
        each group does not follow the order of the input PMIDs.
    """
    fulltext_pmids = set(pmc_client.filter_pmids(pmids,
                                                 source_type='fulltext'))

    # Resolve PMC IDs into a separate mapping rather than discarding from
    # the set being iterated, which raises a RuntimeError
    pmid_to_pmcid = {}
    for pmid in fulltext_pmids:
        pmc_id = pmc_client.id_lookup(pmid, idtype='pmid').get('pmcid')
        if pmc_id:
            pmid_to_pmcid[pmid] = pmc_id

    # Keep track of which PMIDs actually yielded full text so that all the
    # others can fall back to the abstract
    pmc_xmls = []
    fetched_pmids = set()
    for pmid, pmc_id in pmid_to_pmcid.items():
        pmc_xml = pmc_client.get_xml(pmc_id)
        if pmc_xml is not None:
            pmc_xmls.append(pmc_xml)
            fetched_pmids.add(pmid)
        # Pause between requests to the remote service
        time.sleep(0.5)

    abstracts = []
    for pmid in set(pmids) - fetched_pmids:
        abstract = pubmed_client.get_abstract(pmid)
        if abstract is not None:
            abstracts.append(abstract)
        time.sleep(0.5)

    return pmc_xmls + abstracts
Exemple #7
0
def get_text_content_for_pmids(pmids):
    """Get text content for articles given a list of their pmids

    For the PMIDs that pmc_client.filter_pmids reports as having full
    text, the PMC XML is fetched. Every other PMID, as well as any PMID for
    which no PMC ID or no XML could be obtained, falls back to the abstract
    from PubMed. Articles for which nothing is found are left out.

    Parameters
    ----------
    pmids : list of str
        The PMIDs of the articles to get content for.

    Returns
    -------
    text_content : list of str
        The PMC XML strings followed by the abstracts. The order within
        each group does not follow the order of the input PMIDs.
    """
    fulltext_pmids = set(pmc_client.filter_pmids(pmids,
                                                 source_type='fulltext'))

    # Resolve PMC IDs into a separate mapping rather than discarding from
    # the set being iterated, which raises a RuntimeError
    pmid_to_pmcid = {}
    for pmid in fulltext_pmids:
        pmc_id = pmc_client.id_lookup(pmid, idtype='pmid').get('pmcid')
        if pmc_id:
            pmid_to_pmcid[pmid] = pmc_id

    # Keep track of which PMIDs actually yielded full text so that all the
    # others can fall back to the abstract (the previous bookkeeping called
    # append on a set and referred to a stale loop variable)
    pmc_xmls = []
    fetched_pmids = set()
    for pmid, pmc_id in pmid_to_pmcid.items():
        pmc_xml = pmc_client.get_xml(pmc_id)
        if pmc_xml is not None:
            pmc_xmls.append(pmc_xml)
            fetched_pmids.add(pmid)
        # Pause between requests to the remote service
        time.sleep(0.5)

    abstracts = []
    for pmid in set(pmids) - fetched_pmids:
        abstract = pubmed_client.get_abstract(pmid)
        if abstract is not None:
            abstracts.append(abstract)
        time.sleep(0.5)

    return pmc_xmls + abstracts
Exemple #8
0
def test_universal_extract_paragraphs_pmc():
    """Check that more than one paragraph is extracted from PMC XML."""
    nxml = pmc_client.get_xml('PMC3262597')
    extracted = universal_extract_paragraphs(nxml)
    # Show the extracted paragraphs in the failure message
    assert len(extracted) > 1, extracted
Exemple #9
0
def test_get_xml_invalid():
    """Check that get_xml returns None for the ID 9999999."""
    result = pmc_client.get_xml('9999999')
    assert result is None
Exemple #10
0
def test_get_xml_PMC():
    """Check that get_xml accepts an ID carrying the PMC prefix."""
    prefixed_id = 'PMC4322985'
    nxml = pmc_client.get_xml(prefixed_id)
    assert nxml is not None
    # Both the ID and the content must pass the unicode_strs check
    assert unicode_strs((prefixed_id, nxml))
Exemple #11
0
def test_universal_extract_paragraphs_pmc():
    """Check that more than one paragraph is extracted from PMC XML."""
    nxml = pmc_client.get_xml('PMC3262597')
    extracted = universal_extract_paragraphs(nxml)
    assert len(extracted) > 1
Exemple #12
0
def test_extract_text():
    """Extract text from the XML of PMC article 4322985 and check it."""
    nxml = pmc_client.get_xml('4322985')
    extracted = pmc_client.extract_text(nxml)
    assert extracted is not None
    assert unicode_strs(extracted)
Exemple #13
0
def get_full_text(paper_id, idtype, preferred_content_type='text/xml'):
    """Return the best available content of an article and its type.

    The article is identified by its PubMed ID, PubMed Central ID, or DOI.
    Sources are tried in this order: NXML from PMC, XML from Elsevier (only
    if CrossRef reports 'Elsevier BV' as the publisher of the DOI), and
    finally the abstract from PubMed.

    Parameters
    ----------
    paper_id : str
        ID of the article.
    idtype : str
        Type of the ID: 'pmid', 'pmcid', or 'doi'.
    preferred_content_type : Optional[str]
        Preference for full-text format, if available. Can be one of
        'text/xml', 'text/plain', 'application/pdf'. Default: 'text/xml'
        Note that the value is currently only validated: the CrossRef
        full-text link lookup that made use of it is disabled below.

    Returns
    -------
    content : str or None
        The content of the article, or None if nothing could be obtained.
    content_type : str or None
        One of 'pmc_oa_xml', 'elsevier_xml' or 'abstract', or None if
        nothing could be obtained.

    Raises
    ------
    ValueError
        If preferred_content_type is not one of the supported values.
    """
    supported_types = ('text/xml', 'text/plain', 'application/pdf')
    if preferred_content_type not in supported_types:
        raise ValueError("preferred_content_type must be one of 'text/xml', "
                         "'text/plain', or 'application/pdf'.")
    ids = id_lookup(paper_id, idtype)
    pmcid = ids.get('pmcid')
    pmid = ids.get('pmid')
    doi = ids.get('doi')
    # PMC is the first choice for full text
    if pmcid:
        nxml = pmc_client.get_xml(pmcid)
        if nxml:
            return nxml, 'pmc_oa_xml'
    # Without full text from PMC we need a DOI (to look up the publisher in
    # CrossRef) and/or a PMID (to fall back on the abstract). With neither
    # of the two there is nothing left to try.
    if not (doi or pmid):
        return (None, None)

    # Publisher-specific full text retrieval based on the DOI
    if doi:
        publisher = crossref_client.get_publisher(doi)

        # Elsevier content is fetched with the Elsevier client directly,
        # because the Clickthrough API key seems unreliable. The full XML
        # is returned.
        if publisher == 'Elsevier BV':
            logger.info('Elsevier: %s' % pmid)
            #article = elsevier_client.get_article(doi, output='txt')
            try:
                article_xml = elsevier_client.download_article(doi)
            except Exception as e:
                # Best effort: a failed download leads to the fallback
                logger.error("Error downloading Elsevier article: %s" % e)
                article_xml = None
            if article_xml is not None:
                return (article_xml, 'elsevier_xml')

        # FIXME FIXME FIXME
        # Because we don't yet have a way to process non-Elsevier content
        # obtained from CrossRef, which includes both XML of unknown format
        # and PDFs, this section is disabled for now by keeping it in a
        # string literal
        """
        # Check if there are any full text links
        links = crossref_client.get_fulltext_links(doi)
        if links:
            headers = {}
            # Set the Cross Ref Clickthrough API key in the header, if we've
            # got one
            cr_api_key = crossref_client.get_api_key()
            if cr_api_key is not None:
                headers['CR-Clickthrough-Client-Token'] = cr_api_key
            # Utility function to get particular links by content-type
            def lookup_content_type(link_list, content_type):
                content_list = [l.get('URL') for l in link_list
                                if l.get('content-type') == content_type]
                return None if not content_list else content_list[0]
            # First check for what the user asked for
            if lookup_content_type(links, preferred_content_type):
                req = requests.get(lookup_content_type(links,
                                                       preferred_content_type),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request): '
                                  'Perhaps missing CrossRef Clickthrough API '
                                  'key?')
                    return (None, None)
            # Check for XML first
            if lookup_content_type(links, 'text/xml'):
                req = requests.get(lookup_content_type(links, 'text/xml'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request):'
                                  'Perhaps missing CrossRef Clickthrough API '
                                  'key?')
                    return (None, None)
            # Next, plain text
            elif lookup_content_type(links, 'text/plain'):
                req = requests.get(lookup_content_type(links, 'text/plain'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request):'
                                  'Perhaps missing CrossRef Clickthrough API '
                                  'key?')
                    return (None, None)
            elif lookup_content_type(links, 'application/pdf'):
                pass
            # Wiley's links are often of content-type 'unspecified'.
            elif lookup_content_type(links, 'unspecified'):
                req = requests.get(lookup_content_type(links, 'unspecified'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return 'foo', req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request):'
                                  'Perhaps missing CrossRef Clickthrough API '
                                  'key?')
                    return (None, None)
                elif req.status_code == 401:
                    logger.warning('Full text query returned 401 (Unauthorized)')
                    return (None, None)
                elif req.status_code == 403:
                    logger.warning('Full text query returned 403 (Forbidden)')
                    return (None, None)
            else:
                raise Exception("Unknown content type(s): %s" % links)
        elif publisher == 'American Society for Biochemistry & Molecular ' \
                          'Biology (ASBMB)':
            url = crossref_client.get_url(doi)
            return get_asbmb_full_text(url)
        """
        # end FIXME FIXME FIXME

    # No full text could be obtained, so the abstract is the last resort.
    # A missing PMID is only possible here if we had a DOI that turned out
    # to be useless, in which case we give up.
    if not pmid:
        return (None, None)
    abstract = pubmed_client.get_abstract(pmid)
    if abstract is None:
        return (None, None)
    return abstract, 'abstract'
Exemple #14
0
def test_get_xml():
    """Check that get_xml returns content for an ID without PMC prefix."""
    bare_id = '4322985'
    nxml = pmc_client.get_xml(bare_id)
    assert nxml is not None
    # Both the ID and the content must pass the unicode_strs check
    assert unicode_strs((bare_id, nxml))
Exemple #15
0
def test_get_xml_invalid():
    """Check that get_xml returns None for the ID 9999999."""
    result = pmc_client.get_xml("9999999")
    assert result is None
Exemple #16
0
def test_get_xml_PMC():
    """Check that get_xml accepts an ID carrying the PMC prefix."""
    prefixed_id = "PMC4322985"
    nxml = pmc_client.get_xml(prefixed_id)
    assert nxml is not None
    # Both the ID and the content must pass the unicode_strs check
    assert unicode_strs((prefixed_id, nxml))
Exemple #17
0
def test_extract_text():
    """Extract text from the XML of PMC article 4322985 and check it."""
    nxml = pmc_client.get_xml('4322985')
    extracted = pmc_client.extract_text(nxml)
    assert extracted is not None
    assert unicode_strs(extracted)
Exemple #18
0
def get_full_text(paper_id, idtype, preferred_content_type='text/xml'):
    """Return the best available content of an article and its type.

    The article is identified by its PubMed ID, PubMed Central ID, or DOI.
    Sources are tried in this order: NXML from PMC, XML from Elsevier (only
    if CrossRef reports 'Elsevier BV' as the publisher of the DOI), and
    finally the abstract from PubMed.

    Parameters
    ----------
    paper_id : str
        ID of the article.
    idtype : str
        Type of the ID: 'pmid', 'pmcid', or 'doi'.
    preferred_content_type : Optional[str]
        Preference for full-text format, if available. Can be one of
        'text/xml', 'text/plain', 'application/pdf'. Default: 'text/xml'
        Note that the value is currently only validated: the CrossRef
        full-text link lookup that made use of it is disabled below.

    Returns
    -------
    content : str or None
        The content of the article, or None if nothing could be obtained.
    content_type : str or None
        One of 'pmc_oa_xml', 'elsevier_xml' or 'abstract', or None if
        nothing could be obtained.

    Raises
    ------
    ValueError
        If preferred_content_type is not one of the supported values.
    """
    supported_types = ('text/xml', 'text/plain', 'application/pdf')
    if preferred_content_type not in supported_types:
        raise ValueError("preferred_content_type must be one of 'text/xml', "
                         "'text/plain', or 'application/pdf'.")
    ids = id_lookup(paper_id, idtype)
    pmcid = ids.get('pmcid')
    pmid = ids.get('pmid')
    doi = ids.get('doi')
    # PMC is the first choice for full text
    if pmcid:
        nxml = pmc_client.get_xml(pmcid)
        if nxml:
            return nxml, 'pmc_oa_xml'
    # Without full text from PMC we need a DOI (to look up the publisher in
    # CrossRef) and/or a PMID (to fall back on the abstract). With neither
    # of the two there is nothing left to try.
    if not (doi or pmid):
        return (None, None)

    # Publisher-specific full text retrieval based on the DOI
    if doi:
        publisher = crossref_client.get_publisher(doi)

        # Elsevier content is fetched with the Elsevier client directly,
        # because the Clickthrough API key seems unreliable. The full XML
        # is returned.
        if publisher == 'Elsevier BV':
            logger.info('Elsevier: %s' % pmid)
            #article = elsevier_client.get_article(doi, output='txt')
            try:
                article_xml = elsevier_client.download_article(doi)
            except Exception as e:
                # Best effort: a failed download leads to the fallback
                logger.error("Error downloading Elsevier article: %s" % e)
                article_xml = None
            if article_xml is not None:
                return (article_xml, 'elsevier_xml')

        # FIXME FIXME FIXME
        # Because we don't yet have a way to process non-Elsevier content
        # obtained from CrossRef, which includes both XML of unknown format
        # and PDFs, this section is disabled for now by keeping it in a
        # string literal
        """
        # Check if there are any full text links
        links = crossref_client.get_fulltext_links(doi)
        if links:
            headers = {}
            # Set the Cross Ref Clickthrough API key in the header, if we've
            # got one
            if crossref_client.api_key is not None:
                headers['CR-Clickthrough-Client-Token'] = \
                        crossref_client.api_key
            # Utility function to get particular links by content-type
            def lookup_content_type(link_list, content_type):
                content_list = [l.get('URL') for l in link_list
                                if l.get('content-type') == content_type]
                return None if not content_list else content_list[0]
            # First check for what the user asked for
            if lookup_content_type(links, preferred_content_type):
                req = requests.get(lookup_content_type(links,
                                                       preferred_content_type),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request): '
                                  'Perhaps missing CrossRef Clickthrough API '
                                  'key?')
                    return (None, None)
            # Check for XML first
            if lookup_content_type(links, 'text/xml'):
                req = requests.get(lookup_content_type(links, 'text/xml'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request):'
                                  'Perhaps missing CrossRef Clickthrough API '
                                  'key?')
                    return (None, None)
            # Next, plain text
            elif lookup_content_type(links, 'text/plain'):
                req = requests.get(lookup_content_type(links, 'text/plain'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request):'
                                  'Perhaps missing CrossRef Clickthrough API '
                                  'key?')
                    return (None, None)
            elif lookup_content_type(links, 'application/pdf'):
                pass
            # Wiley's links are often of content-type 'unspecified'.
            elif lookup_content_type(links, 'unspecified'):
                req = requests.get(lookup_content_type(links, 'unspecified'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return 'foo', req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request):'
                                  'Perhaps missing CrossRef Clickthrough API '
                                  'key?')
                    return (None, None)
                elif req.status_code == 401:
                    logger.warning('Full text query returned 401 (Unauthorized)')
                    return (None, None)
                elif req.status_code == 403:
                    logger.warning('Full text query returned 403 (Forbidden)')
                    return (None, None)
            else:
                raise Exception("Unknown content type(s): %s" % links)
        elif publisher == 'American Society for Biochemistry & Molecular ' \
                          'Biology (ASBMB)':
            url = crossref_client.get_url(doi)
            return get_asbmb_full_text(url)
        """
        # end FIXME FIXME FIXME

    # No full text could be obtained, so the abstract is the last resort.
    # A missing PMID is only possible here if we had a DOI that turned out
    # to be useless, in which case we give up.
    if not pmid:
        return (None, None)
    abstract = pubmed_client.get_abstract(pmid)
    if abstract is None:
        return (None, None)
    return abstract, 'abstract'
Exemple #19
0
def test_get_xml_invalid():
    """Check that get_xml returns None for the ID 9999999."""
    result = pmc_client.get_xml('9999999')
    assert result is None
Exemple #20
0
def get_nxml(pmc_id):
    """Download the NXML of a PMC article and save it as <pmc_id>.nxml.

    Parameters
    ----------
    pmc_id : str
        The PMC ID of the article, also used as the base of the file name.

    Returns
    -------
    fname : str or None
        The name of the file that was written, or None if the PMC client
        returned no content, in which case no file is written.
    """
    xml_str = pmc_client.get_xml(pmc_id)
    # get_xml returns None when the article is not available; calling
    # encode on that would fail with an AttributeError
    if xml_str is None:
        return None
    fname = pmc_id + '.nxml'
    with open(fname, 'wb') as fh:
        fh.write(xml_str.encode('utf-8'))
    return fname
Exemple #21
0
def test_get_xml_PMC():
    """Check that get_xml accepts an ID carrying the PMC prefix."""
    nxml = pmc_client.get_xml('PMC4322985')
    assert nxml is not None