Example #1
def test_get_rawtext():
    """Make sure we can get content of an article that has content in
    xocs:rawtext"""
    # PMID: 20072652
    doi = '10.1593/neo.91196'
    xml_str = ec.download_article(doi)
    body = ec.extract_text(xml_str)
    assert body
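These tests look like they come from a literature-client test suite; ec is presumably an alias for the Elsevier client module, and download_article will only return content if Elsevier API credentials are configured. A minimal setup sketch, assuming INDRA's indra.literature.elsevier_client module path (an assumption, not shown in the snippets):

# Setup sketch (assumes the INDRA package and configured Elsevier API keys).
import logging

from indra.literature import elsevier_client as ec

# Later snippets also log warnings, so a module-level logger is assumed too.
logger = logging.getLogger(__name__)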
Example #2
def test_universal_extract_paragraphs_elsevier():
    doi = '10.1016/B978-0-12-416673-8.00004-6'
    xml_str = elsevier_client.download_article(doi)
    paragraphs = universal_extract_paragraphs(xml_str)
    if len(paragraphs) <= 1:
        logger.warning('Unable to extract paragraphs from XML string:\n'
                       '%s...' % xml_str[:2000])
    assert len(paragraphs) > 1
Example #3
def test_get_converted_article_body():
    """Make sure we can get fulltext of an article that has
    ja:converted-article as its principal sub-element."""
    # PMID: 11851341
    doi = '10.1006/jmbi.2001.5334'
    xml_str = ec.download_article(doi)
    body = ec.extract_text(xml_str)
    assert body
Example #4
def test_get_converted_article_body():
    """Make sure we can get fulltext of an article that has
    ja:converted-article as its principal sub-element."""
    # PMID: 11851341
    doi = '10.1006/jmbi.2001.5334'
    xml_str = ec.download_article(doi)
    body = ec.extract_text(xml_str)
    if not body:
        logger.warning('Unable to extract text from XML string:\n'
                       '%s...' % xml_str[:2000])
    assert body
def test_get_rawtext():
    """Make sure we can get content of an article that has content in
    xocs:rawtext"""
    # PMID: 20072652
    doi = '10.1593/neo.91196'
    xml_str = ec.download_article(doi)
    body = ec.extract_text(xml_str)
    if not body:
        logger.warning('Unable to extract text from XML string:\n'
                       '%s...' % xml_str[:2000])
    assert body
Example #5
def read_piis(piis):
    """Return texts extracted from articles with given PIIs.

    Parameters
    ----------
    piis : list[str]
        A list of PIIs to extract texts from.

    Returns
    -------
    texts : dict
        A dictionary with PIIs as keys and extracted texts as values.
    """
    texts = {}
    for pii in piis:
        try:
            xml = elsevier_client.download_article(pii, id_type='pii')
            # If we got an empty xml or bad response
            if not xml:
                logger.info('Could not get article content for %s' % pii)
                continue
        # Handle connection and other errors
        except Exception as e:
            logger.info('Could not get article content for %s because of %s'
                        % (pii, e))
            continue
        try:
            txt = elsevier_client.extract_text(xml)
            # If no relevant XML parts could be found
            if not txt:
                logger.info('Could not extract article text for %s' % pii)
                continue
        # Handle connection and other errors
        except Exception as e:
            logger.info('Could not extract article text for %s because of %s'
                        % (pii, e))
            continue
        texts[pii] = txt
    logger.info('Got text back for %d articles.' % len(texts))
    return texts
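A hypothetical usage sketch for read_piis; the PII values below are placeholders, not real identifiers:

# Hypothetical usage; replace the placeholder PIIs with real ones.
example_piis = ['S0000000000000000', 'S1111111111111111']
texts = read_piis(example_piis)
for pii, text in texts.items():
    print('%s: extracted %d characters' % (pii, len(text)))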
Example #6
def on_search(b):
    global titles
    global articles
    global search_term
    piis = elsevier_client.get_piis(search_term.value, start_year=2017)
    print('We found a total of %d papers%s.' %
          (len(piis),
           ('; I\'ll show you the first 10' if len(piis) > 10 else '')))
    articles = [
        elsevier_client.download_article(pii, id_type='pii')
        for pii in piis[:10]
    ]
    titles = [
        ET.fromstring(content).findall(
            '*/dc:title',
            namespaces=elsevier_client.elsevier_ns)[0].text.strip()
        for content in articles
    ]
    for idx, title in enumerate(titles):
        clean_pii = piis[idx].replace('(', '').replace(')', '')
        printmd(
            '* %d: <a href="https://www.sciencedirect.com/science/article/pii/%s" target="_blank">%s</a>'
            % (idx, clean_pii, title))
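on_search is written as a button callback that reads its query from a global search_term widget and relies on a printmd helper for rendering Markdown. A sketch of how it might be wired up in a Jupyter notebook (the widget setup and printmd definition here are assumptions, not shown in the snippet):

# Hypothetical notebook wiring for the callback above.
import xml.etree.ElementTree as ET  # on_search uses ET.fromstring

import ipywidgets
from IPython.display import Markdown, display

def printmd(text):
    # Render a Markdown string in the notebook output area.
    display(Markdown(text))

search_term = ipywidgets.Text(description='Search:', value='kinase')
button = ipywidgets.Button(description='Search')
button.on_click(on_search)  # the callback receives the button as b
display(search_term, button)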
Example #7
def get_full_text(paper_id, idtype, preferred_content_type='text/xml'):
    """Return the content and the content type of an article.

    This function retrieves the content of an article by its PubMed ID,
    PubMed Central ID, or DOI. It prioritizes full text content when available
    and returns an abstract from PubMed as a fallback.

    Parameters
    ----------
    paper_id : str
        ID of the article.
    idtype : 'pmid', 'pmcid', or 'doi'
        Type of the ID.
    preferred_content_type : Optional[str]
        Preference for full-text format, if available. Can be one of
        'text/xml', 'text/plain', 'application/pdf'. Default: 'text/xml'.

    Returns
    -------
    content : str or None
        The content of the article, or None if unavailable.
    content_type : str or None
        The content type of the article, or None if unavailable.
    """
    if preferred_content_type not in \
            ('text/xml', 'text/plain', 'application/pdf'):
        raise ValueError("preferred_content_type must be one of 'text/xml', "
                         "'text/plain', or 'application/pdf'.")
    ids = id_lookup(paper_id, idtype)
    pmcid = ids.get('pmcid')
    pmid = ids.get('pmid')
    doi = ids.get('doi')
    # First try to find paper via PMC
    if pmcid:
        nxml = pmc_client.get_xml(pmcid)
        if nxml:
            return nxml, 'pmc_oa_xml'
    # If we got here, it means we didn't find the full text in PMC, so we'll
    # need either the DOI (for lookup in CrossRef) and/or the PMID (so we
    # can fall back on the abstract). If by some strange turn we have neither,
    # give up now.
    if not doi and not pmid:
        return (None, None)

    # If it does not have PMC NXML then we attempt to obtain the full-text
    # through the CrossRef Click-through API
    if doi:
        # Get publisher
        publisher = crossref_client.get_publisher(doi)

        # First check for whether this is Elsevier--if so, use the Elsevier
        # client directly, because the Clickthrough API key seems unreliable.
        # Return full XML.
        if publisher == 'Elsevier BV':
            logger.info('Elsevier: %s' % pmid)
            #article = elsevier_client.get_article(doi, output='txt')
            try:
                article_xml = elsevier_client.download_article(doi)
            except Exception as e:
                logger.error("Error downloading Elsevier article: %s" % e)
                article_xml = None
            if article_xml is not None:
                return (article_xml, 'elsevier_xml')

        # FIXME FIXME FIXME
        # Because we don't yet have a way to process non-Elsevier content
        # obtained from CrossRef, which includes both XML of unknown format
        # and PDFs, we just comment this section out for now
        """
        # Check if there are any full text links
        links = crossref_client.get_fulltext_links(doi)
        if links:
            headers = {}
            # Set the Cross Ref Clickthrough API key in the header, if we've
            # got one
            if crossref_client.api_key is not None:
                headers['CR-Clickthrough-Client-Token'] = \
                        crossref_client.api_key
            # Utility function to get particular links by content-type
            def lookup_content_type(link_list, content_type):
                content_list = [l.get('URL') for l in link_list
                                if l.get('content-type') == content_type]
                return None if not content_list else content_list[0]
            # First check for what the user asked for
            if lookup_content_type(links, preferred_content_type):
                req = requests.get(lookup_content_type(links,
                                                       preferred_content_type),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request): '
                                   'Perhaps missing CrossRef Clickthrough API '
                                   'key?')
                    return (None, None)
            # Check for XML first
            if lookup_content_type(links, 'text/xml'):
                req = requests.get(lookup_content_type(links, 'text/xml'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request): '
                                   'Perhaps missing CrossRef Clickthrough API '
                                   'key?')
                    return (None, None)
            # Next, plain text
            elif lookup_content_type(links, 'text/plain'):
                req = requests.get(lookup_content_type(links, 'text/plain'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request): '
                                   'Perhaps missing CrossRef Clickthrough API '
                                   'key?')
                    return (None, None)
            elif lookup_content_type(links, 'application/pdf'):
                pass
            # Wiley's links are often of content-type 'unspecified'.
            elif lookup_content_type(links, 'unspecified'):
                req = requests.get(lookup_content_type(links, 'unspecified'),
                                   headers=headers)
                if req.status_code == 200:
                    req_content_type = req.headers['Content-Type']
                    return req.text, req_content_type
                elif req.status_code == 400:
                    logger.warning('Full text query returned 400 (Bad Request): '
                                   'Perhaps missing CrossRef Clickthrough API '
                                   'key?')
                    return (None, None)
                elif req.status_code == 401:
                    logger.warning('Full text query returned 401 (Unauthorized)')
                    return (None, None)
                elif req.status_code == 403:
                    logger.warning('Full text query returned 403 (Forbidden)')
                    return (None, None)
            else:
                raise Exception("Unknown content type(s): %s" % links)
        elif publisher == 'American Society for Biochemistry & Molecular ' \
                          'Biology (ASBMB)':
            url = crossref_client.get_url(doi)
            return get_asbmb_full_text(url)
        """
        # end FIXME FIXME FIXME

        # No full text links and not a publisher we support. We'll have to
        # fall back to the abstract.
        #elif pmid:
        if pmid:
            abstract = pubmed_client.get_abstract(pmid)
            if abstract is None:
                return (None, None)
            else:
                return abstract, 'abstract'
        # We have a useless DOI and no PMID. Give up.
        else:
            return (None, None)
    # We don't have a DOI but we're guaranteed to have a PMID at this point,
    # so we fall back to the abstract:
    else:
        abstract = pubmed_client.get_abstract(pmid)
        if abstract is None:
            return (None, None)
        else:
            return abstract, 'abstract'
    # We'll only get here if we've missed a combination of conditions
    assert False
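A minimal sketch of calling get_full_text, reusing a DOI that appears in the tests above; the returned content_type distinguishes full text from the abstract fallback:

# Usage sketch, reusing a DOI from the tests above.
content, content_type = get_full_text('10.1006/jmbi.2001.5334', 'doi')
if content_type == 'abstract':
    print('Fell back to the PubMed abstract.')
elif content_type is not None:
    print('Got full text as %s (%d characters).' % (content_type, len(content)))
else:
    print('No content could be retrieved.')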
Example #8
def test_article():
    # PMID: 11302724
    doi = '10.1006/bbrc.2001.4693'
    xml_str = ec.download_article(doi)
    body = ec.extract_text(xml_str)
    assert body is None
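This test documents a failure mode rather than a success: extract_text returns None when the downloaded XML carries no recognized full-text element, so callers should treat None as "no full text", e.g.:

# Defensive pattern suggested by the test above.
xml_str = ec.download_article('10.1006/bbrc.2001.4693')
body = ec.extract_text(xml_str)
if body is None:
    print('No full text available; consider falling back to the abstract.')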
Example #9
                for pii in piis:
                    fh.write('%s\n' % pii)
        else:
            print('Loading PIIs for %s' % search_term)
            with open(fname, 'r') as fh:
                piis = [l.strip() for l in fh.readlines()]
        all_piis += piis
    all_piis = list(set(all_piis))
    print('Got %d PIIs' % len(all_piis))

    # Download all the XML content
    for pii in all_piis:
        fname = 'xml/%s.xml' % pii.replace('/', '_')
        if not os.path.exists(fname):
            print('Downloading %s' % pii)
            res = elsevier_client.download_article(pii, 'pii')
            with open(fname, 'wb') as fh:
                fh.write(res.encode('utf-8'))
        else:
            print('Cached %s' % pii)

    # Strip out the text from all the XML content
    for pii in all_piis:
        fname = 'xml/%s.xml' % pii.replace('/', '_')
        with open(fname, 'rb') as fh:
            xml_content = fh.read().decode('utf-8')
        txt = elsevier_client.extract_text(xml_content)
        if not txt:
            continue
        txt_fname = 'txt/%s.txt' % pii.replace('/', '_')
        with open(txt_fname, 'wb') as fh:
            fh.write(txt.encode('utf-8'))
Example #10
def test_universal_extract_paragraphs_elsevier():
    doi = '10.1016/B978-0-12-416673-8.00004-6'
    xml_str = elsevier_client.download_article(doi)
    paragraphs = universal_extract_paragraphs(xml_str)
    assert len(paragraphs) > 1