Beispiel #1
0
def process_paper(model_name, pmid):
    """Process a paper with the given PubMed identifier using REACH.

    Parameters
    ----------
    model_name : str
        The directory for the INDRA machine; REACH JSON output is cached
        under ``<model_name>/jsons``.
    pmid : str
        The PMID to process.

    Returns
    -------
    rp : ReachProcessor or None
        A ReachProcessor containing the extracted INDRA Statements in
        rp.statements, or None if the text could not be obtained or read.
    txt_format : str or None
        A string representing the format of the text ('existing_json',
        'pmc_oa_xml', 'elsevier_xml', 'abstract'), or None on failure.
    """
    json_path = os.path.join(model_name, 'jsons', 'PMID%s.json' % pmid)

    if pmid.startswith('api') or pmid.startswith('PMID'):
        logger.warning('Invalid PMID: %s' % pmid)
    # If the paper has been read, use the json output file
    if os.path.exists(json_path):
        rp = reach.process_json_file(json_path, citation=pmid)
        txt_format = 'existing_json'
    # If the paper has not been read, download the text and read
    else:
        try:
            txt, txt_format = get_full_text(pmid, 'pmid')
        except Exception:
            # Was a bare except: catch only real errors from the lookup
            # (network failures, bad IDs) and signal with (None, None),
            # without swallowing KeyboardInterrupt/SystemExit.
            return None, None

        if txt_format == 'pmc_oa_xml':
            rp = reach.process_nxml_str(txt, citation=pmid, offline=True)
        elif txt_format == 'elsevier_xml':
            # Extract the raw text from the Elsevier XML
            txt = elsevier_client.extract_text(txt)
            rp = reach.process_text(txt, citation=pmid, offline=True)
        elif txt_format == 'abstract':
            rp = reach.process_text(txt, citation=pmid, offline=True)
        else:
            rp = None
        # Offline REACH writes its result to reach_output.json in the
        # working directory; cache it under json_path so the paper is not
        # re-read next time. (Hoisted from the three branches above.)
        if txt_format in ('pmc_oa_xml', 'elsevier_xml', 'abstract') \
                and os.path.exists('reach_output.json'):
            shutil.move('reach_output.json', json_path)
    if rp is not None:
        check_pmids(rp.statements)
    return rp, txt_format
Beispiel #2
0
def test_get_converted_article_body():
    """Make sure we can get fulltext of an article that has
    ja:converted-article as its principal sub-element."""
    # PMID: 11851341
    doi = '10.1006/jmbi.2001.5334'
    # Download and extract in one pass; the result must be non-empty.
    body = ec.extract_text(ec.download_article(doi))
    assert body
Beispiel #3
0
def test_get_rawtext():
    """Make sure we can get content of an article that has content in
    xocs:rawtext"""
    # PMID: 20072652
    doi = '10.1593/neo.91196'
    article_xml = ec.download_article(doi)
    extracted = ec.extract_text(article_xml)
    assert extracted
def test_get_rawtext():
    """Make sure we can get content of an article that has content in
    xocs:rawtext"""
    # PMID: 20072652
    # Extraction of the downloaded XML must yield truthy content.
    assert ec.extract_text(ec.download_article('10.1593/neo.91196'))
def test_get_converted_article_body():
    """Make sure we can get fulltext of an article that has
    ja:converted-article as its principal sub-element."""
    # PMID: 11851341
    xml = ec.download_article('10.1006/jmbi.2001.5334')
    assert ec.extract_text(xml)
Beispiel #6
0
def get_upload_content(pmid, force_fulltext_lookup=False):
    """Get full text and/or abstract for paper and upload to S3.

    Parameters
    ----------
    pmid : str
        The PMID of the paper; a leading 'PMID' prefix is stripped before
        any lookups are made.
    force_fulltext_lookup : bool
        If True, do a fresh full-text lookup through the literature client
        even when an abstract is already cached on S3.

    Returns
    -------
    tuple
        A (content, content_type) pair, where content_type is one of
        'pmc_oa_xml', 'elsevier_xml', 'abstract', 'existing'-style S3
        types, or (None, None) when nothing could be retrieved.
    """
    # Make sure that the PMID doesn't start with PMID so that it doesn't
    # screw up the literature clients
    if pmid.startswith('PMID'):
        pmid = pmid[4:]
    # First, check S3:
    (ft_content_s3, ft_content_type_s3) = get_full_text(pmid)
    # The abstract is on S3 but there is no full text; if we're not forcing
    # fulltext lookup, then we're done
    if ft_content_type_s3 == 'abstract' and not force_fulltext_lookup:
        return (ft_content_s3, ft_content_type_s3)
    # If there's nothing (even an abstract on S3), or if there's an abstract
    # and we're forcing fulltext lookup, do the lookup.
    # Also re-fetch when the cached Elsevier XML has no extractable text
    # (extract_text returns a falsy value for it).
    elif ft_content_type_s3 is None or \
            (ft_content_type_s3 == 'abstract' and force_fulltext_lookup) or \
            (ft_content_type_s3 == 'elsevier_xml' and
                    not elsevier_client.extract_text(ft_content_s3)):
        # FIXME FIXME FIXME
        # NOTE(review): the intent behind these FIXME markers is not
        # visible in this excerpt -- confirm against project history.
        if ft_content_type_s3 == 'elsevier_xml':
            logger.info('elsevier_xml for %s missing full text element, '
                        'getting again.' % pmid)
        # FIXME FIXME FIXME
        # Try to retrieve from literature client
        logger.info("PMID%s: getting content using literature client" % pmid)
        (ft_content, ft_content_type) = lit.get_full_text(pmid, 'pmid')
        # The literature client is expected to return only these types.
        assert ft_content_type in ('pmc_oa_xml', 'elsevier_xml', 'abstract',
                                   None)
        # If we tried to get the full text and didn't even get the abstract,
        # then there was probably a problem with the web service or the DOI
        if ft_content_type is None:
            return (None, None)
        # If we got the abstract, and we already had the abstract on S3, then
        # do nothing
        elif ft_content_type == 'abstract' and ft_content_type_s3 == 'abstract':
            logger.info("PMID%s: found abstract but already had it on " \
                        "S3; skipping" % pmid)
            return (ft_content, ft_content_type)
        # If we got the abstract, and we had nothing on S3, then upload
        elif ft_content_type == 'abstract' and ft_content_type_s3 is None:
            logger.info("PMID%s: found abstract, uploading to S3" % pmid)
            put_abstract(pmid, ft_content)
            return (ft_content, ft_content_type)
        # We got a full text (or something other than None or abstract...)
        else:
            logger.info("PMID%s: uploading %s" % (pmid, ft_content_type))
            put_full_text(pmid, ft_content, full_text_type=ft_content_type)
            return (ft_content, ft_content_type)
    # Some form of full text is already on S3
    else:
        # TODO
        # In future, could check for abstract even if full text is found, and
        # upload it just to have it
        return (ft_content_s3, ft_content_type_s3)
    # We should always return before we get here
    assert False
Beispiel #7
0
def process_paper(model_name, pmid):
    """Process a paper with the given pubmed identifier

    Parameters
    ----------
    model_name : str
        The directory for the INDRA machine
    pmid : str
        The PMID to process.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    txt_format : str
        A string representing the format of the text
    """
    json_path = os.path.join(os.path.join(model_name, 'jsons'),
                             'PMID%s.json' % pmid)

    if pmid.startswith('api') or pmid.startswith('PMID'):
        logger.warning('Invalid PMID: %s' % pmid)
    if os.path.exists(json_path):
        # The paper was read before: reuse the cached REACH JSON output.
        rp = reach.process_json_file(json_path, citation=pmid)
        txt_format = 'existing_json'
    else:
        # Not read yet: fetch the text and run REACH on it.
        try:
            txt, txt_format = get_full_text(pmid, 'pmid')
        except Exception:
            return None, None

        if txt_format == 'pmc_oa_xml':
            rp = reach.process_nxml_str(txt, citation=pmid, offline=True,
                                        output_fname=json_path)
        elif txt_format in ('elsevier_xml', 'abstract'):
            if txt_format == 'elsevier_xml':
                # Pull the raw text out of the Elsevier XML first
                txt = elsevier_client.extract_text(txt)
            rp = reach.process_text(txt, citation=pmid, offline=True,
                                    output_fname=json_path)
        else:
            rp = None
    if rp is not None:
        check_pmids(rp.statements)
    return rp, txt_format
def test_get_rawtext():
    """Make sure we can get content of an article that has content in
    xocs:rawtext"""
    # PMID: 20072652
    doi = '10.1593/neo.91196'
    downloaded = ec.download_article(doi)
    text = ec.extract_text(downloaded)
    if not text:
        logger.warning('Unable to extract text from XML string:\n'
                       '%s...' % downloaded[:2000])
    assert text
def test_get_rawtext():
    """Make sure we can get content of an article that has content in
    xocs:rawtext"""
    # PMID: 20072652
    article_xml = ec.download_article('10.1593/neo.91196')
    extracted = ec.extract_text(article_xml)
    if not extracted:
        # Log a prefix of the XML for debugging before failing.
        logger.warning('Unable to extract text from XML string:\n'
                       '%s...' % article_xml[:2000])
    assert extracted
Beispiel #10
0
def test_get_converted_article_body():
    """Make sure we can get fulltext of an article that has
    ja:converted-article as its principal sub-element."""
    # PMID: 11851341
    doi = '10.1006/jmbi.2001.5334'
    response_xml = ec.download_article(doi)
    fulltext = ec.extract_text(response_xml)
    if not fulltext:
        logger.warning('Unable to extract text from XML string:\n'
                       '%s...' % response_xml[:2000])
    assert fulltext
def test_get_converted_article_body():
    """Make sure we can get fulltext of an article that has
    ja:converted-article as its principal sub-element."""
    # PMID: 11851341
    raw_xml = ec.download_article('10.1006/jmbi.2001.5334')
    content = ec.extract_text(raw_xml)
    if not content:
        # Log a prefix of the XML for debugging before failing.
        logger.warning('Unable to extract text from XML string:\n'
                       '%s...' % raw_xml[:2000])
    assert content
Beispiel #12
0
 def get_text():
     """Fetch content for one PMID, write it to a local input file, and
     return a per-PMID result record.

     NOTE(review): this is a closure -- ``pmid``, ``input_dir``,
     ``force_fulltext``, ``s3_client`` and ``elsevier_client`` come from
     an enclosing scope not visible in this excerpt; confirm in the full
     file.

     Returns
     -------
     dict
         ``{pmid: {'content_source': ..., 'content_path': ...}}`` where
         ``content_path`` is None when nothing was written to disk.
     """
     # Add timeout here for PubMed
     time.sleep(0.5)
     # full_pmid = s3_client.check_pmid(pmid)
     # Look for the full text
     content, content_type = s3_client.get_upload_content(
         pmid, force_fulltext_lookup=force_fulltext)
     content_path = None
     # Write the contents to a file
     if content_type is None or content is None:
         # No content found on S3, skipping
         content_source = 'content_not_found'
     elif content_type == 'pmc_oa_xml':
         content_source = 'pmc_oa_xml'
         content_path = os.path.join(input_dir, '%s.nxml' % pmid)
     elif content_type == 'pmc_auth_xml':
         content_source = 'pmc_auth_xml'
         content_path = os.path.join(input_dir, '%s.nxml' % pmid)
     elif content_type == 'pmc_oa_txt':
         content_source = 'pmc_oa_txt'
         content_path = os.path.join(input_dir, '%s.txt' % pmid)
     elif content_type == 'elsevier_xml':
         # Elsevier content is XML; pull out the plain text first
         content = elsevier_client.extract_text(content)
         # Couldn't get text from Elsevier XML
         if content is None:
             content_source = 'elsevier_extract_text_failure'
         else:
             content_source = 'elsevier_xml'
             content_path = os.path.join(input_dir, '%s.txt' % pmid)
     elif content_type == 'txt':
         content_source = 'txt'
         content_path = os.path.join(input_dir, '%s.txt' % pmid)
     elif content_type == 'abstract':
         content_source = 'abstract'
         content_path = os.path.join(input_dir, '%s.txt' % pmid)
     # Unhandled content type, skipping
     else:
         content_source = 'unhandled_content_type_%s' % content_type
     # If we got content, write the content to a file with the appropriate
     # extension
     if content_path:
         with open(content_path, 'wb') as f:
             # The XML string is Unicode
             enc = content.encode('utf-8')
             f.write(enc)
     # Return dict of results for this PMID
     result = {
         pmid: {
             'content_source': content_source,
             'content_path': content_path
         }
     }
     return result
Beispiel #13
0
def process_paper(model_name, pmid):
    """Process a paper with the given pubmed identifier

    Parameters
    ----------
    model_name : str
        The directory for the INDRA machine
    pmid : str
        The PMID to process.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    txt_format : str
        A string representing the format of the text
    """
    json_directory = os.path.join(model_name, 'jsons')
    json_path = os.path.join(json_directory, 'PMID%s.json' % pmid)

    if pmid.startswith('api') or pmid.startswith('PMID'):
        logger.warning('Invalid PMID: %s' % pmid)
    # A cached REACH JSON means the paper was read before: reuse it.
    if os.path.exists(json_path):
        rp = reach.process_json_file(json_path, citation=pmid)
        if rp is not None:
            check_pmids(rp.statements)
        return rp, 'existing_json'
    # Otherwise fetch the text, then dispatch on its format.
    try:
        txt, txt_format = get_full_text(pmid, 'pmid')
    except Exception:
        return None, None

    read_kwargs = dict(citation=pmid, offline=True, output_fname=json_path)
    if txt_format == 'pmc_oa_xml':
        rp = reach.process_nxml_str(txt, **read_kwargs)
    elif txt_format == 'elsevier_xml':
        # Extract the raw text from the Elsevier XML
        rp = reach.process_text(elsevier_client.extract_text(txt),
                                **read_kwargs)
    elif txt_format == 'abstract':
        rp = reach.process_text(txt, **read_kwargs)
    else:
        rp = None
    if rp is not None:
        check_pmids(rp.statements)
    return rp, txt_format
Beispiel #14
0
def on_read(b):
    """Widget callback: read the selected paper with Eidos and print the
    extracted INDRA Statements.

    Parameters
    ----------
    b : object
        The (unused) button/event object passed by the widget framework.

    Side effects: rebinds the module-level ``statements`` (and reads
    ``articles`` and ``paper_id``) and prints a summary of each Statement.
    """
    global articles
    global statements
    raw_txt = elsevier_client.extract_text(articles[int(paper_id.value)])
    # extract_text can return None, and inaccessible papers come back as
    # an error page; either way there is nothing to read, so stop here.
    # (The original fell through and called process_text anyway.)
    if not raw_txt or 'Internal Server Error' in raw_txt:
        print('Sorry, that paper was not accessible for reading.')
        statements = []
        return
    ep = eidos.process_text(raw_txt, webservice='http://localhost:5000')
    statements = ep.statements
    print('We extracted %d statements:' % len(statements))
    for stmt in statements:
        # Last path segment of the UN ontology grounding for subj/obj
        sg = stmt.subj.db_refs['UN'][0][0].split('/')[-1]
        og = stmt.obj.db_refs['UN'][0][0].split('/')[-1]
        printmd('* **%s**(%s) %s **%s**(%s)' %
                (sg, stmt.subj.name, '->'
                 if stmt.overall_polarity() == 1 else '-|', og, stmt.obj.name))
Beispiel #15
0
def read_piis(piis):
    """Return texts extracted from articles with given PIIs.

    Parameters
    ----------
    piis : list[str]
        A list of PIIs to extract texts from.

    Returns
    -------
    texts : dict
        A dictionary representing PIIs as keys and extracted texts as
        values. PIIs whose content could not be downloaded or whose text
        could not be extracted are omitted.
    """
    texts = {}
    for pii in piis:
        try:
            xml = elsevier_client.download_article(pii, id_type='pii')
            # If we got an empty xml or bad response
            if not xml:
                logger.info('Could not get article content for %s' % pii)
                continue
        # Handle Connection and other errors
        except Exception as e:
            logger.info('Could not get article content for %s because of %s'
                        % (pii, e))
            continue
        try:
            txt = elsevier_client.extract_text(xml)
            # If we could find relevant xml parts
            if not txt:
                logger.info('Could not extract article text for %s' % pii)
                continue
        # Handle Connection and other errors
        except Exception as e:
            logger.info('Could not extract article text for %s because of %s'
                        % (pii, e))
            # Bug fix: without this continue a failed extraction fell
            # through and stored an undefined (or stale) txt for this PII.
            continue
        texts[pii] = txt
    logger.info('Got text back for %d articles.' % len(texts))
    return texts
Beispiel #16
0
    print('Got %d PIIs' % len(all_piis))

    # Download all the XML content, caching each article on disk so a
    # re-run does not repeat completed downloads.
    for pii in all_piis:
        fname = 'xml/%s.xml' % pii.replace('/', '_')
        if not os.path.exists(fname):
            # Typo fixed: message previously read 'Donwloading'.
            print('Downloading %s' % pii)
            res = elsevier_client.download_article(pii, 'pii')
            with open(fname, 'wb') as fh:
                fh.write(res.encode('utf-8'))
        else:
            print('Cached %s' % pii)

    # Strip out the text from all the XML content
    for pii in all_piis:
        fname = 'xml/%s.xml' % pii.replace('/', '_')
        with open(fname, 'rb') as fh:
            xml_content = fh.read().decode('utf-8')
        txt = elsevier_client.extract_text(xml_content)
        # Skip articles whose XML yielded no extractable text
        if not txt:
            continue
        txt_fname = 'txt/%s.txt' % pii.replace('/', '_')
        with open(txt_fname, 'wb') as fh:
            fh.write(txt.encode('utf-8'))

    # Now run Eidos on the documents
    in_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'txt')
    out_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              'jsonld')
    eidos_cli.extract_from_directory(in_folder, out_folder)
Beispiel #17
0
     # Nothing on S3 for this PMID: log it and move to the next one.
     logger.info('No content found on S3 for %s, skipping' % pmid)
     continue
 # NOTE(review): this elif chain continues an if-statement that begins
 # above this excerpt; the num_* counters, text_sources, full_pmid and
 # input_dir are presumably defined in the enclosing loop -- confirm in
 # the full file.
 elif content_type == 'pmc_oa_xml':
     num_pmc_oa_xml += 1
     text_sources[full_pmid] = 'pmc_oa_xml'
     content_path = os.path.join(input_dir, '%s.nxml' % pmid)
 # Author-manuscript XML also goes to a .nxml input file
 elif content_type == 'pmc_auth_xml':
     num_pmc_auth_xml += 1
     text_sources[full_pmid] = 'pmc_auth_xml'
     content_path = os.path.join(input_dir, '%s.nxml' % pmid)
 elif content_type == 'pmc_oa_txt':
     num_txt += 1
     text_sources[full_pmid] = 'pmc_oa_txt'
     content_path = os.path.join(input_dir, '%s.txt' % pmid)
 elif content_type == 'elsevier_xml':
     # Elsevier content is XML; pull out the plain text first
     content = elsevier_client.extract_text(content)
     if content is None:
         logger.info("%s: Couldn't get text from Elsevier XML" % pmid)
         num_elsevier_xml_fail += 1
         continue
     num_elsevier_xml += 1
     text_sources[full_pmid] = 'elsevier_xml'
     content_path = os.path.join(input_dir, '%s.txt' % pmid)
 elif content_type == 'txt':
     num_txt += 1
     text_sources[full_pmid] = 'txt'
     content_path = os.path.join(input_dir, '%s.txt' % pmid)
 # Abstract-only content is still written out as a .txt input file
 elif content_type == 'abstract':
     num_abstract += 1
     text_sources[full_pmid] = 'abstract'
     content_path = os.path.join(input_dir, '%s.txt' % pmid)
Beispiel #18
0
def test_article():
    """This article's text is expected to be unavailable (extract_text
    returns None)."""
    # PMID: 11302724
    doi = '10.1006/bbrc.2001.4693'
    assert ec.extract_text(ec.download_article(doi)) is None
def test_article():
    # PMID: 11302724
    article_xml = ec.download_article('10.1006/bbrc.2001.4693')
    extracted = ec.extract_text(article_xml)
    # No text should be extractable from this article's XML.
    assert extracted is None
Beispiel #20
0
     # No S3 content for this PMID: log and continue with the next one.
     logger.info('No content found on S3 for %s, skipping' % pmid)
     continue
 # NOTE(review): the opening if of this elif chain, and the definitions
 # of the num_* counters, text_sources, full_pmid and input_dir, lie
 # outside this excerpt -- presumably an enclosing per-PMID loop;
 # confirm in the full file.
 elif content_type == 'pmc_oa_xml':
     num_pmc_oa_xml += 1
     text_sources[full_pmid] = 'pmc_oa_xml'
     content_path = os.path.join(input_dir, '%s.nxml' % pmid)
 elif content_type == 'pmc_auth_xml':
     num_pmc_auth_xml += 1
     text_sources[full_pmid] = 'pmc_auth_xml'
     content_path = os.path.join(input_dir, '%s.nxml' % pmid)
 elif content_type == 'pmc_oa_txt':
     num_txt += 1
     text_sources[full_pmid] = 'pmc_oa_txt'
     content_path = os.path.join(input_dir, '%s.txt' % pmid)
 elif content_type == 'elsevier_xml':
     # Elsevier gives XML; extract the plain text before using it
     content = elsevier_client.extract_text(content)
     if content is None:
         logger.info("%s: Couldn't get text from Elsevier XML" % pmid)
         num_elsevier_xml_fail += 1
         continue
     num_elsevier_xml += 1
     text_sources[full_pmid] = 'elsevier_xml'
     content_path = os.path.join(input_dir, '%s.txt' % pmid)
 elif content_type == 'txt':
     num_txt += 1
     text_sources[full_pmid] = 'txt'
     content_path = os.path.join(input_dir, '%s.txt' % pmid)
 # Abstracts are written out as .txt input files as well
 elif content_type == 'abstract':
     num_abstract += 1
     text_sources[full_pmid] = 'abstract'
     content_path = os.path.join(input_dir, '%s.txt' % pmid)