Example #1
0
def process_paper(model_name, pmid):
    json_path = os.path.join(model_name, 'jsons', 'PMID%s.json' % pmid)

    if pmid.startswith('api') or pmid.startswith('PMID'):
        logger.warning('Invalid PMID: %s' % pmid)
    # If the paper has been read, use the json output file
    if os.path.exists(json_path):
        rp = reach.process_json_file(json_path, citation=pmid)
        txt_format = 'existing_json'
    # If the paper has not been read, download the text and read
    else:
        try:
            txt, txt_format = get_full_text(pmid, 'pmid')
        except:
            return None, None

        if txt_format == 'pmc_oa_xml':
            rp = reach.process_nxml_str(txt, citation=pmid, offline=True)
            if os.path.exists('reach_output.json'):
                shutil.move('reach_output.json', json_path)
        elif txt_format == 'elsevier_xml':
            # Extract the raw text from the Elsevier XML
            txt = elsevier_client.extract_text(txt)
            rp = reach.process_text(txt, citation=pmid, offline=True)
            if os.path.exists('reach_output.json'):
                shutil.move('reach_output.json', json_path)
        elif txt_format == 'abstract':
            rp = reach.process_text(txt, citation=pmid, offline=True)
            if os.path.exists('reach_output.json'):
                shutil.move('reach_output.json', json_path)
        else:
            rp = None
    if rp is not None:
        check_pmids(rp.statements)
    return rp, txt_format
Example #2
0
def run_reading(text_contents, cached=True):
    organism_preference = None
    stmts = {}
    for trid, text_content in text_contents.items():
        print('Reading %s' % trid)
        output_fname = os.path.join(NEW_REACH_PATH, '%s.json' % trid)
        if cached and os.path.exists(output_fname):
            rp = reach.process_json_file(output_fname)
            if rp is None:
                continue
        else:
            if text_content.startswith('<!DOCTYPE'):
                rp = reach.process_nxml_str(
                    text_content,
                    url=reach.local_nxml_url,
                    output_fname=output_fname,
                    organism_priority=organism_preference)
            else:
                rp = reach.process_text(text_content,
                                        url=reach.local_text_url,
                                        output_fname=output_fname,
                                        organism_priority=organism_preference)
        if rp is not None:
            stmts[trid] = rp.statements
    return stmts
Example #3
0
def process_paper(model_name, pmid):
    """Process a paper with the given pubmed identifier

    Parameters
    ----------
    model_name : str
        The directory for the INDRA machine
    pmid : str
        The PMID to process.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    txt_format : str
        A string representing the format of the text
    """
    json_directory = os.path.join(model_name, 'jsons')
    json_path = os.path.join(json_directory, 'PMID%s.json' % pmid)

    if pmid.startswith('api') or pmid.startswith('PMID'):
        logger.warning('Invalid PMID: %s' % pmid)
    # If the paper has been read, use the json output file
    if os.path.exists(json_path):
        rp = reach.process_json_file(json_path, citation=pmid)
        txt_format = 'existing_json'
    # If the paper has not been read, download the text and read
    else:
        try:
            txt, txt_format = get_full_text(pmid, 'pmid')
        except Exception:
            return None, None

        if txt_format == 'pmc_oa_xml':
            rp = reach.process_nxml_str(txt,
                                        citation=pmid,
                                        offline=True,
                                        output_fname=json_path)
        elif txt_format == 'elsevier_xml':
            # Extract the raw text from the Elsevier XML
            txt = elsevier_client.extract_text(txt)
            rp = reach.process_text(txt,
                                    citation=pmid,
                                    offline=True,
                                    output_fname=json_path)
        elif txt_format == 'abstract':
            rp = reach.process_text(txt,
                                    citation=pmid,
                                    offline=True,
                                    output_fname=json_path)
        else:
            rp = None
    if rp is not None:
        check_pmids(rp.statements)
    return rp, txt_format
Example #4
0
def reach_processor(article_content, fname, offline=True):
    if offline:
        url = reach.local_nxml_url
    else:
        url = reach.reach_nxml_url

    return reach.process_nxml_str(article_content,
                                  output_fname=fname,
                                  offline=False,
                                  url=url)
Example #5
0
def process_paper(model_name, pmid):
    """Process a paper with the given pubmed identifier

    Parameters
    ----------
    model_name : str
        The directory for the INDRA machine
    pmid : str
        The PMID to process.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    txt_format : str
        A string representing the format of the text
    """
    json_directory = os.path.join(model_name, 'jsons')
    json_path = os.path.join(json_directory, 'PMID%s.json' % pmid)

    if pmid.startswith('api') or pmid.startswith('PMID'):
        logger.warning('Invalid PMID: %s' % pmid)
    # If the paper has been read, use the json output file
    if os.path.exists(json_path):
        rp = reach.process_json_file(json_path, citation=pmid)
        txt_format = 'existing_json'
    # If the paper has not been read, download the text and read
    else:
        try:
            txt, txt_format = get_full_text(pmid, 'pmid')
        except Exception:
            return None, None

        if txt_format == 'pmc_oa_xml':
            rp = reach.process_nxml_str(txt, citation=pmid, offline=True,
                                        output_fname=json_path)
        elif txt_format == 'elsevier_xml':
            # Extract the raw text from the Elsevier XML
            txt = elsevier_client.extract_text(txt)
            rp = reach.process_text(txt, citation=pmid, offline=True,
                                    output_fname=json_path)
        elif txt_format == 'abstract':
            rp = reach.process_text(txt, citation=pmid, offline=True,
                                    output_fname=json_path)
        else:
            rp = None
    if rp is not None:
        check_pmids(rp.statements)
    return rp, txt_format