def _store_reach_output(json_path):
    """Move ``reach_output.json`` from the working directory to ``json_path``.

    Does nothing when ``reach_output.json`` does not exist. The file is
    presumably written by the preceding REACH call -- confirm against the
    REACH API in use.
    """
    if os.path.exists('reach_output.json'):
        shutil.move('reach_output.json', json_path)


def process_paper(model_name, pmid):
    """Process a paper with the given PubMed identifier.

    If ``<model_name>/jsons/PMID<pmid>.json`` already exists it is processed
    directly; otherwise the text is fetched with ``get_full_text`` and read
    with REACH, and any resulting ``reach_output.json`` is moved to that path.

    Parameters
    ----------
    model_name : str
        Directory under which the ``jsons`` folder with reading output lives.
    pmid : str
        The PMID to process. A value starting with ``'api'`` or ``'PMID'``
        triggers a warning, but processing continues.

    Returns
    -------
    rp : object or None
        The processor returned by REACH, or None if the text could not be
        fetched or its format is not handled.
    txt_format : str or None
        ``'existing_json'`` when cached output was used, otherwise the format
        reported by ``get_full_text``; None if fetching the text failed.
    """
    json_path = os.path.join(model_name, 'jsons', 'PMID%s.json' % pmid)

    if pmid.startswith(('api', 'PMID')):
        logger.warning('Invalid PMID: %s', pmid)

    if os.path.exists(json_path):
        # The paper has been read before: reuse the stored JSON output.
        rp = reach.process_json_file(json_path, citation=pmid)
        txt_format = 'existing_json'
    else:
        # The paper has not been read: download the text and read it.
        try:
            txt, txt_format = get_full_text(pmid, 'pmid')
        except Exception:
            # Best-effort: any retrieval error means "nothing to process".
            # Deliberately not a bare ``except`` so that KeyboardInterrupt
            # and SystemExit still propagate.
            return None, None

        if txt_format == 'pmc_oa_xml':
            rp = reach.process_nxml_str(txt, citation=pmid, offline=True)
            _store_reach_output(json_path)
        elif txt_format == 'elsevier_xml':
            # Extract the raw text from the Elsevier XML
            txt = elsevier_client.extract_text(txt)
            rp = reach.process_text(txt, citation=pmid, offline=True)
            _store_reach_output(json_path)
        elif txt_format == 'abstract':
            rp = reach.process_text(txt, citation=pmid, offline=True)
            _store_reach_output(json_path)
        else:
            rp = None

    if rp is not None:
        check_pmids(rp.statements)
    return rp, txt_format
def run_reading(text_contents, cached=True):
    """Read a collection of text contents with REACH and collect statements.

    Parameters
    ----------
    text_contents : dict
        Maps an identifier (``trid``) to the text content string to read.
        Content starting with ``'<!DOCTYPE'`` is passed to
        ``reach.process_nxml_str``; anything else to ``reach.process_text``.
    cached : bool
        If True and ``<NEW_REACH_PATH>/<trid>.json`` already exists, that
        file is processed instead of reading the content again.

    Returns
    -------
    dict
        Maps each ``trid`` whose processing returned a processor (not None)
        to that processor's ``statements``.
    """
    preferred_organism = None
    statements_by_id = {}
    for trid, text_content in text_contents.items():
        print('Reading %s' % trid)
        output_fname = os.path.join(NEW_REACH_PATH, '%s.json' % trid)

        if cached and os.path.exists(output_fname):
            # Reuse the output of a previous reading run.
            rp = reach.process_json_file(output_fname)
        elif text_content.startswith('<!DOCTYPE'):
            rp = reach.process_nxml_str(
                text_content,
                url=reach.local_nxml_url,
                output_fname=output_fname,
                organism_priority=preferred_organism)
        else:
            rp = reach.process_text(
                text_content,
                url=reach.local_text_url,
                output_fname=output_fname,
                organism_priority=preferred_organism)

        # Entries that could not be processed are left out of the result.
        if rp is None:
            continue
        statements_by_id[trid] = rp.statements
    return statements_by_id
def process_paper(model_name, pmid):
    """Read a single paper, identified by its PMID, and return the result.

    Cached output at ``<model_name>/jsons/PMID<pmid>.json`` is used when it
    exists; otherwise the text is fetched and read with REACH, with the
    output file name set to that same path.

    Parameters
    ----------
    model_name : str
        The directory for the INDRA machine.
    pmid : str
        The PMID to process.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements in
        rp.statements. None if the text could not be fetched or its format
        is not handled.
    txt_format : str
        A string representing the format of the text
        (``'existing_json'`` when cached output was used). None if fetching
        the text failed.
    """
    json_path = os.path.join(model_name, 'jsons', 'PMID%s.json' % pmid)

    if pmid.startswith(('api', 'PMID')):
        logger.warning('Invalid PMID: %s' % pmid)

    if os.path.exists(json_path):
        # Already read: load the stored JSON output.
        txt_format = 'existing_json'
        rp = reach.process_json_file(json_path, citation=pmid)
    else:
        # Not read yet: download the text, then read it.
        try:
            txt, txt_format = get_full_text(pmid, 'pmid')
        except Exception:
            return None, None

        read_kwargs = dict(citation=pmid, offline=True,
                           output_fname=json_path)
        if txt_format == 'pmc_oa_xml':
            rp = reach.process_nxml_str(txt, **read_kwargs)
        elif txt_format in ('elsevier_xml', 'abstract'):
            if txt_format == 'elsevier_xml':
                # Extract the raw text from the Elsevier XML
                txt = elsevier_client.extract_text(txt)
            rp = reach.process_text(txt, **read_kwargs)
        else:
            rp = None

    if rp is None:
        return None, txt_format
    check_pmids(rp.statements)
    return rp, txt_format
def reach_processor(article_content, fname, offline=True):
    """Process an NXML string with REACH, writing output to ``fname``.

    Parameters
    ----------
    article_content : str
        The NXML content handed to ``reach.process_nxml_str``.
    fname : str
        Passed to REACH as ``output_fname``.
    offline : bool
        Selects the URL: ``reach.local_nxml_url`` when true,
        ``reach.reach_nxml_url`` otherwise.

    Returns
    -------
    The value returned by ``reach.process_nxml_str``.
    """
    # NOTE(review): ``offline`` here only picks the URL; REACH itself is
    # always called with ``offline=False``. Presumably intentional (the REACH
    # flag looks like it means something different) -- confirm.
    url = reach.local_nxml_url if offline else reach.reach_nxml_url
    return reach.process_nxml_str(
        article_content,
        output_fname=fname,
        offline=False,
        url=url,
    )