def on_message(data):
    """Handle one incoming chat message and dispatch 'indra:' commands.

    Relies on module globals (``last_seen_msg_id``, ``stmts``, ``user_id``)
    and helper functions defined elsewhere in this module.  NOTE(review):
    formatting reconstructed from a collapsed source line; the nesting of
    the trailing 'biopax' check and final log print is assumed — confirm
    against the original layout.
    """
    global last_seen_msg_id
    global stmts
    # Skip non-dict payloads and messages we have already handled.
    if isinstance(data, dict) and data['id'] != last_seen_msg_id:
        last_seen_msg_id = data['id']
        # Only react when this bot's user id is among the message targets.
        if {'id': user_id} in data['targets']:
            if data['comment'].startswith('indra:'):
                # Drop the 6-character 'indra:' prefix to get the command.
                text = data['comment'][6:].strip()
                # 'start over' / 'cls' / 'clear': reset the model.
                if text.strip().lower() in ['start over', 'cls', 'clear']:
                    clear_model(data['userName'])
                # Retrieve cached REACH JSON for demo purposes
                elif text.strip().lower().startswith('read pmc4338247'):
                    pmcid = 'PMC4338247'
                    say("%s: Got it. Reading %s via INDRA. This usually "
                        "takes about a minute." % (data['userName'], pmcid))
                    rp = reach.process_json_file('reach_PMC4338247.json')
                    stmts += rp.statements
                    assemble_model(data['userName'])
                # 'read <id>': read the given paper and update the model.
                elif text.strip().lower().startswith('read'):
                    pmcid = text[4:].strip()
                    update_model_from_paper(pmcid, data['userName'])
                # 'remove <arg>': one word removes an agent, more words
                # are treated as a mechanism description.
                elif text.strip().lower().startswith('remove'):
                    remove_arg = text[6:].strip()
                    if len(remove_arg.split(' ')) == 1:
                        remove_agent(remove_arg, data['userName'])
                        print "Remove agent:", remove_arg
                    else:
                        remove_mechanism(remove_arg, data['userName'])
                        print "Remove mechanism:", remove_arg
                # Anything else is natural-language input for the model.
                else:
                    update_model_from_text(text, data['userName'])
            # Literal 'biopax' comment triggers BioPAX assembly.
            if data['comment'] == 'biopax':
                print 'BIOPAX'
                call_biopax()
        # Echo every new message to the console.
        print '<%s> %s' % (data['userName'], data['comment'])
def process_paper(model_name, pmid):
    """Read the paper with the given PMID for a model, using a JSON cache.

    Parameters
    ----------
    model_name : str
        Name of the model; used to locate its 'jsons' cache folder under
        ``model_path``.
    pmid : str
        PubMed ID of the paper to read.

    Returns
    -------
    rp : reach processor or None
        Processor holding the extracted statements, or None if the text
        came back in a format that cannot be read here.
    txt_format : str
        Format of the text that was used: 'existing_json' when a cached
        JSON was found, otherwise whatever ``get_full_text`` reported.
    """
    json_path = os.path.join(model_path, model_name, 'jsons',
                             'PMID%s.json' % pmid)
    # NOTE(review): malformed IDs are only warned about; processing still
    # proceeds (preserving original behavior) — confirm this is intended.
    if pmid.startswith('api') or pmid.startswith('PMID'):
        logger.warning('Invalid PMID: %s' % pmid)
    # If the paper has been read, use the json output file
    if os.path.exists(json_path):
        rp = reach.process_json_file(json_path, citation=pmid)
        txt_format = 'existing_json'
    # If the paper has not been read, download the text and read
    else:
        txt, txt_format = get_full_text(pmid, 'pmid')
        if txt_format == 'pmc_oa_xml':
            rp = reach.process_nxml_str(txt, citation=pmid, offline=True)
            _cache_reach_output(json_path)
        elif txt_format == 'elsevier_xml':
            # Extract the raw text from the Elsevier XML
            txt = elsevier_client.extract_text(txt)
            rp = reach.process_text(txt, citation=pmid, offline=True)
            _cache_reach_output(json_path)
        elif txt_format == 'abstract':
            rp = reach.process_text(txt, citation=pmid, offline=True)
            _cache_reach_output(json_path)
        else:
            rp = None
    if rp is not None:
        check_pmids(rp.statements)
    return rp, txt_format


def _cache_reach_output(json_path):
    """Move offline REACH's default output file ('reach_output.json' in
    the working directory) into the model's JSON cache, if present."""
    if os.path.exists('reach_output.json'):
        shutil.move('reach_output.json', json_path)
from indra import trips, reach from indra.literature import id_lookup from assembly_eval import have_file, run_assembly if __name__ == "__main__": pmc_ids = ["PMC1234335", "PMC3178447", "PMC3690480", "PMC4345513", "PMC534114"] pmids = [id_lookup(pmcid)["pmid"] for pmcid in pmc_ids] for pmid, pmcid in zip(pmids, pmc_ids): print "Processing %s..." % pmcid trips_fname = "trips/" + pmcid + "-20160503T1152.ekb" tp = trips.process_xml(open(trips_fname).read()) for s in tp.statements: for e in s.evidence: e.pmid = pmid reach_fname = "reach/" + pmcid + ".json" rp = reach.process_json_file(reach_fname) all_statements = tp.statements + rp.statements run_assembly(all_statements, "combined", pmcid)
rerun = False # Download the papers if they are not available yet for pmcid in pmc_ids: prefix = folder + "/" + pmcid if not have_file(prefix + ".nxml") and not have_file(prefix + ".txt"): txt, txt_format = get_full_text(pmcid) if txt_format == "nxml": fname = prefix + ".nxml" else: fname = prefix + ".txt" with open(fname, "wt") as fh: fh.write(txt.encode("utf-8")) # Read each paper if it hasn't been read yet. # Otherwise use the existing json extractions. for pmcid, pmid in zip(pmc_ids, pmids): prefix = folder + "/" + pmcid print "Processing %s..." % pmcid # If REACH already processed it then don't run it again if rerun or not have_file(prefix + ".json"): if have_file(prefix + ".txt"): txt = open(prefix + ".txt").read().decode("utf-8") rp = reach.process_text(txt, citation=pmid, offline=True) elif have_file(prefix + ".nxml"): rp = reach.process_nxml_file(prefix + ".nxml", citation=pmid, offline=True) shutil.move("reach_output.json", prefix + ".json") else: rp = reach.process_json_file(prefix + ".json", citation=pmid) run_assembly(rp.statements, folder, pmcid)
# Download the papers if they are not available yet, collecting the
# corresponding PMIDs along the way.
for pmcid in pmc_ids:
    prefix = folder + '/' + pmcid
    if not have_file(prefix + '.nxml') and\
        not have_file(prefix + '.txt'):
        txt, txt_format = get_full_text(pmcid)
        if txt_format == 'nxml':
            fname = prefix + '.nxml'
        else:
            fname = prefix + '.txt'
        with open(fname, 'wt') as fh:
            fh.write(txt.encode('utf-8'))
    pmids.append(id_lookup(pmcid)['pmid'])
# Read each paper if it hasn't been read yet.
# Otherwise use the existing json extractions.
for pmcid, pmid in zip(pmc_ids, pmids):
    prefix = folder + '/' + pmcid
    print 'Processing %s...' % pmcid
    # If REACH already processed it then don't run it again
    if rerun or not have_file(prefix + '.json'):
        if have_file(prefix + '.txt'):
            txt = open(prefix + '.txt').read().decode('utf-8')
            # NOTE(review): no offline=True here and the result is never
            # cached to prefix + '.json', so txt papers appear to be
            # re-read on every run — confirm whether this is intended.
            rp = reach.process_text(txt, citation=pmid)
        elif have_file(prefix + '.nxml'):
            rp = reach.process_nxml_file(prefix + '.nxml',
                                         citation=pmid)
            # Stash REACH's default output file as this paper's cached
            # JSON; NOTE(review): this move is unconditional and will
            # raise if 'reach_output.json' was not produced.
            shutil.move('reach_output.json', prefix + '.json')
    else:
        rp = reach.process_json_file(prefix + '.json', citation=pmid)
    run_assembly(rp.statements, folder, pmcid)
import sys import pickle from indra import reach from indra.assemblers import GraphAssembler if len(sys.argv) < 2: process_type = 'text' else: process_type = sys.argv[1] if process_type == 'text': txt = open('ras_pathway.txt', 'rt').read() rp = reach.process_text(txt, offline=True) st = rp.statements elif process_type == 'json': rp = reach.process_json_file('reach_output.json') st = rp.statements else: st = pickle.load(open('statements.pkl', 'rb')) for s in st: print '%s\t%s' % (s, s.evidence[0].text) graphpr = {'rankdir': 'TD'} nodepr = {'fontsize': 12, 'shape': 'plaintext', 'margin': '0,0', 'pad': 0} ga = GraphAssembler(st, graph_properties=graphpr, node_properties=nodepr) ga.make_model() ga.save_dot('ras_pathway.dot') ga.save_pdf('ras_pathway.pdf')