def process_paper(model_name, pmid):
    json_path = os.path.join(model_name, 'jsons', 'PMID%s.json' % pmid)
    if pmid.startswith('api') or pmid.startswith('PMID'):
        logger.warning('Invalid PMID: %s' % pmid)
    # If the paper has been read, use the json output file
    if os.path.exists(json_path):
        rp = reach.process_json_file(json_path, citation=pmid)
        txt_format = 'existing_json'
    # If the paper has not been read, download the text and read
    else:
        try:
            txt, txt_format = get_full_text(pmid, 'pmid')
        except Exception:
            return None, None
        if txt_format == 'pmc_oa_xml':
            rp = reach.process_nxml_str(txt, citation=pmid, offline=True)
            if os.path.exists('reach_output.json'):
                shutil.move('reach_output.json', json_path)
        elif txt_format == 'elsevier_xml':
            # Extract the raw text from the Elsevier XML
            txt = elsevier_client.extract_text(txt)
            rp = reach.process_text(txt, citation=pmid, offline=True)
            if os.path.exists('reach_output.json'):
                shutil.move('reach_output.json', json_path)
        elif txt_format == 'abstract':
            rp = reach.process_text(txt, citation=pmid, offline=True)
            if os.path.exists('reach_output.json'):
                shutil.move('reach_output.json', json_path)
        else:
            rp = None
    if rp is not None:
        check_pmids(rp.statements)
    return rp, txt_format
def get_upload_content(pmid, force_fulltext_lookup=False):
    """Get full text and/or abstract for paper and upload to S3."""
    # Make sure that the PMID doesn't start with PMID so that it doesn't
    # screw up the literature clients
    if pmid.startswith('PMID'):
        pmid = pmid[4:]
    # First, check S3:
    (ft_content_s3, ft_content_type_s3) = get_full_text(pmid)
    # The abstract is on S3 but there is no full text; if we're not forcing
    # fulltext lookup, then we're done
    if ft_content_type_s3 == 'abstract' and not force_fulltext_lookup:
        return (ft_content_s3, ft_content_type_s3)
    # If there's nothing (not even an abstract) on S3, or if there's an
    # abstract and we're forcing fulltext lookup, do the lookup
    elif ft_content_type_s3 is None or \
            (ft_content_type_s3 == 'abstract' and force_fulltext_lookup) or \
            (ft_content_type_s3 == 'elsevier_xml' and
                not elsevier_client.extract_text(ft_content_s3)):
        # FIXME FIXME FIXME
        if ft_content_type_s3 == 'elsevier_xml':
            logger.info('elsevier_xml for %s missing full text element, '
                        'getting again.' % pmid)
        # FIXME FIXME FIXME
        # Try to retrieve from literature client
        logger.info("PMID%s: getting content using literature client" % pmid)
        (ft_content, ft_content_type) = lit.get_full_text(pmid, 'pmid')
        assert ft_content_type in \
            ('pmc_oa_xml', 'elsevier_xml', 'abstract', None)
        # If we tried to get the full text and didn't even get the abstract,
        # then there was probably a problem with the web service or the DOI
        if ft_content_type is None:
            return (None, None)
        # If we got the abstract, and we already had the abstract on S3,
        # then do nothing
        elif ft_content_type == 'abstract' and \
                ft_content_type_s3 == 'abstract':
            logger.info("PMID%s: found abstract but already had it on "
                        "S3; skipping" % pmid)
            return (ft_content, ft_content_type)
        # If we got the abstract, and we had nothing on S3, then upload
        elif ft_content_type == 'abstract' and ft_content_type_s3 is None:
            logger.info("PMID%s: found abstract, uploading to S3" % pmid)
            put_abstract(pmid, ft_content)
            return (ft_content, ft_content_type)
        # We got a full text (or something other than None or abstract...)
        else:
            logger.info("PMID%s: uploading %s" % (pmid, ft_content_type))
            put_full_text(pmid, ft_content, full_text_type=ft_content_type)
            return (ft_content, ft_content_type)
    # Some form of full text is already on S3
    else:
        # TODO: In future, could check for abstract even if full text is
        # found, and upload it just to have it
        return (ft_content_s3, ft_content_type_s3)
    # We should always return before we get here
    assert False
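
# A minimal usage sketch for the function above (assumes S3 access and the
# put_abstract/put_full_text helpers are configured): force a fresh
# full-text lookup even when an abstract is already cached on S3. The PMID
# is one used in the tests below.
content, content_type = get_upload_content('27075779',
                                           force_fulltext_lookup=True)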
def process_paper(model_name, pmid): """Process a paper with the given pubmed identifier Parameters ---------- model_name : str The directory for the INDRA machine pmid : str The PMID to process. Returns ------- rp : ReachProcessor A ReachProcessor containing the extracted INDRA Statements in rp.statements. txt_format : str A string representing the format of the text """ json_directory = os.path.join(model_name, 'jsons') json_path = os.path.join(json_directory, 'PMID%s.json' % pmid) if pmid.startswith('api') or pmid.startswith('PMID'): logger.warning('Invalid PMID: %s' % pmid) # If the paper has been read, use the json output file if os.path.exists(json_path): rp = reach.process_json_file(json_path, citation=pmid) txt_format = 'existing_json' # If the paper has not been read, download the text and read else: try: txt, txt_format = get_full_text(pmid, 'pmid') except Exception: return None, None if txt_format == 'pmc_oa_xml': rp = reach.process_nxml_str(txt, citation=pmid, offline=True, output_fname=json_path) elif txt_format == 'elsevier_xml': # Extract the raw text from the Elsevier XML txt = elsevier_client.extract_text(txt) rp = reach.process_text(txt, citation=pmid, offline=True, output_fname=json_path) elif txt_format == 'abstract': rp = reach.process_text(txt, citation=pmid, offline=True, output_fname=json_path) else: rp = None if rp is not None: check_pmids(rp.statements) return rp, txt_format
def process_paper_aws(pmid, start_time_local):
    try:
        metadata, content_type = get_full_text(pmid, metadata=True)
    except Exception as e:
        logger.error('Could not get content from S3: %s' % e)
        return None, None
    logger.info('Downloading %s output from AWS' % pmid)
    reach_json_str = get_reader_json_str('reach', pmid)
    if not reach_json_str:
        logger.info('Could not get output.')
        return None, content_type
    rp = reach.process_json_str(reach_json_str)

    current_time_local = datetime.datetime.now(tzlocal.get_localzone())
    dt_script = current_time_local - start_time_local
    last_mod_remote = metadata['LastModified']
    dt = (current_time_local - last_mod_remote)
    # If the remote output was not modified since the script started, it
    # predates this run and counts as existing JSON
    if dt > dt_script:
        content_type = 'existing_json'
    return rp, content_type
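
# Usage sketch: record the script's start time once (with tzlocal, as the
# function above does) and reuse it so process_paper_aws can distinguish
# freshly generated REACH output from pre-existing JSON on S3.
import datetime
import tzlocal

start_time_local = datetime.datetime.now(tzlocal.get_localzone())
rp, content_type = process_paper_aws('27075779', start_time_local)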
def test_get_full_text_pmc():
    txt, txt_format = get_full_text('PMC4322985', 'pmcid')
    assert txt_format == 'pmc_oa_xml'
    assert len(txt) > 300000
    assert unicode_strs((txt, txt_format))


def test_get_full_text_pubmed_abstract():
    # DOI lookup in CrossRef fails for this one because of page mismatch
    txt, txt_format = get_full_text('27075779', 'pmid')
    assert txt_format == 'abstract'
    assert len(txt) > 800
    assert unicode_strs((txt, txt_format))


def test_get_full_text_doi():
    txt, txt_format = get_full_text('10.18632/oncotarget.2555', 'doi')
    assert txt_format == 'pmc_oa_xml'
    assert len(txt) > 300000
    assert unicode_strs((txt, txt_format))
def test_gene_network():
    # Chunk 1: this is tested in _get_gene_network_stmts, which provides
    # the gn_stmts used in Chunk 6
    # from indra.tools.gene_network import GeneNetwork
    # gn = GeneNetwork(['H2AX'])
    # biopax_stmts = gn.get_biopax_stmts()
    # bel_stmts = gn.get_bel_stmts()

    # Chunk 2
    from indra import literature
    pmids = literature.pubmed_client.get_ids_for_gene('H2AX')

    # Chunk 3
    from indra import literature
    paper_contents = {}
    for pmid in pmids:
        content, content_type = literature.get_full_text(pmid, 'pmid')
        if content_type == 'abstract':
            paper_contents[pmid] = content
        if len(paper_contents) == 5:  # Is 10 in the actual code
            break

    # Chunk 4
    from indra.sources import reach

    literature_stmts = []
    for pmid, content in paper_contents.items():
        rp = reach.process_text(content, url=reach.local_text_url)
        literature_stmts += rp.statements
    print('Got %d statements' % len(literature_stmts))
    assert literature_stmts  # replaces a print statement

    # Chunk 6
    from indra.tools import assemble_corpus as ac
    # stmts = biopax_stmts + bel_stmts + literature_stmts  # tested elsewhere
    stmts = gn_stmts + literature_stmts  # Added instead of the above line

    stmts = ac.map_grounding(stmts)
    stmts = ac.map_sequence(stmts)
    stmts = ac.run_preassembly(stmts)
    assert stmts

    # Chunk 7
    from indra.assemblers.cx import CxAssembler
    from indra.databases import ndex_client
    cxa = CxAssembler(stmts)
    cx_str = cxa.make_model()
    assert cx_str

    # Chunk 8
    # ndex_cred = {'user': '******', 'password': '******'}
    # network_id = ndex_client.create_network(cx_str, ndex_cred)
    # print(network_id)

    # Chunk 9
    from indra.assemblers.indranet import IndraNetAssembler
    indranet_assembler = IndraNetAssembler(statements=stmts)
    indranet = indranet_assembler.make_model()
    assert len(indranet.nodes) > 0, 'indranet contains no nodes'
    assert len(indranet.edges) > 0, 'indranet contains no edges'

    # Chunk 10
    import networkx as nx
    paths = nx.single_source_shortest_path(G=indranet, source='H2AX',
                                           cutoff=1)
    assert paths

    # Chunk 11
    from indra.assemblers.pysb import PysbAssembler
    pysb = PysbAssembler(statements=stmts)
    pysb_model = pysb.make_model()
    assert pysb_model
# Exploring PubMed search
from indra import literature
from indra.sources import reach

# Some constants
retmax = 100000
query = 'Magnesium AND CKD'
# Other candidate queries: 'elevated homocysteine', 'cardiac arrest',
# 'atrial fibrillation'; the intention is to run against an array of
# query strings

# Code
pmids = literature.pubmed_client.get_ids(query, retmax=retmax)
print('Got %d pmids' % len(pmids))
# print(pmids)

paper_contents = {}
for pmid in pmids:
    content, content_type = literature.get_full_text(pmid, 'pmid')
    if content_type == 'abstract':
        paper_contents[pmid] = content
    if len(paper_contents) == 10:
        break

# TODO: need to move the analysis up into the above loop in order to add
# the PMID to the statements made from REACH so we can track the document
# ID. WAIT! Below, we know what the pmid is; we can use that.
literature_stmts = []
for pmid, content in paper_contents.items():
    print(pmid)
    print(content)
    rp = reach.process_text(content, url=reach.local_text_url)
    # TODO: add a for loop here which takes all statements, adds the pmid
    # to each statement, then adds that statement to literature_stmts.
    # It turns out that doesn't work directly: each element is a Statement
    # object, not JSON.
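
# A possible way to address the TODO above, following the citation=pmid
# pattern that process_paper uses earlier in this listing (this assumes
# reach.process_text accepts citation alongside url): REACH then stamps
# each extracted statement's Evidence objects with the PMID, so the source
# document can be tracked per statement without post-processing.
literature_stmts = []
for pmid, content in paper_contents.items():
    rp = reach.process_text(content, url=reach.local_text_url,
                            citation=pmid)
    # Each statement's evidence now carries the source PMID in ev.pmid
    literature_stmts += rp.statements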
from indra import reach
from indra.literature import pmc_client, get_full_text, id_lookup
from assembly_eval import have_file, run_assembly

if __name__ == "__main__":
    folder = "reach"
    pmc_ids = [s.strip() for s in open("pmcids.txt", "rt").readlines()]
    pmids = [id_lookup(pmcid)["pmid"] for pmcid in pmc_ids]
    # Set to True only if reading should be run again
    rerun = False

    # Download the papers if they are not available yet
    for pmcid in pmc_ids:
        prefix = folder + "/" + pmcid
        if not have_file(prefix + ".nxml") and \
                not have_file(prefix + ".txt"):
            txt, txt_format = get_full_text(pmcid)
            if txt_format == "nxml":
                fname = prefix + ".nxml"
            else:
                fname = prefix + ".txt"
            with open(fname, "wt", encoding="utf-8") as fh:
                fh.write(txt)

    # Read each paper if it hasn't been read yet.
    # Otherwise use the existing json extractions.
    for pmcid, pmid in zip(pmc_ids, pmids):
        prefix = folder + "/" + pmcid
        print("Processing %s..." % pmcid)
        # If REACH already processed it then don't run it again
        if rerun or not have_file(prefix + ".json"):
            if have_file(prefix + ".txt"):