def get_statements(self, reprocess=False):
    """Return INDRA Statements for this content, building them if needed.

    Results are cached on the instance; pass ``reprocess=True`` to force
    the content to be run through the reader's processor again.
    """
    if self._statements is not None and not reprocess:
        return self._statements[:]

    # Nothing to process: cache and return an empty list.
    if self.content is None:
        self._statements = []
        return []

    # Dispatch to the processor matching the reader that made this content.
    if self.reader == ReachReader.name:
        processor = reach.process_json_str(json.dumps(self.content))
    elif self.reader == SparserReader.name:
        processor = sparser.process_json_dict(self.content)
        if processor is not None:
            processor.set_statements_pmid(None)
    elif self.reader == TripsReader.name:
        processor = trips.process_xml(self.content)
    else:
        raise ReadingError("Unknown reader: %s." % self.reader)

    # Get the statements from the processor, if it was resolved.
    if processor is None:
        logger.error("Production of statements from %s failed for %s."
                     % (self.reader, self.content_id))
        stmts = []
    else:
        stmts = processor.statements
    self._statements = stmts[:]
    return stmts
def get_statements(self):
    """General method to create statements.

    Returns
    -------
    list[indra.statements.Statement]
        The statements extracted from this reading's content, or an
        empty list if no processor could be produced.

    Raises
    ------
    ReadingError
        If the content format does not match the reader, or the reader
        itself is not recognized.
    """
    logger.debug("Making statements from %s." % self.reading_id)
    if self.reader == ReachReader.name:
        if self.format == formats.JSON:
            # Process the reach json into statements.
            json_str = json.dumps(self.content)
            processor = reach.process_json_str(json_str)
        else:
            raise ReadingError("Incorrect format for Reach output: %s."
                               % self.format)
    elif self.reader == SparserReader.name:
        if self.format == formats.JSON:
            # Process the sparser content into statements
            processor = sparser.process_json_dict(self.content)
        else:
            raise ReadingError(
                "Sparser should only ever be JSON, not %s." % self.format)
    else:
        # Previously an unrecognized reader fell through with `processor`
        # unbound, turning the check below into an UnboundLocalError.
        # Raise a clear error instead, matching the sibling method.
        raise ReadingError("Unknown reader: %s." % self.reader)

    if processor is None:
        logger.error("Production of statements from %s failed for %s."
                     % (self.reader, self.tcid))
        stmts = []
    else:
        processor.set_statements_pmid(None)
        stmts = processor.statements
    return stmts
def get_statements(self, reprocess=False):
    """Get the INDRA Statements for this reading, caching the result.

    Set ``reprocess=True`` to bypass the cache and re-run the
    appropriate processor over the stored content.
    """
    if self._statements is not None and not reprocess:
        return self._statements[:]

    if self.content is None:
        # No content means no statements; cache the empty result.
        self._statements = []
        return []

    # Choose a processor based on which reader produced the content.
    if self.reader == ReachReader.name:
        proc = reach.process_json_str(json.dumps(self.content))
    elif self.reader == SparserReader.name:
        proc = sparser.process_json_dict(self.content)
        if proc is not None:
            proc.set_statements_pmid(None)
    elif self.reader == TripsReader.name:
        proc = trips.process_xml(self.content)
    else:
        raise ReadingError("Unknown reader: %s." % self.reader)

    if proc is None:
        logger.error(
            "Production of statements from %s failed for %s."
            % (self.reader, self.content_id))
        stmts = []
    else:
        stmts = proc.statements
    self._statements = stmts[:]
    return stmts
def read_pmids(pmids, date):
    """Return extracted INDRA Statements per PMID after running reading on
    AWS.

    Parameters
    ----------
    pmids : list[str]
        A list of PMIDs to read.
    date : datetime
        The date and time associated with the reading; used to name the
        PMID list file uploaded for the reading job.

    Returns
    -------
    dict[str, list[indra.statements.Statement]]
        A dict of PMIDs and the list of Statements extracted for the given
        PMID by reading.
    """
    # The date string must be built BEFORE the file name uses it; the
    # original assigned it after submit_reading, causing a NameError.
    date_str = date.strftime('%Y-%m-%d-%H-%M-%S')
    pmid_fname = 'pmids-%s.txt' % date_str
    with open(pmid_fname, 'wt') as fh:
        fh.write('\n'.join(pmids))
    job_list = submit_reading('emmaa', pmid_fname, ['reach'])
    wait_for_complete('run_reach_queue', job_list, idle_log_timeout=600,
                      kill_on_log_timeout=True)
    pmid_stmts = {}
    for pmid in pmids:
        reach_json_str = get_reader_json_str('reach', pmid)
        # Reading may have produced no output for this PMID.
        if reach_json_str is None:
            pmid_stmts[pmid] = []
            continue
        rp = reach.process_json_str(reach_json_str)
        if not rp:
            pmid_stmts[pmid] = []
        else:
            pmid_stmts[pmid] = rp.statements
    return pmid_stmts
def reach_process_json():
    """Process REACH json and return INDRA Statements."""
    if request.method == 'OPTIONS':
        return {}
    raw = request.body.read().decode('utf-8')
    payload = json.loads(raw)
    rp = reach.process_json_str(payload.get('json'))
    return _stmts_from_proc(rp)
def process_reach_str(reach_json_str, pmid):
    """Run the REACH processor on a JSON string and return its statements.

    Returns an empty list when processing raises, so a single bad paper
    does not abort a batch run.
    """
    if reach_json_str is None:
        raise ValueError('reach_json_str cannot be None')
    try:
        # Run the REACH processor on the JSON
        reach_proc = reach.process_json_str(reach_json_str, citation=pmid)
    except Exception as e:
        # If there's a problem, skip it
        print("Exception processing %s" % pmid)
        print(e)
        return []
    return reach_proc.statements
def reach_process_json():
    """Process REACH json and return INDRA Statements."""
    if request.method == 'OPTIONS':
        return {}
    body = json.loads(request.body.read().decode('utf-8'))
    rp = reach.process_json_str(body.get('json'))
    # Serialize the extracted statements, if any, for the response.
    if rp and rp.statements:
        return {'statements': stmts_to_json(rp.statements)}
    return {'statements': []}
def post(self):
    """Process REACH json and return INDRA Statements.

    Parameters
    ----------
    json : str
        The json string to be processed.

    Returns
    -------
    statements : list[indra.statements.Statement.to_json()]
        A list of extracted INDRA Statements.
    """
    json_str = request.json.get('json')
    processor = reach.process_json_str(json_str)
    return _stmts_from_proc(processor)
def process_paper_aws(pmid, start_time_local):
    """Fetch REACH output for a PMID from AWS and build a processor.

    Returns a ``(processor, content_type)`` pair; either element may be
    None if the content or the reader output could not be retrieved.
    """
    try:
        # Fetch full-text metadata (including LastModified) from S3.
        metadata, content_type = get_full_text(pmid, metadata=True)
    except Exception as e:
        logger.error('Could not get content from S3: %s' % e)
        return None, None
    logger.info('Downloading %s output from AWS' % pmid)
    reach_json_str = get_reader_json_str('reach', pmid)
    if not reach_json_str:
        logger.info('Could not get output.')
        return None, content_type
    rp = reach.process_json_str(reach_json_str)
    current_time_local = datetime.datetime.now(tzlocal.get_localzone())
    # Time elapsed since this script started running.
    dt_script = current_time_local - start_time_local
    # NOTE(review): presumably metadata['LastModified'] is a tz-aware
    # datetime from S3 — confirm, otherwise this subtraction raises.
    last_mod_remote = metadata['LastModified']
    dt = (current_time_local - last_mod_remote)
    # If it was not modified since the script started
    if dt > dt_script:
        content_type = 'existing_json'
    return rp, content_type
def read_pmids(pmids, date):
    """Run AWS reading on the given PMIDs and collect INDRA Statements.

    Parameters
    ----------
    pmids : list[str]
        A list of PMIDs to read.
    date : datetime
        The date and time associated with the reading, typically the
        current time; used to name the uploaded PMID list file.

    Returns
    -------
    dict[str, list[indra.statements.Statement]]
        A mapping from each PMID to the Statements extracted for it
        by reading.
    """
    stamp = date.strftime('%Y-%m-%d-%H-%M-%S')
    pmid_fname = 'pmids-%s.txt' % stamp
    with open(pmid_fname, 'wt') as fh:
        fh.write('\n'.join(pmids))
    job_list = submit_reading('emmaa', pmid_fname, ['reach'])
    monitor = BatchMonitor('run_reach_queue', job_list)
    monitor.watch_and_wait(idle_log_timeout=600, kill_on_log_timeout=True)
    pmid_stmts = {}
    for pmid in pmids:
        json_str = get_reader_json_str('reach', pmid)
        if json_str is None:
            # No reader output for this PMID.
            pmid_stmts[pmid] = []
            continue
        rp = reach.process_json_str(json_str)
        pmid_stmts[pmid] = rp.statements if rp else []
    return pmid_stmts
pmid_list_file = sys.argv[1]
start_ix = int(sys.argv[2])
end_ix = int(sys.argv[3])

# Load the list of PMIDs from the given file
with open(pmid_list_file) as f:
    pmid_list = [line.strip('\n') for line in f]
end_ix = min(end_ix, len(pmid_list))

stmts = {}
for ix, pmid in enumerate(pmid_list[start_ix:end_ix]):
    reach_json_str = s3_client.get_reader_json_str('reach', pmid)
    # Logging message will have been produced by get_reach_output
    if reach_json_str is None:
        continue
    try:
        # Run the REACH processor on the JSON
        logger.info('%d: Processing %s' % (ix, pmid))
        reach_proc = reach.process_json_str(reach_json_str, citation=pmid)
    except Exception as e:
        # If there's a problem, skip it
        print("Exception processing %s" % pmid)
        print(e)
        continue
    stmts[pmid] = reach_proc.statements

# Persist with protocol 2 for Python 2 compatibility.
with open('reach_stmts_%d_%d.pkl' % (start_ix, end_ix), 'wb') as f:
    pickle.dump(stmts, f, protocol=2)
def parse_results(content):
    """Serialize the content to JSON and run the REACH processor over it."""
    return reach.process_json_str(json.dumps(content))
pmid_list_file = sys.argv[1]
start_ix = int(sys.argv[2])
end_ix = int(sys.argv[3])

# Load the list of PMIDs from the given file
with open(pmid_list_file) as f:
    pmid_list = [line.strip('\n') for line in f]
if end_ix > len(pmid_list):
    end_ix = len(pmid_list)

stmts = {}
for ix, pmid in enumerate(pmid_list[start_ix:end_ix]):
    reach_json_str = s3_client.get_reader_json_str('reach', pmid)
    # Logging message will have been produced by get_reach_output
    if reach_json_str is None:
        continue
    try:
        # Run the REACH processor on the JSON
        logger.info('%d: Processing %s' % (ix, pmid))
        reach_proc = reach.process_json_str(reach_json_str, citation=pmid)
    except Exception as e:
        # If there's a problem, skip it
        print("Exception processing %s" % pmid)
        print(e)
        continue
    stmts[pmid] = reach_proc.statements

# Dump with the default pickle protocol.
with open('reach_stmts_%d_%d.pkl' % (start_ix, end_ix), 'wb') as f:
    pickle.dump(stmts, f)
def get_processor(content):
    """Return a REACH processor built from the JSON-serialized content."""
    serialized = json.dumps(content)
    return reach.process_json_str(serialized)