def read_pmids(pmids, date):
    """Return extracted INDRA Statements per PMID after running reading on AWS.

    Parameters
    ----------
    pmids : list[str]
        A list of PMIDs to read.
    date : datetime.datetime
        The date and time of this reading run, used to name the PMID file
        submitted for reading.

    Returns
    -------
    dict[str, list[indra.statements.Statement]]
        A dict of PMIDs and the list of Statements extracted for the given
        PMID by reading.
    """
    # Build the date string before it is used in the file name
    # (it was previously referenced before being defined).
    date_str = date.strftime('%Y-%m-%d-%H-%M-%S')
    pmid_fname = 'pmids-%s.txt' % date_str
    with open(pmid_fname, 'wt') as fh:
        fh.write('\n'.join(pmids))
    # Submit the reading jobs and block until they finish
    job_list = submit_reading('emmaa', pmid_fname, ['reach'])
    wait_for_complete('run_reach_queue', job_list, idle_log_timeout=600,
                      kill_on_log_timeout=True)
    # Collect the REACH output for each PMID and process it into Statements
    pmid_stmts = {}
    for pmid in pmids:
        reach_json_str = get_reader_json_str('reach', pmid)
        rp = reach.process_json_str(reach_json_str)
        if not rp:
            pmid_stmts[pmid] = []
        else:
            pmid_stmts[pmid] = rp.statements
    return pmid_stmts
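# A minimal usage sketch for read_pmids, assuming the AWS reading helpers used
# above (submit_reading, wait_for_complete, get_reader_json_str) are importable
# and AWS credentials are configured. The PMIDs below are hypothetical
# placeholders, not values from the source.
import datetime

if __name__ == '__main__':
    example_pmids = ['12345678', '23456789']  # hypothetical PMIDs
    run_date = datetime.datetime.utcnow()
    stmts_by_pmid = read_pmids(example_pmids, run_date)
    for pmid, stmts in stmts_by_pmid.items():
        print('%s: %d statements' % (pmid, len(stmts)))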
def _run_reading(self, db, trids, max_refs=5000):
    # Each submission is split into jobs of at most max_refs text refs;
    # refuse submissions that would spawn 1000 or more jobs.
    if len(trids) / max_refs >= 1000:
        raise ReadingUpdateError("Too many IDs for one submission. "
                                 "Break it up and do it manually.")

    logger.info("Producing readings on aws for %d text refs with new "
                "content not read by %s." % (len(trids), self.reader.name))

    job_prefix = ('%s_reading_%s'
                  % (self.reader.name.lower(),
                     self.run_datetime.strftime('%Y%m%d_%H%M%S')))
    with open(job_prefix + '.txt', 'w') as f:
        f.write('\n'.join(['trid:%s' % trid for trid in trids]))

    logger.info("Submitting jobs...")
    job_ids = submit_db_reading(job_prefix, job_prefix + '.txt',
                                readers=[self.reader.name.lower()],
                                start_ix=0, end_ix=None,
                                pmids_per_job=max_refs, num_tries=2,
                                force_read=False, force_fulltext=False,
                                read_all_fulltext=False,
                                project_name=self.project_name)

    logger.info("Waiting for complete...")
    wait_for_complete('run_db_reading_queue', job_list=job_ids,
                      job_name_prefix=job_prefix,
                      idle_log_timeout=1200,
                      kill_on_log_timeout=True,
                      stash_log_method='s3')
    return
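# A minimal sketch (not from the source) of breaking an oversized set of text
# ref IDs into separate submissions, as the error message above suggests doing
# manually. The helper name, `max_jobs` parameter, and chunking scheme are
# illustrative assumptions; `updater` stands for an instance of the class that
# defines _run_reading.
def submit_in_chunks(updater, db, trids, max_refs=5000, max_jobs=999):
    # Largest submission _run_reading will accept without raising
    chunk_size = max_refs * max_jobs
    for start in range(0, len(trids), chunk_size):
        chunk = trids[start:start + chunk_size]
        updater._run_reading(db, chunk, max_refs=max_refs)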
def test_handler():
    """Test the lambda handler locally."""
    dts = make_date_str()
    key = f'models/test/test_model_{dts}.pkl'
    event = {'Records': [{'s3': {'object': {'key': key}}}]}
    context = None
    res = lambda_handler(event, context)
    print(res)
    assert res['statusCode'] == 200, res
    assert res['result'] == 'SUCCESS', res
    assert res['job_id'], res
    job_id = res['job_id']
    results = {}
    wait_for_complete(QUEUE, job_list=[{'jobId': job_id}],
                      result_record=results)
    print(results)
    assert job_id in [job_def['jobId'] for job_def in results['succeeded']], \
        results['failed']
    s3 = get_s3_client()
    s3_res = s3.list_objects(Bucket='emmaa', Prefix='results/test/' + dts[:10])
    print(s3_res.keys())
    assert s3_res, s3_res
def run_machine(model_path, pmids, belief_threshold, search_genes=None,
                ndex_cred=None, twitter_cred=None, grounding_map=None):
    start_time_local = datetime.datetime.now(tzlocal.get_localzone())
    date_str = make_date_str()

    # Save PMIDs in file and send for remote reading
    if aws_available:
        pmid_fname = 'pmids-%s.txt' % date_str
        all_pmids = []
        for v in pmids.values():
            all_pmids += v
        all_pmids = list(set(all_pmids))
        with open(pmid_fname, 'wt') as fh:
            for pmid in all_pmids:
                fh.write('%s\n' % pmid)
        # Submit reading
        job_list = submit_reading('rasmachine', pmid_fname, ['reach'])
        # Wait for reading to complete
        wait_for_complete('run_reach_queue', job_list, idle_log_timeout=600,
                          kill_on_log_timeout=True)

    # Load the model
    logger.info(time.strftime('%c'))
    logger.info('Loading original model.')
    inc_model_file = os.path.join(model_path, 'model.pkl')
    model = IncrementalModel(inc_model_file)
    # Include search genes as prior genes
    if search_genes:
        model.prior_genes = search_genes
    stats = {}
    logger.info(time.strftime('%c'))
    logger.info('Preassembling original model.')
    model.preassemble(filters=global_filters, grounding_map=grounding_map)
    logger.info(time.strftime('%c'))

    # Original statistics
    stats['orig_stmts'] = len(model.get_statements())
    stats['orig_assembled'] = len(model.assembled_stmts)
    orig_stmts = filter_db_highbelief(model.assembled_stmts, ['bel', 'biopax'],
                                      belief_threshold)
    orig_stmts = ac.filter_top_level(orig_stmts)
    stats['orig_final'] = len(orig_stmts)
    logger.info('%d final statements' % len(orig_stmts))

    # Extend the model with PMIDs
    logger.info('----------------')
    logger.info(time.strftime('%c'))
    logger.info('Extending model.')
    stats['new_papers'], stats['new_abstracts'], stats['existing'] = \
        extend_model(model_path, model, pmids, start_time_local)
    # Having added new statements, we preassemble the model
    model.preassemble(filters=global_filters, grounding_map=grounding_map)

    # New statistics
    stats['new_stmts'] = len(model.get_statements())
    stats['new_assembled'] = len(model.assembled_stmts)
    new_stmts = filter_db_highbelief(model.assembled_stmts, ['bel', 'biopax'],
                                     belief_threshold)
    new_stmts = ac.filter_top_level(new_stmts)
    stats['new_final'] = len(new_stmts)
    logger.info('%d final statements' % len(new_stmts))

    check_pmids(model.get_statements())

    # Save model
    logger.info(time.strftime('%c'))
    logger.info('Saving model')
    model.save(inc_model_file)
    logger.info(time.strftime('%c'))

    # Save a time stamped version of the pickle for backup/diagnostic purposes
    if not aws_available:
        inc_model_bkp_file = os.path.join(model_path,
                                          'model-%s.pkl' % date_str)
        model.save(inc_model_bkp_file)
    else:
        key = 'rasmachine/%s/model-%s.pkl' % (model_path.replace('/', '_'),
                                              date_str)
        s3 = boto3.client('s3')
        s3.upload_file(inc_model_file, 'bigmech', key)

    # Upload the new, final statements to NDEx
    if ndex_cred:
        upload_new_ndex(model_path, new_stmts, ndex_cred)

    # Print and tweet the status message
    logger.info('--- Final statistics ---')
    for k, v in sorted(stats.items(), key=lambda x: x[0]):
        logger.info('%s: %s' % (k, v))
    logger.info('------------------------')

    msg_str = make_status_message(stats)
    if msg_str is not None:
        logger.info('Status message: %s' % msg_str)
        if twitter_cred:
            logger.info('Now tweeting: %s' % msg_str)
            twitter_client.update_status(msg_str, twitter_cred)
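# A minimal sketch (not from the source) of invoking run_machine, assuming the
# surrounding module context (aws_available, global_filters, logger, etc.) is
# in place. The model directory, PMID dict, threshold, and gene names are
# hypothetical; pmids is assumed to map search terms to lists of PMIDs, since
# run_machine iterates over pmids.values().
if __name__ == '__main__':
    example_pmids = {'BRAF': ['12345678', '23456789'],
                     'MAP2K1': ['34567890']}   # hypothetical search results
    run_machine('models/ras_model',            # hypothetical model directory
                example_pmids,
                belief_threshold=0.95,
                search_genes=['BRAF', 'MAP2K1'],
                ndex_cred=None,                # skip NDEx upload in this sketch
                twitter_cred=None)             # skip tweeting in this sketch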
# The opening of this argument definition was truncated in the original
# excerpt; the option name and metavar below are inferred from the later
# `args.timeout` reference and from the help text.
parser.add_argument(
    '--timeout',
    metavar='TIMEOUT',
    type=int,
    help=('If the logs are not updated for %(metavar)s seconds, '
          'print a warning. If the `--kill_on_timeout` flag is set, then '
          'the offending jobs will be automatically terminated.'))
parser.add_argument(
    '--kill_on_timeout', '-K',
    action='store_true',
    help='If a log times out, terminate the offending job.')
parser.add_argument(
    '--stash_log_method', '-l',
    choices=['s3', 'local'],
    metavar='METHOD',
    help=('Select a method from: [%(choices)s] to store the job logs. '
          'If no method is specified, the logs will not be '
          'loaded off of AWS. If \'s3\' is specified, then '
          '`job_name_prefix` must also be given, as this will indicate '
          'where on s3 to store the logs.'))
args = parser.parse_args()

from indra.tools.reading.submit_reading_pipeline import wait_for_complete

job_list = None
if args.job_list is not None:
    job_list = [{'jobId': jid} for jid in args.job_list]

wait_for_complete(args.queue_name, job_list, args.job_name_prefix,
                  args.poll_interval, args.timeout, args.kill_on_timeout,
                  args.stash_log_method)
def run_reading(pmid_fname):
    # `basen` (the job name base) is expected to be defined at module level.
    job_list = submit_reading(basen, pmid_fname, ['reach'],
                              pmids_per_job=2000)
    # wait_for_complete takes the queue name as its first argument in the
    # other calls in this codebase; 'run_reach_queue' matches those calls.
    reading_res = wait_for_complete('run_reach_queue', job_list)
    combine_res = submit_combine(basen, job_list)
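# A minimal sketch (not from the source) of driving run_reading: write a PMID
# file and hand it off. The PMIDs, file name, and `basen` value are
# hypothetical placeholders.
basen = 'example_run'                      # hypothetical job-name base
example_pmids = ['12345678', '23456789']   # hypothetical PMIDs
pmid_fname = 'pmids-example.txt'
with open(pmid_fname, 'wt') as fh:
    fh.write('\n'.join(example_pmids))
run_reading(pmid_fname)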