def test_produce_readings():
    "Comprehensive test of the high level production of readings."
    # Prep the inputs.
    db = get_db_with_pubmed_content()
    complete_tr_list = db.select_all(db.TextRef)
    id_dict = get_id_dict(complete_tr_list)

    # Test with just sparser for tolerable speeds.
    reader_list = get_readers('SPARSER')

    # Test the read_mode='none' option (should yield nothing, because there
    # aren't any readings yet).
    outputs_0 = rdb.produce_readings(id_dict, reader_list, verbose=True,
                                     db=db, read_mode='none')
    assert len(outputs_0) == 0

    # Test just getting a pickle file (nothing should be posted to the db).
    pkl_file = 'test_db_res.pkl'
    outputs_1 = rdb.produce_readings(id_dict, reader_list, verbose=True,
                                     db=db, no_upload=True,
                                     pickle_file=pkl_file)
    N_out = len(outputs_1)
    N_exp = len(reader_list)*db.filter_query(db.TextContent).count()
    assert N_out == N_exp, "Expected %d readings, got %d." % (N_exp, N_out)
    assert path.exists(pkl_file), "Pickle file not created."
    with open(pkl_file, 'rb') as f:
        N_pkl = len(pickle.load(f))
    assert N_pkl == N_exp, \
        "Expected %d readings in pickle, got %d." % (N_exp, N_pkl)
    N_readings = db.filter_query(db.Readings).count()
    assert N_readings == 0, \
        "There shouldn't be any readings yet, but found %d." % N_readings

    # Test reading and inserting into the database.
    rdb.produce_readings(id_dict, reader_list, verbose=True, db=db)
    N_db = db.filter_query(db.Readings).count()
    assert N_db == N_exp, "Expected %d readings, got %d." % (N_exp, N_db)

    # Test reading again, without read_mode='all' (the pre-existing readings
    # should be returned from the database, so they have reading ids).
    outputs_2 = rdb.produce_readings(id_dict, reader_list, verbose=True,
                                     db=db)
    assert len(outputs_2) == N_exp, \
        "Got %d readings, expected %d." % (len(outputs_2), N_exp)
    assert all([rd.reading_id is not None for rd in outputs_2])

    # Test with read_mode='none' again.
    outputs_3 = rdb.produce_readings(id_dict, reader_list, verbose=True,
                                     db=db, read_mode='none')
    assert len(outputs_3) == N_exp
    assert all([rd.reading_id is not None for rd in outputs_3])

    # Test read_mode='all' (everything is re-read, so the results are fresh
    # and have not yet been assigned reading ids from the database).
    outputs_4 = rdb.produce_readings(id_dict, reader_list, verbose=True,
                                     db=db, read_mode='all')
    assert len(outputs_4) == N_exp
    assert all([rd.reading_id is None for rd in outputs_4])
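# --- Hedged sketch, not part of the original test module ---
# The test above writes 'test_db_res.pkl' into the working directory and does
# not delete it. Assuming the tests are run under pytest (an assumption; the
# runner is not shown here), an autouse fixture like the one below could clean
# up the artifact after each test. The name '_cleanup_pickle_file' is
# hypothetical.
import os
from os import path

import pytest


@pytest.fixture(autouse=True)
def _cleanup_pickle_file():
    # Run the test first, then remove the pickle file if it was created.
    yield
    if path.exists('test_db_res.pkl'):
        os.remove('test_db_res.pkl')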
def _run_reading(self, db, trids, max_refs=5000):
    if len(trids) > max_refs:
        raise ReadingUpdateError("Too many ids to run locally. Try "
                                 "running on batch (use_batch).")
    logger.info("Producing readings locally for %d new text refs."
                % len(trids))
    base_dir = path.join(THIS_DIR, 'read_all_%s' % self.reader.name)
    reader_inst = self.reader(base_dir=base_dir, n_proc=self.n_proc)

    logger.info("Making readings...")
    outputs = rdb.produce_readings({'trid': trids}, [reader_inst],
                                   read_mode='unread_unread', db=db,
                                   prioritize=True, verbose=self.verbose)
    logger.info("Made %d readings." % len(outputs))

    logger.info("Making statements...")
    rdb.produce_statements(outputs, n_proc=self.n_proc, db=db)
    return
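# --- Hedged sketch, not part of the original manager class ---
# _run_reading above refuses to run locally when len(trids) exceeds max_refs.
# One way a caller could stay under that limit is to split the id list into
# batches and run each batch in turn. 'batch_trids' is a hypothetical helper,
# not an existing function in this module.
def batch_trids(trids, batch_size=5000):
    """Yield successive batches of at most batch_size text ref ids."""
    for start in range(0, len(trids), batch_size):
        yield trids[start:start + batch_size]

# Hypothetical usage, given some manager instance `rm` and database handle
# `db`:
#     for batch in batch_trids(all_trids):
#         rm._run_reading(db, batch)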
    if e.response['Error']['Code'] == 'NoSuchKey':
        logger.info('Could not find PMID list file at %s, exiting'
                    % id_list_key)
        sys.exit(1)
    # If there was some other kind of problem, re-raise the exception
    else:
        raise e

# Get the content from the object
id_list_str = id_list_obj['Body'].read().decode('utf8').strip()
id_str_list = id_list_str.splitlines()[args.start_index:args.end_index]
random.shuffle(id_str_list)
id_dict = get_id_dict([line.strip() for line in id_str_list])

# Read everything ========================================
outputs = produce_readings(id_dict, readers, verbose=True,
                           read_mode=args.mode,
                           force_fulltext=args.force_fulltext,
                           prioritize=(not args.read_all_fulltext))

# Preserve the sparser logs
contents = os.listdir('.')
sparser_logs = [fname for fname in contents
                if fname.startswith('sparser') and fname.endswith('log')]
sparser_log_dir = ('reading_results/%s/logs/run_db_reading_queue/'
                   'sparser_logs_%s/') % (
                       args.basename,
                       datetime.now().strftime('%Y%m%d_%H%M%S')
                   )
for fname in sparser_logs:
    s3_key = sparser_log_dir + fname
    logger.info("Saving sparser logs to %s on s3 in %s."
                % (s3_key, bucket_name))
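# --- Hedged sketch, not part of the original script ---
# The 'NoSuchKey' handling at the top of this fragment presumably sits inside
# a try/except around an S3 get_object call. A minimal, self-contained version
# of that pattern is shown below; the client variable name, bucket, and key
# values are hypothetical placeholders, not the script's real ones.
import sys

import boto3
from botocore.exceptions import ClientError

s3_client = boto3.client('s3')
example_bucket = 'example-reading-bucket'   # hypothetical
example_key = 'reading_inputs/pmid_list'    # hypothetical

try:
    id_list_obj = s3_client.get_object(Bucket=example_bucket,
                                       Key=example_key)
except ClientError as e:
    if e.response['Error']['Code'] == 'NoSuchKey':
        # The id list was never uploaded, so there is nothing to read.
        sys.exit(1)
    else:
        # Any other S3 failure is unexpected; re-raise it.
        raise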
# Get a handle for the database
if args.test:
    from indra.db import util as dbu
    db = dbu.get_test_db()
else:
    db = None
s3_log_prefix = ('reading_results/%s/logs/run_db_reading_queue/%s/'
                 % (args.basename, args.job_name))

# Read everything ========================================
starts['reading'] = datetime.now()
outputs = produce_readings(id_dict, readers, verbose=True,
                           read_mode=args.read_mode,
                           get_preexisting=(args.stmt_mode == 'all'),
                           force_fulltext=args.force_fulltext,
                           prioritize=args.use_best_fulltext,
                           db=db)
ends['reading'] = datetime.now()

# Preserve the sparser logs
contents = os.listdir('.')
sparser_logs = [fname for fname in contents
                if fname.startswith('sparser') and fname.endswith('log')]
sparser_log_dir = s3_log_prefix + 'sparser_logs/'
for fname in sparser_logs:
    s3_key = sparser_log_dir + fname
    logger.info("Saving sparser logs to %s on s3 in %s."
                % (s3_key, bucket_name))
    with open(fname, 'r') as f:
        client.put_object(Key=s3_key, Body=f.read(),