def _run_reading(self, db, trids, max_refs=5000):
    """Read the content for the given text ref ids on this machine.

    If there are more than `max_refs` ids, a ReadingUpdateError is
    raised, since such a load should be run on batch instead.
    """
    if len(trids) > max_refs:
        raise ReadingUpdateError("Too many id's to run locally. Try "
                                 "running on batch (use_batch).")
    logger.info("Producing readings locally for %d new text refs."
                % len(trids))

    # Instantiate the reader, directing its output to a dedicated dir.
    output_dir = path.join(THIS_DIR, 'read_all_%s' % self.reader.name)
    reader_obj = self.reader(base_dir=output_dir, n_proc=self.n_proc)

    # Produce and store the readings for the given text refs.
    logger.info("Making readings...")
    reading_results = rdb.produce_readings(
        {'trid': trids}, [reader_obj], read_mode='unread_unread', db=db,
        prioritize=True, verbose=self.verbose
        )
    logger.info("Made %d readings." % len(reading_results))

    # Extract statements from the new readings.
    logger.info("Making statements...")
    rdb.produce_statements(reading_results, n_proc=self.n_proc, db=db)
    return
def test_reading_content_insert():
    "Test the content primary through-put of make_db_readings."
    db = get_db_with_pubmed_content()

    print("Test reading")
    tc_list = db.select_all(db.TextContent)
    readers = get_readers()
    reading_output = []
    for reader in readers:
        reading_output += reader.read([process_content(tc) for tc in tc_list],
                                      verbose=True)
    expected_output_len = len(tc_list)*len(readers)
    assert len(reading_output) == expected_output_len, \
        ("Not all text content successfully read."
         "Expected %d outputs, but got %d.") % (expected_output_len,
                                                len(reading_output))

    print("Test reading insert")
    rdb.upload_readings(reading_output, db=db)
    r_list = db.select_all(db.Readings)

    def is_complete_match(r_list, reading_output):
        # Every reading datum must match some record in the database.
        return all([any([rd.matches(r) for r in r_list])
                    for rd in reading_output])

    assert is_complete_match(r_list, reading_output), \
        "Not all reading output posted."

    # Re-uploading the same readings should be a no-op thanks to the
    # uniqueness constraints. Re-query the table after the second upload
    # (the old check reused the stale r_list, which could never reveal
    # duplicates) and verify that the row count is unchanged.
    num_readings = len(r_list)
    rdb.upload_readings(reading_output, db=db)
    r_list = db.select_all(db.Readings)
    assert len(r_list) == num_readings \
        and is_complete_match(r_list, reading_output), \
        "Uniqueness constraints failed."

    print("Test enrichment")
    assert all([rd.reading_id is None for rd in reading_output]), \
        "No readings should have reading_ids already."
    rdb._enrich_reading_data(reading_output, db=db)
    assert all([rd.reading_id is not None for rd in reading_output]),\
        "Some reading data objects didn't have reading_ids after enrichment."

    print("Test making statements")
    stmts = rdb.produce_statements(reading_output, db=db)
    assert len(stmts), 'No statements created.'
    db_stmts = db.select_all(db.Statements)
    assert len(db_stmts) == len(stmts), \
        "Only %d/%d statements added." % (len(db_stmts), len(stmts))
    assert len(db.select_all(db.Agents)), "No agents added."
# Pull the id list down from s3 and select this job's slice of it.
raw_id_text = id_list_obj['Body'].read().decode('utf8').strip()
batch_lines = raw_id_text.splitlines()[args.start_index:args.end_index]
# Shuffle so content types are spread evenly across the batch.
random.shuffle(batch_lines)
id_dict = get_id_dict([ln.strip() for ln in batch_lines])

# Read everything ========================================
outputs = produce_readings(id_dict, readers, verbose=True,
                           read_mode=args.mode,
                           force_fulltext=args.force_fulltext,
                           prioritize=(not args.read_all_fulltext))

# Preserve the sparser logs
sparser_logs = [fn for fn in os.listdir('.')
                if fn.startswith('sparser') and fn.endswith('log')]
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
sparser_log_dir = ('reading_results/%s/logs/run_db_reading_queue/'
                   'sparser_logs_%s/') % (args.basename, timestamp)
for fn in sparser_logs:
    s3_key = sparser_log_dir + fn
    logger.info("Saving sparser logs to %s on s3 in %s."
                % (s3_key, bucket_name))
    with open(fn, 'r') as f:
        client.put_object(Key=s3_key, Body=f.read(), Bucket=bucket_name)

# Convert the outputs to statements ==================================
produce_statements(outputs, n_proc=args.num_cores)
# NOTE(review): this span begins mid-call — keyword arguments continuing a
# produce_readings(...) invocation opened above this view.
read_mode=args.read_mode,
                           get_preexisting=(args.stmt_mode == 'all'),
                           force_fulltext=args.force_fulltext,
                           prioritize=args.use_best_fulltext, db=db)
# Record when reading finished, for the statistics report below.
ends['reading'] = datetime.now()

# Preserve the sparser logs
contents = os.listdir('.')
# Sparser writes log files into the working directory; collect them.
sparser_logs = [fname for fname in contents
                if fname.startswith('sparser') and fname.endswith('log')]
sparser_log_dir = s3_log_prefix + 'sparser_logs/'
for fname in sparser_logs:
    s3_key = sparser_log_dir + fname
    logger.info("Saving sparser logs to %s on s3 in %s."
                % (s3_key, bucket_name))
    with open(fname, 'r') as f:
        client.put_object(Key=s3_key, Body=f.read(), Bucket=bucket_name)

# Convert the outputs to statements ==================================
if args.stmt_mode != 'none':
    # Time the statement production step as well.
    starts['statement production'] = datetime.now()
    stmt_data = produce_statements(outputs, n_proc=args.num_cores, db=db)
    ends['statement production'] = datetime.now()
else:
    # Statement production was disabled; report with an empty result set.
    stmt_data = []

report_statistics(outputs, stmt_data, starts, ends, s3_log_prefix,
                  client, bucket_name)