Example #1
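# Note: this test's module-level imports are not shown in the snippet. It
# evidently relies on `pickle`, `path` (from `os`), a reading module aliased
# as `rdb`, and the helpers `get_db_with_pubmed_content`, `get_id_dict`, and
# `get_readers`.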
def test_produce_readings():
    "Comprehensive test of the high level production of readings."
    # Prep the inputs.
    db = get_db_with_pubmed_content()
    complete_tr_list = db.select_all(db.TextRef)
    id_dict = get_id_dict(complete_tr_list)

    # Test with just sparser for tolerable speeds.
    reader_list = get_readers('SPARSER')

    # Test the read_mode='none' option (should yield nothing, because there
    # aren't any readings yet).
    outputs_0 = rdb.produce_readings(id_dict, reader_list, verbose=True, db=db,
                                     read_mode='none')
    assert len(outputs_0) == 0

    # Test just getting a pickle file (nothing should be posted to the db).
    pkl_file = 'test_db_res.pkl'
    outputs_1 = rdb.produce_readings(id_dict, reader_list, verbose=True, db=db,
                                     no_upload=True, pickle_file=pkl_file)
    N_out = len(outputs_1)
    N_exp = len(reader_list)*db.filter_query(db.TextContent).count()
    assert N_out == N_exp, "Expected %d readings, got %d." % (N_exp, N_out)
    assert path.exists(pkl_file), "Pickle file not created."
    with open(pkl_file, 'rb') as f:
        N_pkl = len(pickle.load(f))
    assert N_pkl == N_exp, \
        "Expected %d readings in pickle, got %d." % (N_exp, N_pkl)
    N_readings = db.filter_query(db.Readings).count()
    assert N_readings == 0, \
        "There shouldn't be any readings yet, but found %d." % N_readings

    # Test reading and insert to the database.
    rdb.produce_readings(id_dict, reader_list, verbose=True, db=db)
    N_db = db.filter_query(db.Readings).count()
    assert N_db == N_exp, "Expected %d readings, got %d." % (N_exp, N_db)

    # Test reading again with the default read_mode (the existing readings
    # should be retrieved from the db, not regenerated).
    outputs_2 = rdb.produce_readings(id_dict, reader_list, verbose=True, db=db)
    assert len(outputs_2) == N_exp, \
        "Got %d readings, expected %d." % (len(outputs_2), N_exp)
    assert all([rd.reading_id is not None for rd in outputs_2])

    # Test with read_mode='none' again.
    outputs_3 = rdb.produce_readings(id_dict, reader_list, verbose=True, db=db,
                                     read_mode='none')
    assert len(outputs_3) == N_exp
    assert all([rd.reading_id is not None for rd in outputs_3])

    # Test read_mode='all' (everything is re-read; note that the fresh
    # outputs have not yet been assigned a reading_id).
    outputs_4 = rdb.produce_readings(id_dict, reader_list, verbose=True, db=db,
                                     read_mode='all')
    assert len(outputs_4) == N_exp
    assert all([rd.reading_id is None for rd in outputs_4])
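Distilled from the assertions above, here is a minimal sketch of the
pickle-only invocation. The import path and the `read_to_pickle` wrapper are
assumptions for illustration; the keyword arguments mirror those exercised by
the test.

import pickle

# Assumed import path -- the test's own imports are not shown.
from indra.tools.reading.db_reading import read_db as rdb


def read_to_pickle(id_dict, reader_list, db, pkl_file='test_db_res.pkl'):
    """Read the given ids and dump the results to a pickle, skipping the db."""
    outputs = rdb.produce_readings(id_dict, reader_list, verbose=True, db=db,
                                   no_upload=True, pickle_file=pkl_file)
    # Sanity check: the pickle should contain one entry per output.
    with open(pkl_file, 'rb') as f:
        assert len(pickle.load(f)) == len(outputs)
    return outputs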
Example #2
    def _run_reading(self, db, trids, max_refs=5000):
        if len(trids) > max_refs:
            raise ReadingUpdateError("Too many ids to run locally. Try "
                                     "running on batch (use_batch).")
        logger.info("Producing readings locally for %d new text refs." %
                    len(trids))
        base_dir = path.join(THIS_DIR, 'read_all_%s' % self.reader.name)
        reader_inst = self.reader(base_dir=base_dir, n_proc=self.n_proc)

        logger.info("Making readings...")
        outputs = rdb.produce_readings({'trid': trids}, [reader_inst],
                                       read_mode='unread_unread',
                                       db=db,
                                       prioritize=True,
                                       verbose=self.verbose)
        logger.info("Made %d readings." % len(outputs))
        logger.info("Making statements...")
        rdb.produce_statements(outputs, n_proc=self.n_proc, db=db)
        return
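Because `_run_reading` raises a ReadingUpdateError above `max_refs` ids, a
caller with a larger id list must either use batch or chunk the list itself.
A hypothetical caller-side sketch (`updater` stands in for an instance of the
class owning `_run_reading`, which is not shown):

def run_in_chunks(updater, db, trids, max_refs=5000):
    # Split the full trid list into pieces small enough for _run_reading.
    trid_list = list(trids)
    for i in range(0, len(trid_list), max_refs):
        updater._run_reading(db, trid_list[i:i + max_refs], max_refs=max_refs)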
Example #3
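        # (This snippet opens inside an `except` clause, evidently handling a
        # boto3 ClientError raised while fetching the PMID list from S3.)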
        if e.response['Error']['Code'] == 'NoSuchKey':
            logger.info('Could not find PMID list file at %s, exiting' %
                        id_list_key)
            sys.exit(1)
        else:
            # If there was some other kind of problem, re-raise the exception.
            raise
    # Get the content from the object
    id_list_str = id_list_obj['Body'].read().decode('utf8').strip()
    id_str_list = id_list_str.splitlines()[args.start_index:args.end_index]
    random.shuffle(id_str_list)
    id_dict = get_id_dict([line.strip() for line in id_str_list])

    # Read everything ========================================
    outputs = produce_readings(id_dict, readers, verbose=True,
                               read_mode=args.mode,
                               force_fulltext=args.force_fulltext,
                               prioritize=(not args.read_all_fulltext))

    # Preserve the sparser logs
    contents = os.listdir('.')
    sparser_logs = [fname for fname in contents
                    if fname.startswith('sparser') and fname.endswith('log')]
    sparser_log_dir = ('reading_results/%s/logs/run_db_reading_queue/'
                       'sparser_logs_%s/') % (
                           args.basename,
                           datetime.now().strftime('%Y%m%d_%H%M%S')
                           )
    for fname in sparser_logs:
        s3_key = sparser_log_dir + fname
        logger.info("Saving sparser logs to %s on s3 in %s."
                    % (s3_key, bucket_name))
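The snippet ends before the upload itself; Example #4 below contains the
matching `put_object` call. For completeness, a sketch of the full loop body,
assuming `client` is a boto3 S3 client as the surrounding code suggests:

    for fname in sparser_logs:
        s3_key = sparser_log_dir + fname
        logger.info("Saving sparser logs to %s on s3 in %s."
                    % (s3_key, bucket_name))
        with open(fname, 'r') as f:
            client.put_object(Key=s3_key, Body=f.read(), Bucket=bucket_name)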
Example #4
    # Get a handle for the database
    if args.test:
        from indra.db import util as dbu
        db = dbu.get_test_db()
    else:
        db = None

    s3_log_prefix = ('reading_results/%s/logs/run_db_reading_queue/%s/'
                     % (args.basename, args.job_name))

    # Read everything ========================================
    starts['reading'] = datetime.now()
    outputs = produce_readings(id_dict, readers, verbose=True,
                               read_mode=args.read_mode,
                               get_preexisting=(args.stmt_mode == 'all'),
                               force_fulltext=args.force_fulltext,
                               prioritize=args.use_best_fulltext, db=db)
    ends['reading'] = datetime.now()

    # Preserve the sparser logs
    contents = os.listdir('.')
    sparser_logs = [fname for fname in contents
                    if fname.startswith('sparser') and fname.endswith('log')]
    sparser_log_dir = s3_log_prefix + 'sparser_logs/'
    for fname in sparser_logs:
        s3_key = sparser_log_dir + fname
        logger.info("Saving sparser logs to %s on s3 in %s."
                    % (s3_key, bucket_name))
        with open(fname, 'r') as f:
            client.put_object(Key=s3_key, Body=f.read(),
                              Bucket=bucket_name)
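Not shown above: the `starts`/`ends` datetimes recorded around the reading
step lend themselves to a simple elapsed-time report. A hypothetical
follow-up:

    reading_time = ends['reading'] - starts['reading']
    logger.info("produce_readings took %s." % reading_time)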