Example #1
def test_stmt_mode_unread():
    "Test whether we can only create statements from unread content."
    # Prep the inputs.
    db = get_db_with_pubmed_content()
    tcids = {tcid for tcid, in db.select_all(db.TextContent.id)}

    # Test with just sparser for tolerable speeds.
    readers = get_readers('SPARSER')

    # First create some readings.
    some_tcids = random.sample(list(tcids), len(tcids) // 2)
    workers0 = rdb.run_reading(readers, some_tcids, db=db, verbose=True)
    pre_stmt_hash_set = {
        sd.statement.get_hash(shallow=False)
        for sd in workers0[0].statement_outputs
    }

    # Now only make statements for the content that was not read.
    workers = rdb.run_reading(readers,
                              tcids,
                              db=db,
                              verbose=True,
                              stmt_mode='unread')
    stmt_hash_set = {
        sd.statement.get_hash(shallow=False)
        for sd in workers[0].statement_outputs
    }
    assert stmt_hash_set.isdisjoint(pre_stmt_hash_set), \
        "There were overlapping statements."
Example #2
    def _run_reading(self, db, tcids, reader_name):
        ids_per_job = 5000
        if len(tcids) > ids_per_job:
            raise ReadingUpdateError("Too many IDs to run locally. Try "
                                     "running on batch (use_batch).")
        logger.info("Producing readings locally for %d new text refs."
                    % len(tcids))
        base_dir = path.join(THIS_DIR, 'read_all_%s' % reader_name)
        readers = rdb.construct_readers([reader_name], base_dir=base_dir,
                                        n_proc=self.n_proc)

        rdb.run_reading(readers, tcids, db=db, batch_size=ids_per_job,
                        verbose=self.verbose)
        return
Example #3
def test_multi_batch_run():
    "Test that reading works properly with multiple batches run."
    db = get_db_with_pubmed_content()
    readers = get_readers()
    tcids = {tcid for tcid, in db.select_all(db.TextContent.id)}
    rdb.run_reading(readers, tcids, batch_size=len(tcids)//2, db=db,
                    stmt_mode='none')

    # This should catch any repeated readings.
    num_readings = db.filter_query(db.Reading).count()

    # NOTE: This might need some special consideration given the new TRIPS
    # reader, which only reads titles
    num_expected = len(readers)*len(tcids)
    assert num_readings == num_expected, \
        "Expected %d readings, only found %d." % (num_expected, num_readings)
Example #4
def test_multiproc_statements():
    "Test the multiprocessing creation of statements."
    db = get_db_with_pubmed_content()
    readers = get_readers()
    tcids = {tcid for tcid, in db.select_all(db.TextContent.id)}
    workers = rdb.run_reading(readers, tcids, db=db)
    assert not any(worker.extant_readings for worker in workers)
    outputs = [rd for worker in workers for rd in worker.new_readings]
    stmts = rdb.make_statements(outputs, 2)
    assert len(stmts)
Example #5
def test_produce_readings():
    "Comprehensive test of the high level production of readings."
    # Prep the inputs.
    db = get_db_with_pubmed_content()
    tcids = {tcid for tcid, in db.select_all(db.TextContent.id)}

    # Test with just sparser for tolerable speeds.
    readers = get_readers('SPARSER')

    # Test the reading_mode='none' option (it should yield nothing, because
    # there are no readings yet).
    workers = rdb.run_reading(readers, tcids, verbose=True, db=db,
                              reading_mode='none', stmt_mode='none')
    assert all(len(worker.new_readings) == 0 for worker in workers)
    assert all(len(worker.extant_readings) == 0 for worker in workers)

    # Test just getting a pickle file (nothing should be posted to the db).
    pkl_file = 'test_db_res.pkl'
    workers = rdb.run_reading(readers, tcids, verbose=True, db=db,
                              upload_readings=False, reading_pickle=pkl_file)
    N_new = len(workers[0].new_readings)
    N_old = len(workers[0].extant_readings)
    N_exp = len(readers)*len(tcids)
    assert N_new == N_exp, "Expected %d readings, got %d." % (N_exp, N_new)
    assert N_old == 0, "Found old readings, when there should be none."
    assert path.exists(pkl_file), "Pickle file not created."
    with open(pkl_file, 'rb') as f:
        N_pkl = len(pickle.load(f))
    assert N_pkl == N_exp, \
        "Expected %d readings in pickle, got %d." % (N_exp, N_pkl)
    N_readings = db.filter_query(db.Reading).count()
    assert N_readings == 0, \
        "There shouldn't be any readings yet, but found %d." % N_readings

    # Test reading and insert to the database.
    rdb.run_reading(readers, tcids, verbose=True, db=db)
    N_db = db.filter_query(db.Reading).count()
    assert N_db == N_exp, "Expected %d readings, got %d." % (N_exp, N_db)

    # Test reading again without reading_mode='all' (the default is 'unread').
    workers = rdb.run_reading(readers, tcids, verbose=True, db=db)
    N_old = len(workers[0].extant_readings)
    N_new = len(workers[0].new_readings)
    assert N_old == N_exp, \
        "Got %d old readings, expected %d." % (N_old, N_exp)
    assert N_new == 0, \
        "Got %d new readings, when none should have been read." % N_new
    assert all([rd.reading_id is not None
                for rd in workers[0].extant_readings])

    # Test with reading_mode='none' again.
    workers = rdb.run_reading(readers, tcids, verbose=True, db=db,
                              reading_mode='none')
    N_old = len(workers[0].extant_readings)
    assert N_old == N_exp
    assert all([rd.reading_id is not None
                for rd in workers[0].extant_readings])

    # Test reading_mode='all'.
    workers = rdb.run_reading(readers, tcids, verbose=True, db=db,
                              reading_mode='all')
    old = workers[0].extant_readings
    new = workers[0].new_readings
    assert len(new) == N_exp
    assert len(old) == 0
    assert all([rd.reading_id is not None for rd in new])
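The assertions above also pin down the reading_mode values: 'none' only loads readings already in the database, the default 'unread' reads only content without a prior reading, and 'all' re-reads everything. A minimal sketch of the three calls, reusing the readers, tcids, and db names from this example (comments inferred from the test's assertions):

# Only fetch readings already in the database; nothing new is read.
rdb.run_reading(readers, tcids, db=db, reading_mode='none')

# Default reading_mode='unread': read only content with no existing reading.
rdb.run_reading(readers, tcids, db=db)

# Re-read everything, regardless of existing readings.
rdb.run_reading(readers, tcids, db=db, reading_mode='all')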
Example #6
def main():
    arg_parser = get_parser()
    args = arg_parser.parse_args()

    s3 = boto3.client('s3')
    s3_log_prefix = get_s3_job_log_prefix(args.s3_base, args.job_name)
    logger.info("Using log prefix \"%s\"" % s3_log_prefix)
    id_list_key = args.s3_base + 'id_list'
    logger.info("Looking for id list on s3 at \"%s\"" % id_list_key)
    try:
        id_list_obj = s3.get_object(Bucket=bucket_name, Key=id_list_key)
    except botocore.exceptions.ClientError as e:
        # Handle a missing object gracefully
        if e.response['Error']['Code'] == 'NoSuchKey':
            logger.info('Could not find id list file at %s, exiting' %
                        id_list_key)
            sys.exit(1)
        # If there was some other kind of problem, re-raise the exception
        else:
            raise e

    # Get the content from the object
    id_list_str = id_list_obj['Body'].read().decode('utf8').strip()
    id_str_list = id_list_str.splitlines()[args.start_index:args.end_index]
    random.shuffle(id_str_list)
    tcids = [int(line.strip()) for line in id_str_list]

    # Get the reader objects
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    kwargs = {'base_dir': args.out_dir, 'n_proc': args.num_cores}
    readers = construct_readers(args.readers, **kwargs)

    # Record the reader versions used in this run.
    reader_versions = {}
    for reader in readers:
        reader_versions[reader.name] = reader.get_version()
    s3.put_object(Bucket=bucket_name,
                  Key=get_s3_reader_version_loc(args.s3_base, args.job_name),
                  Body=json.dumps(reader_versions))

    # Some combinations of options don't make sense:
    forbidden_combos = [('all', 'unread'), ('none', 'unread'),
                        ('none', 'none')]
    assert (args.read_mode, args.rslt_mode) not in forbidden_combos, \
        ("The combination of reading mode %s and result mode %s is not "
         "allowed." % (args.read_mode, args.rslt_mode))

    # Get a handle for the database
    if args.test:
        from indra_db.tests.util import get_temp_db
        db = get_temp_db(clear=True)
    else:
        db = None

    # Read everything ========================================
    if args.batch is None:
        run_reading(readers,
                    tcids,
                    verbose=True,
                    db=db,
                    reading_mode=args.read_mode,
                    rslt_mode=args.rslt_mode)
    else:
        for tcid_batch in batch_iter(tcids, args.batch):
            run_reading(readers,
                        tcid_batch,
                        verbose=True,
                        db=db,
                        reading_mode=args.read_mode,
                        rslt_mode=args.rslt_mode)

    # Preserve the sparser logs
    contents = os.listdir('.')
    logger.info("Checking for any log files to cache:\n" + '\n'.join(contents))
    sparser_logs = []
    trips_logs = []
    for fname in contents:
        # Check if this file is a sparser log
        if fname.startswith('sparser') and fname.endswith('log'):
            sparser_logs.append(fname)
        elif is_trips_datestring(fname):
            for sub_fname in os.listdir(fname):
                if sub_fname.endswith('.log') or sub_fname.endswith('.err'):
                    trips_logs.append(os.path.join(fname, sub_fname))

    _dump_logs_to_s3(s3, s3_log_prefix, 'sparser', sparser_logs)
    _dump_logs_to_s3(s3, s3_log_prefix, 'trips', trips_logs)
    return
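Across these examples the same pipeline recurs: gather text content ids from the database, construct reader objects, run the readings, and turn the new readings into statements. The sketch below strings those steps together; the import path for rdb is an assumption (the snippets only show it as an already-imported alias), and the process-count argument to make_statements mirrors the 2 passed in Example #4.

# Assumed import path for the module aliased as `rdb` in the examples above.
from indra_db.reading import read_db as rdb


def read_and_extract(db, reader_names, base_dir='readings', n_proc=1):
    """Read all text content in `db` and return the extracted statements."""
    # Gather the ids of all text content in the database.
    tcids = {tcid for tcid, in db.select_all(db.TextContent.id)}

    # Build reader objects for the requested readers, e.g. ['SPARSER'],
    # following the keyword usage in Examples #2 and #6.
    readers = rdb.construct_readers(reader_names, base_dir=base_dir,
                                    n_proc=n_proc)

    # Read the content; with the default reading_mode='unread', only content
    # without an existing reading is processed.
    workers = rdb.run_reading(readers, tcids, db=db, verbose=True)

    # Turn the new readings into statements (second argument as in Example #4,
    # presumably a process count given that test's multiprocessing focus).
    outputs = [rd for worker in workers for rd in worker.new_readings]
    return rdb.make_statements(outputs, n_proc)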