Example 1
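These nine snippets come from a single test module and lean on shared setup that is not shown. A minimal sketch of what they assume follows; the import paths and fixture bodies are guesses reconstructed from the names used in the snippets, not the real implementation:

import pickle
import random
from os import path

# Assumed import path: 'rdb' plainly refers to a database-reading module
# (it provides run_reading, DatabaseReader, make_statements, and
# process_content below), but its exact location is a guess.
from indra_db.reading import read_db as rdb
from indra.tools.reading.readers import SparserReader  # also a guess

def get_db_with_pubmed_content():
    # Hypothetical fixture: return a test database pre-loaded with
    # PubMed text content. The real body lives in the test module.
    raise NotImplementedError

def get_readers(*names):
    # Hypothetical fixture: return reader instances, optionally
    # restricted by name (e.g. 'SPARSER').
    raise NotImplementedError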
def test_stmt_mode_unread():
    "Test whether we can only create statements from unread content."
    # Prep the inputs.
    db = get_db_with_pubmed_content()
    tcids = {tcid for tcid, in db.select_all(db.TextContent.id)}

    # Test with just sparser for tolerable speeds.
    readers = get_readers('SPARSER')

    # First create some readings.
    # random.sample needs a sequence (sets are rejected on Python 3.11+).
    some_tcids = random.sample(sorted(tcids), len(tcids) // 2)
    workers0 = rdb.run_reading(readers, some_tcids, db=db, verbose=True)
    pre_stmt_hash_set = {
        sd.statement.get_hash(shallow=False)
        for sd in workers0[0].statement_outputs
    }

    # Now only make statements for the content that was not read.
    workers = rdb.run_reading(readers,
                              tcids,
                              db=db,
                              verbose=True,
                              stmt_mode='unread')
    stmt_hash_set = {
        sd.statement.get_hash(shallow=False)
        for sd in workers[0].statement_outputs
    }
    assert stmt_hash_set.isdisjoint(pre_stmt_hash_set), \
        "There were overlapping statements."
Example 2
def test_multiproc_statements():
    "Test the multiprocessing creation of statements."
    db = get_db_with_pubmed_content()
    readers = get_readers()
    tcids = {tcid for tcid, in db.select_all(db.TextContent.id)}
    workers = rdb.run_reading(readers, tcids, db=db)
    assert not any(worker.extant_readings for worker in workers)
    outputs = [rd for worker in workers for rd in worker.new_readings]
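    # The second argument to make_statements below is presumably the
    # process count, given this test's multiprocessing focus (an
    # assumption; the signature is not shown in these snippets).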
    stmts = rdb.make_statements(outputs, 2)
    assert len(stmts)
Example 3
def test_sparser_parallel_one_batch():
    "Test that sparser runs with multiple procs with batches of 1."
    db = get_db_with_pubmed_content()
    sparser_reader = SparserReader(n_proc=2)
    tc_list = db.select_all(db.TextContent)
    result = sparser_reader.read([rdb.process_content(tc) for tc in tc_list],
                                 verbose=True, n_per_proc=1)
    N_exp = len(tc_list)
    N_res = len(result)
    assert N_exp == N_res, \
        "Expected to get %d results, but got %d." % (N_exp, N_res)
Example 4
def test_sparser_parallel():
    "Test running sparser in parallel."
    db = get_db_with_pubmed_content()
    sparser_reader = SparserReader(n_proc=2)
    tc_list = db.select_all(db.TextContent)
    result = sparser_reader.read([rdb.process_content(tc) for tc in tc_list],
                                 verbose=True, log=True)
    N_exp = len(tc_list)
    N_res = len(result)
    assert N_exp == N_res, \
        "Expected to get %d results, but got %d." % (N_exp, N_res)
Example 5
def test_multi_batch_run():
    "Test that reading works properly with multiple batches run."
    db = get_db_with_pubmed_content()
    readers = get_readers()
    tcids = {tcid for tcid, in db.select_all(db.TextContent.id)}
    rdb.run_reading(readers, tcids, batch_size=len(tcids)//2, db=db,
                    stmt_mode='none')

    # This should catch any repeated readings.
    num_readings = db.filter_query(db.Reading).count()

    # NOTE: This might need some special consideration given the new TRIPS
    # reader, which only reads titles
    num_expected = len(readers)*len(tcids)
    assert num_readings == num_expected, \
        "Expected %d readings, only found %d." % (num_expected, num_readings)
Example 6
def test_read_db():
    "Test the low level make_db_readings functionality with various settings."
    # Prep the inputs.
    db = get_db_with_pubmed_content()
    tcids = {tcid for tcid, in db.select_all(db.TextContent.id)}
    reader = get_readers('SPARSER')[0]

    # Run the reading with the default batch size and reading_mode set to
    # 'unread'.
    worker1 = rdb.DatabaseReader(tcids, reader, db=db, reading_mode='unread')
    worker1.get_readings()
    N1 = len(worker1.new_readings)
    N1_exp = len(tcids)
    assert N1 == N1_exp, \
        'Expected %d readings, but got %d.' % (N1_exp, N1)
    worker1.dump_readings_to_db()
    N1_db = len(db.select_all(db.Reading))
    assert N1_db == N1, \
        'Expected %d readings to be copied to db, only %d found.' % (N1, N1_db)

    # Run the reading with the default batch size and reading_mode set to
    # 'all'. (This should produce new readings.)
    reader.reset()
    worker2 = rdb.DatabaseReader(tcids, reader, db=db, reading_mode='all')
    worker2.get_readings()

    N2_old = len(worker2.extant_readings)
    N2_new = len(worker2.new_readings)
    print(N2_old, N2_new, N1, N1_db)
    assert N2_old == 0,\
        "Got %d old readings despite reading_mode set to 'all'." % N2_old
    assert N1 == N2_new, \
        "Got %d readings from run 1 but %d from run 2." % (N1, N2_new)

    # Run the reading with the default batch size and reading_mode set to
    # 'unread' again. (This should NOT produce new readings.)
    reader.reset()
    worker3 = rdb.DatabaseReader(tcids, reader, db=db, reading_mode='unread')
    worker3.get_readings()

    N_new = len(worker3.new_readings)
    N_old = len(worker3.extant_readings)

    assert N_new == 0,\
        "Got new readings when reading_mode was 'unread' and readings existed."
    assert N_old == N1, \
        ("Missed old readings when reading_mode was 'unread' and readings "
         "existed: expected %d, but got %d." % (N1, N_old))
Example 7
def test_reading_content_insert():
    "Test the content primary through-put of make_db_readings."
    db = get_db_with_pubmed_content()

    print("Test reading")
    tcids = {tcid for tcid, in db.select_all(db.TextContent.id)}
    readers = get_readers()
    workers = [
        rdb.DatabaseReader(tcids, reader, verbose=True, db=db)
        for reader in readers
    ]
    reading_output = []
    for worker in workers:
        worker.get_readings()

        expected_output_len = len(tcids)
        N_new = len(worker.new_readings)
        reading_output.extend(worker.new_readings)
        assert N_new == expected_output_len, \
            ("Not all text content successfully read by %s."
             "Expected %d outputs, but got %d."
             % (worker.reader.name, expected_output_len, N_new))

    print("Test reading insert")
    for worker in workers:
        worker.dump_readings_to_db()
    r_list = db.select_all(db.Reading)

    def is_complete_match(r_list, reading_output):
        # True only if every reading output matches a reading in the db.
        return all(any(rd.matches(r) for r in r_list)
                   for rd in reading_output)

    assert is_complete_match(r_list, reading_output), \
        "Not all reading output posted."

    print("Test making statements")
    num_stmts = 0
    for worker in workers:
        worker.get_statements()
        num_stmts += len(worker.statement_outputs)

        worker.dump_statements_to_db()

    num_db_sids = db.count(db.RawStatements.id)
    assert num_db_sids == num_stmts, \
        "Only %d/%d statements added." % (num_db_sids, num_stmts)
    assert len(db.select_all(db.RawAgents)), "No agents added."
Example 8
def test_get_content():
    "Test that content queries are correctly formed."
    db = get_db_with_pubmed_content()
    tcids = {tcid for tcid, in db.select_all(db.TextContent.id)}
    readers = get_readers()
    for reader in readers:
        worker = rdb.DatabaseReader(tcids, reader, db=db)

        N_exp = db.filter_query(db.TextContent).count()
        N_1 = sum(1 for _ in worker.iter_over_content())
        assert N_1 == N_exp,\
            "Expected %d results in our query, got %d." % (N_exp, N_1)

        # Test the response to an empty ID list.
        worker = rdb.DatabaseReader([], reader, db=db)
        assert not any(worker.iter_over_content()), \
            "Expected no results when passing no ids."
Example 9
def test_produce_readings():
    "Comprehensive test of the high level production of readings."
    # Prep the inputs.
    db = get_db_with_pubmed_content()
    tcids = {tcid for tcid, in db.select_all(db.TextContent.id)}

    # Test with just sparser for tolerable speeds.
    readers = get_readers('SPARSER')

    # Test the reading_mode='none' option (should yield nothing, because
    # there aren't any readings yet).
    workers = rdb.run_reading(readers, tcids, verbose=True, db=db,
                              reading_mode='none', stmt_mode='none')
    assert all(len(worker.new_readings) == 0 for worker in workers)
    assert all(len(worker.extant_readings) == 0 for worker in workers)

    # Test just getting a pickle file (nothing should be posted to the db).
    pkl_file = 'test_db_res.pkl'
    workers = rdb.run_reading(readers, tcids, verbose=True, db=db,
                              upload_readings=False, reading_pickle=pkl_file)
    N_new = len(workers[0].new_readings)
    N_old = len(workers[0].extant_readings)
    N_exp = len(readers)*len(tcids)
    assert N_new == N_exp, "Expected %d readings, got %d." % (N_exp, N_new)
    assert N_old == 0, "Found old readings, when there should be none."
    assert path.exists(pkl_file), "Pickle file not created."
    with open(pkl_file, 'rb') as f:
        N_pkl = len(pickle.load(f))
    assert N_pkl == N_exp, \
        "Expected %d readings in pickle, got %d." % (N_exp, N_pkl)
    N_readings = db.filter_query(db.Reading).count()
    assert N_readings == 0, \
        "There shouldn't be any readings yet, but found %d." % N_readings

    # Test reading and insert to the database.
    rdb.run_reading(readers, tcids, verbose=True, db=db)
    N_db = db.filter_query(db.Reading).count()
    assert N_db == N_exp, "Expected %d readings, got %d." % (N_exp, N_db)

    # Test reading again without reading_mode='all' ('unread' is the default).
    workers = rdb.run_reading(readers, tcids, verbose=True, db=db)
    N_old = len(workers[0].extant_readings)
    N_new = len(workers[0].new_readings)
    assert N_old == N_exp, \
        "Got %d old readings, expected %d." % (N_old, N_exp)
    assert N_new == 0, \
        "Got %d new readings, when none should have been read." % N_new
    assert all([rd.reading_id is not None
                for rd in workers[0].extant_readings])

    # Test with reading_mode='none' again.
    workers = rdb.run_reading(readers, tcids, verbose=True, db=db,
                              reading_mode='none')
    N_old = len(workers[0].extant_readings)
    assert N_old == N_exp
    assert all([rd.reading_id is not None
                for rd in workers[0].extant_readings])

    # Test reading_mode='all'.
    workers = rdb.run_reading(readers, tcids, verbose=True, db=db,
                              reading_mode='all')
    old = workers[0].extant_readings
    new = workers[0].new_readings
    assert len(new) == N_exp
    assert len(old) == 0
    assert all([rd.reading_id is not None for rd in new])