Beispiel #1
0
def main():
    """Parse command-line arguments and launch a bulk reading job.

    Chooses a database (temporary test DB, primary, or a named one),
    builds the bulk reading manager for the selected method ('local' or
    'aws'), and dispatches the requested task ('read_all' or 'read_new').
    """
    parser = get_parser()
    args = parser.parse_args()

    # Resolve the database handle. A test run against a non-test database
    # name is redirected to a temporary database for safety.
    if args.test:
        if 'test' not in args.database:
            from indra_db.tests.util import get_temp_db
            db = get_temp_db()
        else:
            db = get_db(args.database)
    elif args.database == 'primary':
        db = get_primary_db()
    else:
        db = get_db(args.database)

    readers = ['SPARSER', 'REACH', 'TRIPS', 'ISI', 'EIDOS', 'MTI']
    if args.method == 'local':
        bulk_manager = BulkLocalReadingManager(readers,
                                               buffer_days=args.buffer,
                                               n_procs=args.num_procs)
    elif args.method == 'aws':
        bulk_manager = BulkAwsReadingManager(readers,
                                             buffer_days=args.buffer,
                                             project_name=args.project_name)
    else:
        # Raise an explicit error instead of `assert False`, which is
        # silently stripped when Python runs with -O.
        raise ValueError("Invalid reading method: %s" % args.method)

    if args.task == 'read_all':
        bulk_manager.read_all(db)
    elif args.task == 'read_new':
        bulk_manager.read_new(db)
    return
def test_lazy_copier_unique_constraints():
    """Check that copy_lazy respects the manuscript_id unique constraint.

    A vanilla copy of overlapping manuscript ids must fail, while
    copy_lazy must succeed and leave no duplicate manuscript ids behind.
    """
    db = get_temp_db(clear=True)

    N = int(10**5)
    S = int(10**8)
    fake_mids_a = {('man-' + str(random.randint(0, S)), ) for _ in range(N)}
    fake_mids_b = {('man-' + str(random.randint(0, S)), ) for _ in range(N)}

    # The two random samples are expected to overlap (sampling 2*10^5 ids
    # from a space of 10^8 makes a collision overwhelmingly likely).
    assert len(fake_mids_a | fake_mids_b) < len(fake_mids_a) + len(fake_mids_b)

    start = datetime.now()
    db.copy('text_ref', fake_mids_a, ('manuscript_id', ))
    print("First load:", datetime.now() - start)

    # The vanilla copy must fail. Track the outcome with a flag: putting
    # `assert False` inside the try block would let `except Exception`
    # swallow the AssertionError and silently pass the test.
    vanilla_failed = False
    try:
        db.copy('text_ref', fake_mids_b, ('manuscript_id', ))
    except Exception:
        # Expected: the unique constraint fired; reset the connection.
        db._conn.rollback()
        vanilla_failed = True
    assert vanilla_failed, "Vanilla copy succeeded when it should have failed."

    start = datetime.now()
    db.copy_lazy('text_ref', fake_mids_b, ('manuscript_id', ))
    print("Lazy copy:", datetime.now() - start)

    mid_results = [mid for mid, in db.select_all(db.TextRef.manuscript_id)]
    assert len(mid_results) == len(set(mid_results)), \
        (len(mid_results), len(set(mid_results)))

    return
Beispiel #3
0
def test_lazy_report_copy():
    """Verify copy_report_lazy merges new rows and reports skipped ones."""
    db = get_temp_db(True)
    initial_refs = _do_init_copy(db)
    new_refs = {('b', '2'), ('c', '1'), ('d', '3')}

    skipped = db.copy_report_lazy('text_ref', new_refs, COLS)

    # Every distinct row must end up in the table exactly once...
    _assert_set_equal(initial_refs | new_refs, _ref_set(db))
    # ...and the overlap must be reported back as left out.
    _assert_set_equal(initial_refs & new_refs, {row[:2] for row in skipped})
Beispiel #4
0
def test_insert_and_query_pmid():
    """Insert a text_ref and make sure the same row is read back."""
    db = get_temp_db()
    pmid = '1234'
    trid = db.insert('text_ref', pmid=pmid)

    matches = db.select_all('text_ref', db.TextRef.pmid == pmid)

    assert_equal(len(matches), 1, "One item inserted, multiple entries found.")
    found = matches[0]
    assert_equal(found.pmid, pmid)
    assert_equal(found.id, trid, "Got back wrong text_ref_id.")
Beispiel #5
0
def get_test_db_with_pubmed_content(with_pm=False):
    """Return a temp database loaded with sample pubmed content.

    If `with_pm` is True, the Pubmed manager used for population is
    returned alongside the database.
    """
    db = get_temp_db(clear=True)
    pm = Pubmed(ftp_url=get_test_ftp_url(), local=True)
    pm.populate(db)
    return (db, pm) if with_pm else db
Beispiel #6
0
def test_lazy_copy():
    """Check that copy_lazy adds only the rows not already present."""
    db = get_temp_db(True)
    first_batch = {('a', '1'), ('b', '2')}
    second_batch = {('b', '2'), ('c', '1'), ('d', '3')}

    # Seed the table with the first batch.
    db.copy('text_ref', first_batch, COLS)
    _assert_set_equal(first_batch, _ref_set(db))

    # The lazy copy should silently skip the duplicate ('b', '2').
    db.copy_lazy('text_ref', second_batch, COLS)
    _assert_set_equal(first_batch | second_batch, _ref_set(db))
Beispiel #7
0
def test_multible_pmc_oa_content():
    """Test to make sure repeated content is handled correctly."""
    db = get_temp_db()
    pmc = PmcOA(ftp_url=get_test_ftp_url(), local=True)

    # Populate twice; the second pass must not add duplicate rows.
    pmc.populate(db)
    count_after_first = len(db.select_all('text_content'))
    pmc.populate(db)
    count_after_second = len(db.select_all('text_content'))

    assert count_after_second == count_after_first, \
        "Duplicate text content allowed to be submitted."
Beispiel #8
0
def test_multiple_pmids():
    """Test that pre-existing pmids are correctly handled."""
    db = get_temp_db()
    med = Pubmed(ftp_url=get_test_ftp_url(), local=True)

    # Load the same pmid set twice; the ref count must not change.
    med.populate(db)
    ref_count = len(db.select_all('text_ref'))
    med.populate(db)

    assert len(db.select_all('text_ref')) == ref_count, \
        "Duplicate pmids allowed to be submitted.."
Beispiel #9
0
def test_uniqueness_text_ref_url():
    """Test whether the uniqueness imposed on the url of text_refs is enforced."""
    db = get_temp_db()
    url = 'http://foobar.com'
    db.insert('text_ref', url=url)

    # A second insert with the same url must raise an IntegrityError.
    duplicate_allowed = True
    try:
        db.insert('text_ref', url=url)
    except IntegrityError:
        duplicate_allowed = False
    assert not duplicate_allowed, "Uniqueness was not enforced."
Beispiel #10
0
def test_vanilla_copy():
    """A plain copy must insert rows once and fail on exact duplicates."""
    db = get_temp_db(True)
    inps = {('a', '1'), ('b', '1')}
    db.copy('text_ref', inps, COLS)
    assert inps == _ref_set(db)

    try:
        db.copy('text_ref', inps, COLS)
    except Exception:
        # Expected: the duplicate copy is rejected. Catch Exception, not a
        # bare `except:`, so SystemExit/KeyboardInterrupt still propagate.
        return
    assert False, "Copy of duplicate data succeeded."
Beispiel #11
0
def test_detailed_copy_report():
    """Check the three-part report from copy_detailed_report_lazy."""
    db = get_temp_db(True)
    initial = _do_init_copy(db)
    additions = {('b', '2'), ('c', '1'), ('d', '3')}

    ids_before = {trid for trid, in db.select_all(db.TextRef.id)}

    existing_ids, new_ids, skipped_rows = \
        db.copy_detailed_report_lazy('text_ref', additions, COLS)

    _assert_set_equal(initial | additions, _ref_set(db))
    _assert_set_equal(initial & additions, {row[:2] for row in skipped_rows})
    # The ids of newly created rows must differ from those already present.
    assert {trid for trid, in new_ids} != ids_before
Beispiel #12
0
def test_push_copy():
    """copy_push should overwrite rows that already exist."""
    db = get_temp_db(True)
    initial = _do_init_copy(db)
    additions = {('b', '2'), ('c', '1'), ('d', '3')}

    date_before = db.select_one(db.TextRef.create_date,
                                db.TextRef.pmid == 'b')

    db.copy_push('text_ref', additions, COLS)

    _assert_set_equal(initial | additions, _ref_set(db))
    date_after = db.select_one(db.TextRef.create_date, db.TextRef.pmid == 'b')
    assert date_after != date_before, "PMID b was not updated."
Beispiel #13
0
def test_uniqueness_text_ref_doi_pmid():
    """Test uniqueness enforcement behavior for text_ref insertion."""
    db = get_temp_db()
    ref_data = dict(doi='foo/1234', pmid='1234')
    db.insert('text_ref', **ref_data)
    try:
        # Re-inserting the identical (doi, pmid) pair must be rejected.
        db.insert('text_ref', **ref_data)
    except IntegrityError:
        return  # PASS
    finally:
        # Clean up whether or not the constraint fired.
        db._clear(force=True)
    assert False, "Uniqueness was not enforced."
Beispiel #14
0
def _check_kbm(Kb, *args, **kwargs):
    """Upload a knowledgebase via its manager and sanity-check the result."""
    db = get_temp_db(clear=True)

    # The knowledgebase must not be registered before the upload.
    assert db.select_one(db.DBInfo.id, db.DBInfo.db_name == Kb.name) is None

    kbm = Kb(*args, **kwargs)
    kbm.upload(db)

    # After upload the db_info row must exist...
    dbid = db.select_one(db.DBInfo.id, db.DBInfo.db_name == Kb.name)[0]
    assert dbid is not None

    # ...and every raw statement must point back at it.
    db_stmts = db.select_all(db.RawStatements)
    print(len(db_stmts))
    assert len(db_stmts)
    assert all(s.db_info_id == dbid for s in db_stmts)
    db.session.close()
def test_db_lazy_insert():
    """copy_lazy must add only new ids, leaving existing rows untouched."""
    db = get_temp_db(clear=True)

    N = int(10**5)
    S = int(10**8)
    fake_pmids_a = {(i, str(random.randint(0, S))) for i in range(N)}
    fake_pmids_b = {(int(N / 2 + i), str(random.randint(0, S)))
                    for i in range(N)}

    # Rows from batch a win wherever ids overlap, because copy_lazy leaves
    # pre-existing rows in place.
    expected = {id: pmid for id, pmid in fake_pmids_b}
    for id, pmid in fake_pmids_a:
        expected[id] = pmid

    start = datetime.now()
    db.copy('text_ref', fake_pmids_a, ('id', 'pmid'))
    print("First load:", datetime.now() - start)

    # The vanilla copy of overlapping ids must fail. Track the outcome with
    # a flag: putting `assert False` inside the try block would let
    # `except Exception` swallow the AssertionError and pass the test.
    vanilla_failed = False
    try:
        db.copy('text_ref', fake_pmids_b, ('id', 'pmid'))
    except Exception:
        db._conn.rollback()
        vanilla_failed = True
    assert vanilla_failed, "Vanilla copy succeeded when it should have failed."

    # Try adding more text refs lazily. Overlap is guaranteed.
    start = datetime.now()
    db.copy_lazy('text_ref', fake_pmids_b, ('id', 'pmid'))
    print("Lazy copy:", datetime.now() - start)

    refs = db.select_all([db.TextRef.id, db.TextRef.pmid])
    result = {id: pmid for id, pmid in refs}
    assert result.keys() == expected.keys()
    passed = True
    for id, pmid in expected.items():
        if result[id] != pmid:
            print(id, pmid)
            passed = False
    assert passed, "Result did not match expected."

    # As a benchmark, see how long this takes the "old fashioned" way.
    db._clear(force=True)
    start = datetime.now()
    db.copy('text_ref', fake_pmids_a, ('id', 'pmid'))
    print('Second load:', datetime.now() - start)

    start = datetime.now()
    current_ids = {trid for trid, in db.select_all(db.TextRef.id)}
    clean_fake_pmids_b = {t for t in fake_pmids_b if t[0] not in current_ids}
    db.copy('text_ref', clean_fake_pmids_b, ('id', 'pmid'))
    print('Old fashioned copy:', datetime.now() - start)
    return
Beispiel #16
0
def test_multiple_text_ref_pmc_oa():
    "Test whether a duplicate text ref in pmc oa is handled correctly."
    db = get_temp_db()
    pmc = PmcOA(ftp_url=get_test_ftp_url(), local=True)
    pmc.review_fname = 'test_review_multiple_text_ref_pmc_oa.txt'
    inp = dict.fromkeys(pmc.tr_cols)
    inp.update(pmcid='PMC5579538', doi='10.1021/acsomega.7b00205')
    try:
        pmc.upload_batch(db, [inp], [])
        num_refs = len(db.select_all('text_ref'))
        pmc.upload_batch(db, [inp], [])
        assert len(db.select_all('text_ref')) == num_refs,\
            "Duplicate refs allowed to be submitted.."
    finally:
        # Remove the review file even when the test fails, so reruns do not
        # see stale state. Guard existence in case the first upload died
        # before the file was created.
        if path.exists(pmc.review_fname):
            remove(pmc.review_fname)
    return
Beispiel #17
0
def test_detailed_copy_report_repeated_pmid_with_conflict():
    """A conflicting repeat of an existing pmid is reported, not inserted."""
    db = get_temp_db(True)

    cols = ('pmid', 'pmcid', 'doi')
    seed_rows = {('1', 'PMC1', '10.1/a'), ('2', 'PMC2', '10.2/b')}
    db.copy('text_ref', seed_rows, cols)

    # pmid '1' repeats with a different pmcid — a conflict, not a new row.
    conflicting = {('1', 'PMC3', '10.1/a')}
    existing_ids, new_ids, skipped_rows = \
        db.copy_detailed_report_lazy('text_ref', conflicting, cols,
                                     ('pmid', 'id'))

    assert existing_ids == [('1', 1)]
    assert len(skipped_rows) == 1
    assert not new_ids
Beispiel #18
0
def test_push_report_copy():
    """copy_report_push should update overlaps and report them back."""
    db = get_temp_db(True)
    seed_rows = {('a', '1'), ('b', '2')}
    push_rows = {('b', '2'), ('c', '1'), ('d', '3')}

    db.copy('text_ref', seed_rows, COLS)
    _assert_set_equal(seed_rows, _ref_set(db))
    date_before = db.select_one(db.TextRef.create_date,
                                db.TextRef.pmid == 'b')

    updated = db.copy_report_push('text_ref', push_rows, COLS)

    _assert_set_equal(seed_rows | push_rows, _ref_set(db))
    # The overlap must be reported as updated...
    _assert_set_equal(seed_rows & push_rows, {row[:2] for row in updated})
    # ...and the overlapping row's create_date must actually have changed.
    date_after = db.select_one(db.TextRef.create_date, db.TextRef.pmid == 'b')
    assert date_after != date_before, 'PMID b was not updated.'
Beispiel #19
0
def test_simple_db_insert():
    """Insert two raw statements and check statement/agent row counts."""
    db = get_temp_db()
    db._clear(force=True)

    stmts = [
        Phosphorylation(Agent('MEK', db_refs={'FPLX': 'MEK'}),
                        Agent('ERK', db_refs={'FPLX': 'ERK'}),
                        evidence=Evidence(source_api='test')),
        Complex([Agent(name, db_refs={'FPLX': name})
                 for name in ('MEK', 'ERK')],
                evidence=Evidence(source_api='test')),
    ]
    dbid = db.insert(db.DBInfo, db_name='test', source_api='tester')
    insert_db_stmts(db, stmts, dbid)

    raw_stmts = db.select_all(db.RawStatements)
    raw_agents = db.select_all(db.RawAgents)
    assert len(raw_stmts) == 2, len(raw_stmts)
    assert len(raw_agents) == 8, len(raw_agents)
    db.session.close()
Beispiel #20
0
def _get_db_no_pa_stmts():
    """Build a temp db with refs, content, readings, and raw statements.

    Populates three text refs, their content, seven readings, raw
    reading statements, and raw database statements — but no
    preassembled statements, hence the name.

    Returns
    -------
    The populated temporary database handle.
    """
    db = get_temp_db(clear=True)

    db_builder = DbBuilder(db)
    # Three refs: two with pmcids, one pmid-only.
    db_builder.add_text_refs([('12345', 'PMC54321'), ('24680', 'PMC08642'),
                              ('97531', )])
    # One content-type list per text ref, in ref order.
    db_builder.add_text_content([['pubmed-ttl', 'pubmed-abs', 'pmc_oa'],
                                 ['pubmed-abs', 'manuscripts'],
                                 ['pubmed-ttl', 'pubmed-abs']])
    # One reader list per content entry, in content order.
    db_builder.add_readings([['REACH', 'TRIPS'], ['REACH', 'SPARSER'],
                             ['REACH', 'ISI'],
                             ['SPARSER'], ['REACH', 'SPARSER'],
                             ['SPARSER', 'TRIPS', 'REACH'], ['REACH',
                                                             'EIDOS']])
    # One statement list per reading, in reading order; a (stmt, text)
    # tuple carries custom evidence text.
    db_builder.add_raw_reading_statements([
        [Phosphorylation(mek, erk)],  # reach pubmed title
        [Phosphorylation(mek, erk, 'T', '124')],  # trips pubmed title
        [
            Phosphorylation(mek, erk),
            Inhibition(erk, ras), (Phosphorylation(mek, erk), 'in the body')
        ],  # reach pubmed-abs
        [
            Complex([mek, erk]),
            Complex([erk, ras]), (Phosphorylation(None, erk), 'In the body')
        ],  # sparser pubmed-abs
        [],  # reach pmc_oa
        [],  # ISI pmc_oa
        [Phosphorylation(map2k1, mapk1)],  # sparser pubmed-abs
        [],  # reach manuscripts
        [],  # sparser manuscripts
        [Inhibition(simvastatin_ng, raf),
         Activation(map2k1_mg, erk)],  # sparser pubmed title
        [],  # TRIPS pubmed title
        [],  # reach pubmed title
        [],  # reach pubmed abs
        [],  # eidos pubmed abs
    ])
    # Database-sourced statements: one list per source, in source order.
    db_builder.add_databases(['biopax', 'tas', 'bel'])
    db_builder.add_raw_database_statements([[
        Activation(mek, raf),
        Inhibition(erk, ras),
        Phosphorylation(mek, erk)
    ], [Inhibition(simvastatin, raf)], [Phosphorylation(mek, erk, 'T',
                                                        '124')]])
    return db
Beispiel #21
0
def test_detailed_copy_report_pmid_and_id():
    """Check id reporting from copy_detailed_report_lazy with extra cols."""
    db = get_temp_db(True)
    initial = _do_init_copy(db)
    additions = {('b', '2'), ('c', '1'), ('d', '3')}

    prior_trids_by_pmid = {
        pmid: trid
        for trid, pmid in db.select_all([db.TextRef.id, db.TextRef.pmid])
    }

    existing_ids, new_ids, skipped_rows = \
        db.copy_detailed_report_lazy('text_ref', additions, COLS,
                                     ('pmid', 'pmcid', 'id'))

    new_trids_by_pmid = {pmid: trid for pmid, trid in new_ids}
    reported_existing = {pmid: trid for pmid, _, trid, in existing_ids}
    # Only pmid 'b' pre-existed, as text ref 1.
    assert reported_existing == {'b': 1}
    _assert_set_equal(initial | additions, _ref_set(db))
    _assert_set_equal(initial & additions, {row[:2] for row in skipped_rows})
    # The pre-existing pmids and the newly created ones must differ.
    assert set(prior_trids_by_pmid.keys()) != set(new_trids_by_pmid.keys())
Beispiel #22
0
def test_normal_db_reading_call():
    """Run the db reading tool end-to-end against a temp db and s3.

    Seeds a temp database with text refs and content, uploads an id
    list to s3, submits a local reading job, and checks the report.
    The s3 prefix is always cleaned up, even on failure.
    """
    s3 = boto3.client('s3')
    chdir(path.expanduser('~'))
    # Put some basic stuff in the test database.
    N = 6
    db = get_temp_db(clear=True)
    db.copy('text_ref', [(i, 'PMID80945%d' % i) for i in range(N)],
            cols=('id', 'pmid'))
    text_content = [
        (i, i, 'pubmed', 'text', 'abstract',
         zip_string('MEK phosphorylates ERK in test %d.' % i))
        for i in range(N)
        ]
    text_content += [
        (N, N-1, 'pmc_oa', 'text', 'fulltext',
         zip_string('MEK phosphorylates ERK. EGFR activates SHC.'))
        ]
    db.copy('text_content', text_content,
            cols=('id', 'text_ref_id', 'source', 'format', 'text_type',
                  'content'))

    # Put an id file on s3.
    basename = 'local_db_test_run'
    s3_prefix = 'reading_results/%s/' % basename
    s3.put_object(Bucket='bigmech', Key=s3_prefix + 'id_list',
                  Body='\n'.join(['%d' % i for i in range(len(text_content))]))

    try:
        # Call the reading tool.
        sub = DbReadingSubmitter(basename, ['sparser'])
        job_name, cmd = sub._make_command(0, len(text_content))
        cmd += ['--test']
        check_call(cmd)
        sub.produce_report()
    finally:
        # Remove garbage on s3 even if the reading call fails, so reruns
        # start from a clean prefix. 'Contents' is absent when no objects
        # match, hence the .get default.
        res = s3.list_objects(Bucket='bigmech', Prefix=s3_prefix)
        for entry in res.get('Contents', []):
            print("Removing %s..." % entry['Key'])
            s3.delete_object(Bucket='bigmech', Key=entry['Key'])
    return
def test_lazy_copier_update():
    """copy_push must overwrite overlapping rows with the new values."""
    db = get_temp_db(clear=True)

    N = int(10**5)
    S = int(10**8)
    fake_pmids_a = {(i, str(random.randint(0, S))) for i in range(N)}
    fake_pmids_b = {(int(N / 2 + i), str(random.randint(0, S)))
                    for i in range(N)}

    # Rows from batch b win wherever ids overlap, because copy_push
    # updates pre-existing rows.
    expected = {id: pmid for id, pmid in fake_pmids_a}
    for id, pmid in fake_pmids_b:
        expected[id] = pmid

    start = datetime.now()
    db.copy('text_ref', fake_pmids_a, ('id', 'pmid'))
    print("First load:", datetime.now() - start)

    # The vanilla copy of overlapping ids must fail. Track the outcome with
    # a flag: putting `assert False` inside the try block would let
    # `except Exception` swallow the AssertionError and pass the test.
    vanilla_failed = False
    try:
        db.copy('text_ref', fake_pmids_b, ('id', 'pmid'))
    except Exception:
        db._conn.rollback()
        vanilla_failed = True
    assert vanilla_failed, "Vanilla copy succeeded when it should have failed."

    # Try adding more text refs lazily. Overlap is guaranteed.
    start = datetime.now()
    db.copy_push('text_ref', fake_pmids_b, ('id', 'pmid'))
    print("Lazy copy:", datetime.now() - start)

    refs = db.select_all([db.TextRef.id, db.TextRef.pmid])
    result = {id: pmid for id, pmid in refs}
    assert result.keys() == expected.keys()
    passed = True
    for id, pmid in expected.items():
        if result[id] != pmid:
            print(id, pmid)
            passed = False
    assert passed, "Result did not match expected."
Beispiel #24
0
    def __init__(self):
        """Set up a temp db preloaded with one batch of fake pmids.

        Leaves two overlapping pmid batches on the instance, with
        `expected` holding the id -> pmid map after batch b overwrites
        batch a on the overlap.
        """
        self.db = get_temp_db(clear=True)

        N = int(10**5)
        S = int(10**8)
        self.fake_pmids_a = {(i, str(random.randint(0, S))) for i in range(N)}
        self.fake_pmids_b = {(int(N / 2 + i), str(random.randint(0, S)))
                             for i in range(N)}

        self.expected = {id: pmid for id, pmid in self.fake_pmids_a}
        for id, pmid in self.fake_pmids_b:
            self.expected[id] = pmid

        start = datetime.now()
        self.db.copy('text_ref', self.fake_pmids_a, ('id', 'pmid'))
        print("First load:", datetime.now() - start)

        # The overlapping vanilla copy must fail. Track the outcome with a
        # flag: putting `assert False` inside the try block would let
        # `except Exception` swallow the AssertionError.
        vanilla_failed = False
        try:
            self.db.copy('text_ref', self.fake_pmids_b, ('id', 'pmid'))
        except Exception:
            self.db._conn.rollback()
            vanilla_failed = True
        assert vanilla_failed, \
            "Vanilla copy succeeded when it should have failed."
Beispiel #25
0
def test_dump():
    """Run the XDD manager update against pre-seeded 'old' DOIs.

    Seeds the database with text refs for half the DOIs found in the
    XDD bucket, runs the manager, and checks that content, readings,
    and raw statements were produced.
    """
    db = get_temp_db(clear=True)
    m = XddManager()

    # Enter "old" DOIs
    s3 = boto3.client('s3')
    res = s3.list_objects_v2(**m.bucket.kw())
    dois = set()
    for ref in res['Contents']:
        key = ref['Key']
        if 'bib' not in key:
            continue
        try:
            obj = s3.get_object(Key=key, **m.bucket.kw())
        except Exception:
            print('ack')
            continue
        bibs = json.loads(obj['Body'].read())
        dois |= {
            bib['identifier'][0]['id']
            for bib in bibs if 'identifier' in bib
        }
    # random.sample requires a sequence; passing a set raises TypeError on
    # Python 3.11+. Sort first for a deterministic candidate order.
    sample_dois = random.sample(sorted(dois), len(dois) // 2)
    new_trs = [db.TextRef.new(doi=doi) for doi in sample_dois]
    print(f"Adding {len(new_trs)} 'old' text refs.")
    db.session.add_all(new_trs)
    db.session.commit()

    # Run the update.
    m.run(db)

    # Check the result.
    assert db.select_all(db.TextRef)
    assert db.select_all(db.TextContent)
    assert db.select_all(db.Reading)
    assert db.select_all(db.RawStatements)
    assert db.select_all(db.RawAgents)
def _construct_database():
    """Build a temp db with refs, content, readings, and pa statements."""
    db = get_temp_db(clear=True)
    builder = DbBuilder(db)
    builder.add_text_refs([('12345', 'PMC54321'), ('24680', 'PMC08642')])
    builder.add_text_content([['pubmed-abs', 'pmc_oa'], ['pubmed-abs']])
    builder.add_readings([['REACH'], ['REACH'], ['REACH', 'SPARSER']])

    mek = Agent('MEK', db_refs={'FPLX': 'MEK'})
    erk = Agent('ERK', db_refs={'FPLX': 'ERK'})
    raf = Agent('RAF', db_refs={'FPLX': 'RAF'})

    # One statement list per reading, in reading order.
    builder.add_raw_reading_statements([
        [Phosphorylation(mek, erk), Complex([mek, erk])],
        [Phosphorylation(mek, erk)],
        [Activation(mek, erk)],
        [Complex([mek, erk]), Complex([raf, erk])],
    ])

    builder.add_databases(['signor'])
    builder.add_raw_database_statements([[Complex([raf, erk])]])

    # Each pa statement lists the raw statement indices it subsumes.
    builder.add_pa_statements([
        (Phosphorylation(mek, erk), [0, 2]),
        (Complex([mek, erk]), [1, 4]),
        (Activation(mek, erk), [3]),
        (Complex([raf, erk]), [5, 6]),
    ])
    return db
Beispiel #27
0
 def __init__(self):
     """Create a cleared temp database and the canned input used to fill it."""
     self.test_db = get_temp_db(clear=True)
     self.test_data = _make_test_db_input()
Beispiel #28
0
def test_db_presence():
    """Smoke test: a cleared temp db accepts a simple TextRef insert."""
    db = get_temp_db(clear=True)
    db.insert(db.TextRef, pmid='12345')
Beispiel #29
0
def test_dump_build():
    """Test the dump pipeline.

    Method
    ------
    CREATE CONTEXT:
    - Create a local principal database with a small amount of content.
      Aim for representation of stmt motifs and sources.
    - Create a local readonly database.
    - Create a fake bucket (moto)

    RUN THE DUMP

    CHECK THE RESULTS
    """
    # Guard: only run under the test configuration.
    assert config.is_db_testing()

    # Create the dump locale.
    s3 = boto3.client('s3')
    dump_head = config.get_s3_dump()
    s3.create_bucket(Bucket=dump_head.bucket)
    assert dump_head.bucket == S3_DATA_LOC['bucket']

    # Create the principal database.
    db = get_temp_db(clear=True)

    db.copy('text_ref', [        # trid
        ('1', 1, 'PMC1', 1),     # 1
        ('2', 2, 'PMC2', 2),     # 2
        ('3', 3, None, None),    # 3
        (None, None, 'PMC4', 4)  # 4
    ], ('pmid', 'pmid_num', 'pmcid', 'pmcid_num'))

    db.copy('mesh_ref_annotations', [
        (1, 11, False),
        (1, 13, False),
        (1, 12, True),
        (2, 12, True),
        (3, 13, False),
        (3, 33, True)
    ], ('pmid_num', 'mesh_num', 'is_concept'))

    db.copy('text_content', [              # tcid
        (1, 'pubmed', 'txt', 'abstract'),  # 1
        (1, 'pmc', 'xml', 'fulltext'),     # 2
        (2, 'pubmed', 'txt', 'title'),     # 3
        (3, 'pubmed', 'txt', 'abstract'),  # 4
        (3, 'pmc', 'xml', 'fulltext'),     # 5
        (4, 'pmc', 'xml', 'fulltext')      # 6
    ], ('text_ref_id', 'source', 'format', 'text_type'))

    # NOTE(review): the format value 'emtpy' below looks like a typo for
    # 'empty'; kept as-is — confirm nothing downstream matches on it.
    db.copy('reading', [(tcid, rdr, 1, reader_versions[rdr][-1], 'emtpy')
                        for tcid, rdr in [
        # 1             2             3
        (1, 'reach'), (1, 'eidos'), (1, 'isi'),

        # 4
        (2, 'reach'),

        # 5             6            7
        (3, 'reach'), (3, 'eidos'), (3, 'trips'),

        # 8
        (4, 'reach'),

        # 9
        (5, 'reach'),

        # 10
        (6, 'reach')
    ]], ('text_content_id', 'reader', 'batch_id', 'reader_version', 'format'))

    db.copy('db_info', [
        ('signor', 'signor', 'Signor'),       # 1
        ('pc', 'biopax', 'Pathway Commons'),  # 2
        ('medscan', 'medscan', 'MedScan')     # 3
    ], ('db_name', 'source_api', 'db_full_name'))

    # Raw statements keyed by reading id / db_info id.
    raw_stmts = {
        'reading': {
            2: [
                Inhibition(
                    Agent('Fever', db_refs={'TEXT': 'fever', 'MESH': 'D005334'}),
                    Agent('Cough', db_refs={'TEXT': 'cough', 'MESH': 'D003371'}),
                    evidence=Evidence(text="We found fever inhibits cough.")
                )
            ],
            4: [
                Phosphorylation(
                    Agent('MEK', db_refs={'FPLX': 'MEK', 'TEXT': 'mek'}),
                    # NOTE(review): 'ERK' grounded to FPLX 'MEK' here —
                    # possibly a deliberate bad grounding; confirm.
                    Agent('ERK', db_refs={'FPLX': 'MEK', 'TEXT': 'erk'}),
                    evidence=Evidence(text="mek phosphorylates erk, so say I.")
                ),
                Activation(
                    Agent('MAP2K1', db_refs={'HGNC': '6840', 'TEXT': 'MEK1'}),
                    Agent('MAPK1', db_refs={'HGNC': '6871', 'TEXT': 'ERK1'}),
                    evidence=Evidence(text="MEK1 activates ERK1, or os I'm told.")
                ),
                Activation(
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'ERK'}),
                    Agent('JNK', db_refs={'FPLX': 'JNK', 'TEXT': 'JNK'}),
                    evidence=Evidence(text="ERK activates JNK, maybe.")
                ),
                Complex([
                    Agent('MEK', db_refs={'FPLX': 'MEK', 'TEXT': 'MAP2K'}),
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'MAPK'}),
                    Agent('RAF', db_refs={'FPLX': 'RAF', 'TEXT': 'RAF'})
                ], evidence=Evidence(text="MAP2K, MAPK, and RAF form a complex."))
            ],
            7: [
                Activation(
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'ERK'}),
                    Agent('JNK', db_refs={'FPLX': 'JNK', 'TEXT': 'JNK'}),
                    evidence=Evidence(text='ERK activates JNK, maybe.')
                )
            ],
            8: [
                Complex([
                    Agent('MEK', db_refs={'FPLX': 'MEK', 'TEXT': 'mek'}),
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'erk'})
                ], evidence=Evidence(text="...in the mek-erk complex."))
            ],
        },
        'databases': {
            2: [
                Conversion(
                    Agent('FRK', db_refs={'HGNC': '3955'}),
                    [Agent('ATP', db_refs={'MESH': 'D000255'})],
                    [Agent('hydron', db_refs={'CHEBI': 'CHEBI:15378'})]
                )
            ],
            3: [
                Phosphorylation(
                    Agent('MEK', db_refs={'FPLX': 'MEK', 'TEXT': 'MEK'}),
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'ERK'}),
                    evidence=Evidence(text="...MEK phosphorylates ERK medscan.")
                )
            ]
        }
    }
    # Load the raw statements into the principal database.
    simple_insert_stmts(db, raw_stmts)

    # Run preassembly.
    prass.create_corpus(db)

    # Do the dump proceedure.
    ro = get_temp_ro(clear=True)
    dump(db, ro)

    # Check that the s3 dump exists.
    all_dumps = dm.list_dumps()
    assert len(all_dumps) == 1

    # Check to make sure all the dump files are present.
    dump_path = all_dumps[0]
    file_list = dump_path.list_objects(s3)
    assert dm.Start.from_list(file_list)
    assert dm.Readonly.from_list(file_list)
    assert dm.Belief.from_list(file_list)
    assert dm.Sif.from_list(file_list)
    assert dm.StatementHashMeshId.from_list(file_list)
    assert dm.FullPaStmts.from_list(file_list)
    assert dm.End.from_list(file_list)

    # Check what tables are active in the readonly database.
    active_tables = ro.get_active_tables()
    for tbl in ro.get_tables():
        if ro.tables[tbl]._temp:
            # If it was temp, it should be gone.
            assert tbl not in active_tables
        else:
            # Otherwise, it should be there.
            assert tbl in active_tables

    # Check that the principal db has no more ro schema.
    assert 'readonly' not in db.get_schemas()

    # Check contents of the readonly database.
    assert len(ro.select_all(ro.FastRawPaLink)) \
           == len(db.select_all(db.RawUniqueLinks))

    # Check that a query basically works.
    from indra_db.client.readonly import HasAgent
    res = HasAgent('MEK').get_statements(ro)
    assert len(res.statements()) == 2, len(res.statements())

    # Check that belief is represented in the table.
    bdict = {h: b for h, b in ro.select_all([ro.SourceMeta.mk_hash,
                                             ro.SourceMeta.belief])}
    assert all(1 >= b > 0 for b in bdict.values())

    # Check to make sure lambda was diverted correctly.
    call_records = config.get_test_call_records()
    assert len(call_records) == 2
    assert all(rec.func_name == '_set_lambda_env' for rec in call_records)
    assert all(isinstance(rec.args[1], dict) for rec in call_records)
    assert 'INDRAROOVERRIDE' in call_records[0].args[1]
    assert call_records[0].args[1]['INDRAROOVERRIDE'] == str(db.url)
    assert not call_records[1].args[1]
Beispiel #30
0
def _build_test_set():
    """Build a temp db with populated readonly meta tables for query tests.

    Generates a grid of statements over a few agents, assigns randomized
    source and mesh profiles to each, and copies the resulting rows into
    the readonly meta tables (source, mesh, name, text, other) of a
    temporary database, which is returned.
    """
    agents = [{
        'NAME': 'ERK',
        'FPLX': 'ERK',
        'TEXT': 'MAPK'
    }, {
        'NAME': 'TP53',
        'HGNC': '11998'
    }, {
        'NAME': 'MEK',
        'FPLX': 'MEK'
    }, {
        'NAME': 'Vemurafenib',
        'CHEBI': 'CHEBI:63637'
    }]
    stypes = ['Phosphorylation', 'Activation', 'Inhibition', 'Complex']
    sources = [('medscan', 'rd'), ('reach', 'rd'), ('pc11', 'db'),
               ('signor', 'db')]
    mesh_ids = ['D000225', 'D002352', 'D015536']

    # Build every mesh-id combination of size 0, 1, and 2. For size 0,
    # combinations() yields one empty tuple; for size 1 a list-of-lists is
    # used instead, matching the original shapes.
    mesh_combos = []
    for num_mesh in range(0, 3):
        if num_mesh == 1:
            mesh_groups = [[mid] for mid in mesh_ids]
        else:
            mesh_groups = combinations(mesh_ids, num_mesh)

        mesh_combos.extend(list(mesh_groups))
    random.shuffle(mesh_combos)

    # Build randomized source profiles over all source subsets of size 1-4.
    source_data = []
    for num_srcs in range(1, 5):
        if num_srcs == 1:
            src_iter = [[src] for src in sources]
        else:
            src_iter = combinations(sources, num_srcs)

        for src_list in src_iter:
            only_src = None if len(src_list) > 1 else src_list[0][0]
            has_rd = any(t == 'rd' for _, t in src_list)
            if has_rd:
                # NOTE(review): this reassigns the outer `mesh_ids` list.
                # mesh_combos was built before the loop, so behavior is
                # unaffected, but the shadowing is fragile.
                mesh_ids = mesh_combos[len(source_data) % len(mesh_combos)]
            else:
                mesh_ids = []
            source_data.append({
                'sources': {src: random.randint(1, 50)
                            for src, _ in src_list},
                'has_rd': any(t == 'rd' for _, t in src_list),
                'has_db': any(t == 'db' for _, t in src_list),
                'only_src': only_src,
                'mesh_ids': mesh_ids
            })
    random.shuffle(source_data)

    # Statement specs: (stype, agent refs, activity, is_active).
    stmts = [
        tuple(tpl) + (None, None)
        for tpl in product(stypes, permutations(agents, 2))
    ]
    stmts += [('ActiveForm', (ref, ), activity, is_active)
              for activity, is_active, ref in product(
                  ['transcription', 'activity'], [True, False], agents)]

    # Track Complex member pairs already emitted (Complex is symmetric).
    complex_pairs = []

    name_meta_rows = []
    name_meta_cols = ('mk_hash', 'ag_num', 'db_id', 'role_num', 'type_num',
                      'ev_count', 'activity', 'is_active', 'agent_count')

    text_meta_rows = []
    text_meta_cols = ('mk_hash', 'ag_num', 'db_id', 'role_num', 'type_num',
                      'ev_count', 'activity', 'is_active', 'agent_count')

    other_meta_rows = []
    other_meta_cols = ('mk_hash', 'ag_num', 'db_name', 'db_id', 'role_num',
                       'type_num', 'ev_count', 'activity', 'is_active',
                       'agent_count')

    source_meta_rows = []
    source_meta_cols = ('mk_hash', 'reach', 'medscan', 'pc11', 'signor',
                        'ev_count', 'type_num', 'activity', 'is_active',
                        'agent_count', 'num_srcs', 'src_json', 'only_src',
                        'has_rd', 'has_db')

    mesh_meta_rows = []
    mesh_meta_cols = ('mk_hash', 'ev_count', 'mesh_num', 'type_num',
                      'activity', 'is_active', 'agent_count')
    for stype, refs, activity, is_active in stmts:

        # Extract agents, and make a Statement.
        StmtClass = get_statement_by_name(stype)
        if stype == 'ActiveForm':
            ag = make_agent_from_ref(refs[0])
            stmt = StmtClass(ag, activity=activity, is_active=is_active)
        else:
            ag1 = make_agent_from_ref(refs[0])
            ag2 = make_agent_from_ref(refs[1])
            if stype == 'Complex':
                if {ag1.name, ag2.name} in complex_pairs:
                    continue
                stmt = StmtClass([ag1, ag2])
                complex_pairs.append({ag1.name, ag2.name})
            else:
                stmt = StmtClass(ag1, ag2)

        # Connect with a source.
        source_dict = source_data[len(source_meta_rows) % len(source_data)]
        ev_count = sum(source_dict['sources'].values())
        src_row = (stmt.get_hash(), )
        for src_name in ['reach', 'medscan', 'pc11', 'signor']:
            src_row += (source_dict['sources'].get(src_name), )
        src_row += (ev_count, ro_type_map.get_int(stype), activity, is_active,
                    len(refs), len(source_dict['sources']),
                    json.dumps(source_dict['sources']),
                    source_dict['only_src'], source_dict['has_rd'],
                    source_dict['has_db'])
        source_meta_rows.append(src_row)

        # Add mesh rows
        for mesh_id in source_dict['mesh_ids']:
            mesh_meta_rows.append(
                (stmt.get_hash(), ev_count, int(mesh_id[1:]),
                 ro_type_map.get_int(stype), activity, is_active, len(refs)))

        # Generate agent rows.
        ref_rows, _, _ = extract_agent_data(stmt, stmt.get_hash())
        for row in ref_rows:
            row = row[:4] + (ro_role_map.get_int(
                row[4]), ro_type_map.get_int(stype), ev_count, activity,
                             is_active, len(refs))
            # NAME and TEXT rows drop the db_name column (row[2]) since
            # their tables imply it; everything else keeps it.
            if row[2] == 'NAME':
                row = row[:2] + row[3:]
                name_meta_rows.append(row)
            elif row[2] == 'TEXT':
                row = row[:2] + row[3:]
                text_meta_rows.append(row)
            else:
                other_meta_rows.append(row)

    # Create the readonly meta tables and load the generated rows.
    db = get_temp_db(clear=True)
    src_meta_cols = [{'name': col} for col, _ in sources]
    db.SourceMeta.load_cols(db.engine, src_meta_cols)
    for tbl in [
            db.SourceMeta, db.MeshMeta, db.NameMeta, db.TextMeta, db.OtherMeta
    ]:
        tbl.__table__.create(db.engine)
    db.copy('readonly.source_meta', source_meta_rows, source_meta_cols)
    db.copy('readonly.mesh_meta', mesh_meta_rows, mesh_meta_cols)
    db.copy('readonly.name_meta', name_meta_rows, name_meta_cols)
    db.copy('readonly.text_meta', text_meta_rows, text_meta_cols)
    db.copy('readonly.other_meta', other_meta_rows, other_meta_cols)
    return db