Example #1
def main():
    parser = get_parser()
    args = parser.parse_args()
    if args.test:
        if 'test' not in args.database:
            from indra_db.tests.util import get_temp_db
            db = get_temp_db()
        else:
            db = get_db(args.database)
    elif args.database == 'primary':
        db = get_primary_db()
    else:
        db = get_db(args.database)

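    # The readers to run in this bulk reading job.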
    readers = ['SPARSER', 'REACH', 'TRIPS', 'ISI', 'EIDOS', 'MTI']
    if args.method == 'local':
        bulk_manager = BulkLocalReadingManager(readers,
                                               buffer_days=args.buffer,
                                               n_procs=args.num_procs)
    elif args.method == 'aws':
        bulk_manager = BulkAwsReadingManager(readers,
                                             buffer_days=args.buffer,
                                             project_name=args.project_name)
    else:
        assert False, "This shouldn't be allowed."

    if args.task == 'read_all':
        bulk_manager.read_all(db)
    elif args.task == 'read_new':
        bulk_manager.read_new(db)
    return
Example #2
def test_num_evidence():
    ro = get_db('primary')
    q = HasNumEvidence(tuple(range(5, 10)))
    res = q.get_statements(ro, limit=5, ev_limit=8)
    assert all(5 <= n < 10 for n in res.evidence_totals.values())
    stmts = res.statements()
    assert all(5 <= len(s.evidence) <= 8 for s in stmts)
Example #3
def read_db_ids_search_terms(id_search_terms, id_type):
    """Return extracted EmmaaStatements from INDRA database given an
    ID-search term dict.

    Parameters
    ----------
    id_search_terms : dict
        A dict representing a set of IDs pointing to the search terms that
        produced them.
    id_type : str
        The type of the given IDs (e.g. 'pmid').

    Returns
    -------
    list[:py:class:`emmaa.model.EmmaaStatement`]
        A list of EmmaaStatements extracted from the given IDs.
    """
    ids = list(id_search_terms.keys())
    date = datetime.datetime.utcnow()
    db = get_db('primary')
    id_stmts = get_raw_stmt_jsons_from_papers(ids, id_type=id_type, db=db)
    estmts = []
    for _id, stmt_jsons in id_stmts.items():
        stmts = stmts_from_json(stmt_jsons)
        id_estmts = to_emmaa_stmts(stmts, date, id_search_terms[_id],
                                   {'internal': True})
        estmts += id_estmts
    return estmts
Example #4
def test_evidence_filtering_trios():
    ro = get_db('primary')
    q1 = HasAgent('TP53')
    q_list = [
        ~HasOnlySource('medscan'),
        HasSources(['reach', 'sparser']),
        HasDatabases(),
        HasReadings(),
        FromMeshId('D001943')
    ]
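    # Try each unordered trio of constraints, combining their evidence
    # filters by both intersection and union.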
    for q2, q3, q4 in combinations(q_list, 3):
        query = q1 | q2 | q3 | q4
        ev_filter = q2.ev_filter() & q3.ev_filter() & q4.ev_filter()
        query.get_statements(ro,
                             limit=2,
                             ev_limit=5,
                             evidence_filter=ev_filter)

        ev_filter = q2.ev_filter() | q3.ev_filter() | q4.ev_filter()
        query.get_statements(ro,
                             limit=2,
                             ev_limit=5,
                             evidence_filter=ev_filter)

    for q2, q3, q4 in permutations(q_list, 3):
        query = q1 | q2 | q3 | q4
        ev_filter = q2.ev_filter() & q3.ev_filter() | q4.ev_filter()
        query.get_statements(ro,
                             limit=2,
                             ev_limit=5,
                             evidence_filter=ev_filter)
Example #5
def get_belief(db=None, partition=True):
    if db is None:
        db = dbu.get_db('primary')

    if partition:
        import networkx as nx
        hashes = {h for h, in db.select_all(db.PAStatements.mk_hash)}
        link_pair = [
            db.PASupportLinks.supporting_mk_hash,
            db.PASupportLinks.supported_mk_hash
        ]
        links = {tuple(link) for link in db.select_all(link_pair)}
        g = nx.Graph()
        g.add_nodes_from(hashes)
        g.add_edges_from(links)

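        # Accumulate connected components of the support graph into batches
        # of at least 10,000 hashes so that belief is computed over groups
        # of mutually supporting statements.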
        group = set()
        beliefs = {}
        for c in nx.connected_components(g):
            group |= c

            if len(group) >= 10000:
                sg = g.subgraph(group)
                stmts = load_mock_statements(db,
                                             hashes=group,
                                             sup_links=list(sg.edges))
                beliefs.update(calculate_belief(stmts))
                group = set()

        # Handle any remaining statements in a final, smaller-than-threshold
        # batch so that every statement gets a belief score.
        if group:
            sg = g.subgraph(group)
            stmts = load_mock_statements(db,
                                         hashes=group,
                                         sup_links=list(sg.edges))
            beliefs.update(calculate_belief(stmts))
        return beliefs
    else:
        stmts = load_mock_statements(db)
        return calculate_belief(stmts)
Example #6
    def __check_stmts(self,
                      json_stmts,
                      check_support=False,
                      check_stmts=False):
        assert len(json_stmts) != 0, \
            'Did not get any statements.'
        stmts = stmts_from_json(json_stmts)
        for s in stmts:
            assert s.evidence, "Statement lacks evidence."
            for ev in s.evidence:
                if ev.source_api in {'reach', 'sparser', 'trips'} \
                        and ev.pmid is None:

                    # Check because occasionally there is genuinely no pmid.
                    from indra_db.util import get_db
                    db = get_db('primary')
                    tr = db.select_one(db.TextRef,
                                       db.TextRef.id == ev.text_refs['TRID'])
                    assert tr.pmid is None, \
                        ('Statement from reading missing pmid:\n%s\n%s.'
                         % (s, json.dumps(ev.to_json(), indent=2)))

        # To allow for faster response-times, we currently do not include
        # support links in the response.
        if check_support:
            assert any([s.supports + s.supported_by for s in stmts]),\
                ("Some statements lack support: %s."
                 % str([str(s) for s in stmts if not s.supports+s.supported_by]))
            if check_stmts:
                assert all([not s1.matches(s2)
                            for s1, s2 in combinations(stmts, 2)]),\
                    ("Some statements match: %s."
                     % str([(s1, s2) for s1, s2 in combinations(stmts, 2)
                            if s1.matches(s2)]))
        return
Example #7
def get_unique_text_refs():
    """Get unique INDRA DB TextRef IDs for all identifiers in CORD19.

    Queries TextRef IDs with PMIDs, PMCIDs, and DOIs from CORD19, then
    deduplicates to obtain a unique set of TextRefs.

    Returns
    -------
    list of TextRef
        Unique TextRef objects matching the CORD19 identifiers.
    """
    pmcids = get_ids('pmcid')
    pmids = [fix_pmid(pmid) for pmid in get_ids('pubmed_id')]
    dois = [fix_doi(doi) for doi in get_ids('doi')]
    # Get unique text_refs from the DB
    db = get_db('primary')
    print("Getting TextRefs by PMCID")
    tr_pmcids = db.select_all(db.TextRef.id, db.TextRef.pmcid_in(pmcids))
    print("Getting TextRefs by PMID")
    tr_pmids = db.select_all(db.TextRef.id, db.TextRef.pmid_in(pmids))
    tr_dois = []
    for ix, doi_batch in enumerate(batch_iter(dois, 10000)):
        print("Getting Text Refs by DOI batch", ix)
        tr_doi_batch = db.select_all(
            db.TextRef.id, db.TextRef.doi_in(doi_batch, filter_ids=True))
        tr_dois.extend(tr_doi_batch)
    ids = {res.id for res_list in (tr_dois, tr_pmcids, tr_pmids)
           for res in res_list}
    print(len(ids), "unique TextRefs in DB")
    trs = db.select_all(db.TextRef, db.TextRef.id.in_(ids))
    return trs
Example #8
def get_text_content_stats(fname=None, db=None):
    if db is None:
        db = get_db('primary')
    tc_rdng_link = db.TextContent.id == db.Reading.text_content_id
    __report_stat("\nText Content statistics:", fname)
    __report_stat('------------------------', fname)
    total_content = db.count(db.TextContent)
    __report_stat("Total number of text content entries: %d" % total_content)
    latest_updates = (db.session.query(db.Updates.source,
                                       func.max(db.Updates.datetime)).group_by(
                                           db.Updates.source).all())
    __report_stat(
        ("Latest updates:\n    %s" %
         '\n    '.join(['%s: %s' % (s, d) for s, d in latest_updates])), fname)
    content_read = db.count(db.Reading.text_content_id)
    __report_stat("Total content read: %d" % content_read, fname)
    fulltext_content = db.count(db.TextContent,
                                db.TextContent.text_type == 'fulltext')
    __report_stat("Number of fulltext entries: %d" % fulltext_content, fname)
    fulltext_read = db.count(db.TextContent,
                             db.TextContent.text_type == 'fulltext',
                             tc_rdng_link)
    __report_stat("Number of fulltext entries read: %d" % fulltext_read, fname)
    _report_groups(db, db.TextContent.id, db.TextContent.source, fname)
    _report_groups(db, db.TextContent.id, db.TextContent.source, fname,
                   tc_rdng_link)
    return
Example #9
    def dump(self, continuing=False):
        if self.use_principal:
            ro = get_db(self.db_label)
        else:
            ro = get_ro(self.db_label)
        s3_path = self.get_s3_path()
        dump_sif(s3_path, ro=ro)
Example #10
def get_db_statistics(fname=None, db=None, tables=None):
    """Get statistics on the contents of the database"""
    if db is None:
        db = get_db('primary')

    task_dict = {
        'text_ref': get_text_ref_stats,
        'text_content': get_text_content_stats,
        'readings': get_readings_stats,
        'raw_statements': get_statements_stats,
        'pa_statements': get_pa_statement_stats
    }

    task_order = [
        'text_ref', 'text_content', 'readings', 'raw_statements',
        'pa_statements'
    ]

    # Get the statistics
    if tables is None:
        for task_name in task_order:
            stat_meth = task_dict[task_name]
            stat_meth(fname, db)
    else:
        table_set = set(tables)
        for task_name in [tn for tn in task_order if tn in table_set]:
            task_dict[task_name](fname, db)

    return
Example #11
def get_statements_stats(fname=None, db=None, indra_version=None):
    if db is None:
        db = get_db('primary')
    tc_rdng_link = db.TextContent.id == db.Reading.text_content_id
    stmt_rdng_link = db.Reading.id == db.RawStatements.reader_ref

    __report_stat('\nStatement Statistics:', fname)
    __report_stat('---------------------', fname)
    if indra_version is not None:
        filters = [db.RawStatements.indra_version == indra_version]
    else:
        filters = []
    total_raw_statements = db.count(db.RawStatements, *filters)
    __report_stat("Total number of raw statements: %d" % total_raw_statements,
                  fname)
    readers = db.session.query(db.Reading.reader).distinct().all()
    sources = db.session.query(db.TextContent.source).distinct().all()
    stats = ''
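    # Count raw statements for each reader/content-source combination.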
    for reader, in readers:
        for src, in sources:
            cnt = db.count(db.RawStatements, stmt_rdng_link, tc_rdng_link,
                           db.Reading.reader == reader,
                           db.TextContent.source == src, *filters)
            stats += ('    Raw statements from %s reading %s: %d\n' %
                      (reader, src, cnt))
    __report_stat("Statements by reader and content source:\n%s" % stats,
                  fname)
    _report_groups(db, db.RawStatements.id, db.DBInfo.db_name, fname,
                   db.RawStatements.db_info_id == db.DBInfo.id)
    if indra_version is None:
        _report_groups(db, db.RawStatements.id, db.RawStatements.indra_version,
                       fname)
    return
Example #12
def _main():
    parser = _make_parser()
    args = parser.parse_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
        from indra_db.databases import logger as db_logger
        db_logger.setLevel(logging.DEBUG)
    print("Getting %s database." % args.database)
    db = get_db(args.database)
    assert db is not None
    db.grab_session()
    s3_cache = S3Path.from_string(args.cache)
    pa = DbPreassembler(args.batch,
                        s3_cache,
                        stmt_type=args.stmt_type,
                        yes_all=args.yes_all)

    desc = 'Continuing' if args.continuing else 'Beginning'
    print("%s to %s preassembled corpus." % (desc, args.task))
    if args.task == 'create':
        pa.create_corpus(db, args.continuing)
    elif args.task == 'update':
        pa.supplement_corpus(db, args.continuing)
    else:
        raise IndraDBPreassemblyError('Unrecognized task: %s.' % args.task)
Example #13
def test_has_hash():
    ro = get_db('primary')
    hashes = {h for h, in ro.session.query(ro.SourceMeta.mk_hash).limit(10)}
    q = HasHash(hashes)
    res = q.get_statements(ro, limit=5, ev_limit=8)
    assert set(res.results.keys()) < hashes
    assert set(res.results.keys()) == set(res.source_counts.keys())
Example #14
def load_readonly_dump(db_label, ro_label, dump_file):
    principal_db = get_db(db_label)
    readonly_db = get_ro(ro_label)
    logger.info("Using dump_file = \"%s\"." % dump_file)
    logger.info("%s - Beginning upload of content (est. ~30 minutes)" %
                datetime.now())
    with ReadonlyTransferEnv(principal_db, readonly_db):
        readonly_db.load_dump(dump_file)
Example #15
def test_from_papers():
    ro = get_db('primary')
    pmid = '27014235'
    q = FromPapers([('pmid', pmid)])
    res = q.get_statements(ro, limit=5)
    assert res.statements()
    assert all(
        any(ev.text_refs.get('PMID') == pmid for ev in s.evidence)
        for s in res.statements())
Example #16
    def dump(self, continuing=False):
        if self.use_principal:
            ro = get_db(self.db_label)
        else:
            ro = get_ro(self.db_label)
        query_res = ro.session.query(ro.FastRawPaLink.pa_json.distinct())
        json_list = [json.loads(js[0]) for js in query_res.all()]
        s3 = boto3.client('s3')
        s3.put_object(Body=json.dumps(json_list), **self.get_s3_path().kw())
Example #17
def load_readonly(from_dump):
    """Load the readonly database with readonly schema dump."""
    start = Start.from_date(from_dump)
    dump_file = Readonly.from_list(start.manifest).get_s3_path()
    if not dump_file:
        print(f"ERROR: No readonly dump for {start.date_stamp}")
        return
    load_readonly_dump(get_db('primary', protected=True),
                       get_ro('primary', protected=False), dump_file)
Example #18
def get_text_refs_for_pubmed_search_term(search_term, **kwargs):
    """"Returns text ref IDs for PMIDs obtained using a PubMed search."""
    print('Searching for %s' % search_term)
    pmids = pubmed_client.get_ids(search_term, **kwargs)
    print('Getting TextRefs for %d PMIDs' % len(pmids))
    db = get_db('primary')
    tr_pmids = db.select_all(db.TextRef.id, db.TextRef.pmid_in(pmids))
    trids = {res.id for res in tr_pmids}
    return trids
Example #19
def test_get_agents():
    ro = get_db('primary')
    query = HasAgent('TP53')
    res = query.get_agents(ro, limit=10)
    assert isinstance(res, QueryResult)
    assert len(res.results) <= 10, len(res.results)
    js = res.json()
    assert 'results' in js
    assert len(js['results']) == len(res.results)
Example #20
def get_reach_readings(tr_dicts, dump_dir=None):
    db = get_db('primary')
    # Get text ref dicts with article metadata aligned between DB and CORD19
    # Get REACH readings
    reach_data = db.select_all((db.Reading, db.TextRef, db.TextContent.source,
                                db.TextContent.text_type),
                               db.TextRef.id.in_(tr_dicts.keys()),
                               db.TextContent.text_ref_id == db.TextRef.id,
                               db.Reading.text_content_id == db.TextContent.id,
                               db.Reading.reader == 'REACH')

    # Group readings by TextRef
    def tr_id_key_func(rd):
        return rd[1].id

    def content_priority_func(rd):
        text_type_priorities = {'fulltext': 0, 'abstract': 1, 'title': 2}
        source_priorities = {
            'pmc_oa': 0,
            'manuscripts': 1,
            'elsevier': 2,
            'pubmed': 3
        }
        return (rd[1].id, text_type_priorities[rd[3]],
                source_priorities[rd[2]])

    # Sort by TextRef ID and content type/source
    reach_data.sort(key=content_priority_func)
    # Iterate over groups
    rds_filt = []
    for tr_id, tr_group in groupby(reach_data, tr_id_key_func):
        rds = list(tr_group)
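        # After sorting, the first reading in each TextRef group has the
        # highest-priority text type and source.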
        best_reading = rds[0]
        tr_dicts[tr_id]['READING_ID'] = best_reading.Reading.id
        rds_filt.append(best_reading)
    # If a dump directory is given, put all files in it
    trs_by_cord = {}
    if dump_dir:
        json_dir = join(dump_dir, 'json')
        os.mkdir(json_dir)
        for reading_result in rds_filt:
            tr = reading_result.TextRef
            reading = reading_result.Reading
            # If the reading output is empty, skip
            if not reading.bytes:
                continue
            text_ref = tr_dicts[tr.id]
            cord_uid = text_ref['CORD19_UID']
            trs_by_cord[cord_uid] = text_ref
            with open(join(json_dir, f'{cord_uid}.json'), 'wt') as f:
                content = zlib.decompress(reading.bytes, 16 + zlib.MAX_WBITS)
                f.write(content.decode('utf8'))
        # Dump the metadata dictionary
        with open(join(dump_dir, 'metadata.json'), 'wt') as f:
            json.dump(trs_by_cord, f, indent=2)
    return rds_filt
Example #21
def get_raw_stmts(tr_dicts, date_limit=None):
    """Return all raw stmts in INDRA DB for a given set of TextRef IDs.

    Parameters
    ----------
    tr_dicts : dict of text ref information
        Keys are text ref IDs (ints) mapped to dictionaries of text ref
        metadata.

    date_limit : Optional[int]
        The number of days into the past from which to include readings.

    Returns
    -------
    list of stmts
        Raw INDRA Statements retrieved from the INDRA DB.
    """
    # Get raw statement IDs from the DB for the given TextRefs
    db = get_db('primary')
    # Get statements for the given text refs
    text_ref_ids = list(tr_dicts.keys())
    print(f"Distilling statements for {len(text_ref_ids)} TextRefs")
    start = time.time()
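    # Join TextRef -> TextContent -> Reading -> RawStatements so that only
    # raw statements derived from the given TextRefs are considered.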
    clauses = [
        db.TextRef.id.in_(text_ref_ids),
        db.TextContent.text_ref_id == db.TextRef.id,
        db.Reading.text_content_id == db.TextContent.id,
        db.RawStatements.reading_id == db.Reading.id
    ]
    if date_limit:
        start_date = (datetime.datetime.utcnow() -
                      datetime.timedelta(days=date_limit))
        print(f'Limiting to stmts from readings in the last {date_limit} days')
        clauses.append(db.Reading.create_date > start_date)
    db_stmts = distill_stmts(db, get_full_stmts=True, clauses=clauses)
    # Group statements by the ID of the TextRef they come from
    stmts_by_trid = {}
    for stmt in db_stmts:
        trid = stmt.evidence[0].text_refs['TRID']
        if trid not in stmts_by_trid:
            stmts_by_trid[trid] = [stmt]
        else:
            stmts_by_trid[trid].append(stmt)
    # For every statement, update the text ref dictionary of the evidence
    # object with the aligned DB/CORD19 dictionaries obtained from the
    # function cord19_metadata_for_trs:
    stmts_flat = []
    for tr_id, stmt_list in stmts_by_trid.items():
        tr_dict = tr_dicts[tr_id]
        if tr_dict:
            for stmt in stmt_list:
                stmt.evidence[0].text_refs.update(tr_dict)
        stmts_flat += stmt_list
    elapsed = time.time() - start
    print(f"{elapsed} seconds")
    return stmts_flat
Example #22
    def dump(self, continuing=False):
        if self.use_principal:
            ro = get_db(self.db_label)
        else:
            ro = get_ro(self.db_label)

        q = ro.select_all([ro.MeshMeta.mk_hash, ro.MeshMeta.mesh_num])

        s3 = boto3.client('s3')
        s3.put_object(Body=pickle.dumps(q.all()), **self.get_s3_path().kw())
Example #23
def show_list():
    """List the readers and their most recent runs."""
    import tabulate
    from indra_db.util import get_db

    db = get_db('primary')
    rows = [(rn, format_date(lu))
            for rn, lu in ReadingManager.get_latest_updates(db).items()]
    headers = ('Reader', 'Last Updated')
    print(tabulate.tabulate(rows, headers))
Example #24
def test_evidence_count_is_10():
    ro = get_db('primary')
    query = HasAgent('TP53') - HasOnlySource('medscan')
    res = query.get_statements(ro, limit=2, ev_limit=10)
    assert isinstance(res, StatementQueryResult)
    stmts = res.statements()
    assert len(stmts) == 2
    assert all(len(s.evidence) <= 10 for s in stmts)
    assert res.returned_evidence == 20
    assert sum(res.evidence_totals.values()) > 20
Example #25
def show_list():
    """List the knowledge sources and their status."""
    import tabulate
    from indra_db.util import get_db
    db = get_db('primary')
    rows = [(M.name, M.short_name, format_date(M.get_last_update(db)))
            for M in KnowledgebaseManager.__subclasses__()]
    print(
        tabulate.tabulate(rows, ('Name', 'Short Name', 'Last Updated'),
                          tablefmt='simple'))
Example #26
def managed_db(db_label='primary', protected=False):
    """Get an indra_db handle managed as a context manager.

    The session is rolled back and closed even if an error occurs while
    the handle is open.
    """
    db = get_db(db_label, protected)
    try:
        yield db
    finally:
        db.session.rollback()
        db.session.close()
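A minimal usage sketch for the context manager above (assuming managed_db is
decorated with contextlib.contextmanager in its source module, as its
yield/finally structure and docstring suggest):

def count_text_refs():
    # The session is rolled back and closed when the with-block exits,
    # even if the query raises an error.
    with managed_db('primary') as db:
        return db.count(db.TextRef)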
Example #27
def test_has_sources():
    ro = get_db('primary')
    q = HasSources(['reach', 'sparser'])
    res = q.get_statements(ro, limit=5, ev_limit=8)
    assert len(res.results) == 5
    stmts = res.statements()
    res_json = res.json()
    assert 'results' in res_json
    assert len(stmts) == len(res.results)
    assert all(sc[r] > 0 for sc in res.source_counts.values()
               for r in ['reach', 'sparser'])
Example #28
def main(db_name):
    db = get_db(db_name)
    data_json = {}

    data_json.update(get_all_daily_counts(db))

    print('Dumping json...')
    with open(db_name + '_stats.json', 'w') as f:
        json.dump(data_json, f, indent=2)

    return
Example #29
    def dump(self, continuing=False):
        principal_db = get_db(self.db_label)

        logger.info("%s - Generating readonly schema (est. a long time)" %
                    datetime.now())
        principal_db.generate_readonly(allow_continue=continuing)

        logger.info(
            "%s - Beginning dump of database (est. 1 + epsilon hours)" %
            datetime.now())
        principal_db.dump_readonly(self.get_s3_path())
        return
Example #30
def test_evidence_count_is_none():
    ro = get_db('primary')
    query = HasAgent('TP53') - HasOnlySource('medscan')
    res = query.get_statements(ro, limit=2)
    assert isinstance(res, StatementQueryResult)
    stmts = res.statements()
    assert len(stmts) == 2
    ev_list = stmts[0].evidence
    assert len(ev_list) > 10
    assert all(
        len(s.evidence) == res.evidence_totals[s.get_hash()] for s in stmts)
    assert res.returned_evidence == sum(res.evidence_totals.values())