Example 1
def get_db_readings(id_dict,
                    readers,
                    force_fulltext=False,
                    batch_size=1000,
                    db=None):
    """Get readings from the database."""
    if db is None:
        db = get_primary_db()

    # Get any previous readings. Note that we do this BEFORE posting the new
    # readings. Otherwise we would have duplicates.
    previous_readings_query = get_readings_query(id_dict,
                                                 readers,
                                                 db=db,
                                                 force_fulltext=force_fulltext)
    if previous_readings_query is not None:
        prev_readings = [
            ReadingData(
                r.text_content_id, r.reader, r.reader_version, r.format,
                json.loads(
                    zlib.decompress(r.bytes,
                                    16 + zlib.MAX_WBITS).decode('utf8')), r.id)
            for r in previous_readings_query.yield_per(batch_size)
        ]
    else:
        prev_readings = []
    return prev_readings
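The `16 + zlib.MAX_WBITS` argument tells zlib to expect a gzip wrapper around the compressed JSON. A minimal, self-contained sketch of that round trip (the payload here is illustrative, not from the database):

import json
import zlib

# Compress a JSON payload inside a gzip wrapper (wbits=16+MAX_WBITS), the
# format the readings table is assumed to store.
payload = json.dumps({'sentences': ['A phosphorylates B.']}).encode('utf8')
compressor = zlib.compressobj(9, zlib.DEFLATED, 16 + zlib.MAX_WBITS)
compressed = compressor.compress(payload) + compressor.flush()

# Decode exactly the way get_db_readings does above.
decoded = json.loads(
    zlib.decompress(compressed, 16 + zlib.MAX_WBITS).decode('utf8'))
assert decoded == {'sentences': ['A phosphorylates B.']}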
Example 2
def produce_statements(output_list,
                       enrich=True,
                       no_upload=False,
                       pickle_file=None,
                       n_proc=1,
                       db=None):
    """Convert the reader output into a list of StatementData instances."""
    if db is None:
        db = get_primary_db()

    if enrich:
        _enrich_reading_data(output_list, db=db)

    stmt_data_list = make_statements(output_list, n_proc)

    if not no_upload:
        try:
            upload_statements(stmt_data_list, db=db)
        except Exception as e:
            logger.exception(e)
            if pickle_file is None:
                pickle_file = ("failure_stmt_dump_%s.pkl" %
                               datetime.now().strftime('%Y%m%d_%H%M%S'))
            logger.error(
                "Could not upload statements. Results pickled in: %s." %
                pickle_file)
    if pickle_file is not None:
        with open(pickle_file, 'wb') as f:
            pickle.dump([sd.statement for sd in stmt_data_list], f)
        print("Statements pickled in %s." % pickle_file)

    return stmt_data_list
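The try/except around `upload_statements` implements a salvage pattern: if the database write fails, the results are pickled to a timestamped file so the work is not lost. A generic sketch of the same pattern (the names here are illustrative, not part of the module):

import pickle
from datetime import datetime

def upload_or_dump(items, upload_func, pickle_file=None):
    """Try to upload items; on failure, pickle them so no work is lost."""
    try:
        upload_func(items)
    except Exception:
        if pickle_file is None:
            pickle_file = ("failure_dump_%s.pkl"
                           % datetime.now().strftime('%Y%m%d_%H%M%S'))
        with open(pickle_file, 'wb') as f:
            pickle.dump(items, f)
        print("Upload failed; results pickled in %s." % pickle_file)
    return pickle_file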
Example 3
def upload_readings(output_list, db=None):
    """Put the reading output on the database."""
    if db is None:
        db = get_primary_db()

    # Create the list of records to be copied, ensuring no uniqueness conflicts
    r_list = db.select_all(
        db.Readings,
        db.Readings.text_content_id.in_([rd.tcid for rd in output_list]))
    existing_tcid_set = set([r.text_content_id for r in r_list])
    upload_list = []
    for reading_data in output_list:
        # First check if this tcid is even in the set of existing tcids in the
        # readings table.
        if reading_data.tcid in existing_tcid_set:
            r_tcid_list = [
                r for r in r_list if r.text_content_id == reading_data.tcid
            ]
            # Now check for any exact matches:
            if any([reading_data.matches(r) for r in r_tcid_list]):
                continue

        # If there were no conflicts, we can add this to the copy list.
        upload_list.append(reading_data.make_tuple())

    # Copy into the database.
    logger.info("Adding %d/%d reading entries to the database." %
                (len(upload_list), len(output_list)))
    db.copy('readings', upload_list, ReadingData.get_cols())
    return
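The conflict check above rescans `r_list` for every new reading; the same logic can be expressed with a single pass that groups existing rows by tcid. A sketch, assuming the same `matches` and `make_tuple` interface on the reading data objects:

from collections import defaultdict

def dedup_for_copy(output_list, existing_rows):
    """Keep only the readings with no exact match among existing rows."""
    by_tcid = defaultdict(list)
    for r in existing_rows:
        by_tcid[r.text_content_id].append(r)
    return [rd.make_tuple() for rd in output_list
            if not any(rd.matches(r) for r in by_tcid.get(rd.tcid, []))]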
Example 4
def get_priority_tcids(id_dict, priorities, always_add=None, db=None):
    """For all ids, besides tcids, choose best content available.

    This function will convert all ids to tcids.
    """
    if db is None:
        db = get_primary_db()

    def is_better(new, old):
        if new in priorities and old in priorities:
            return priorities.index(new) < priorities.index(old)
        return False

    logger.debug("Getting content prioritized by %s." % str(priorities))
    tcids = set(id_dict.pop('tcid', []))
    clauses = get_clauses(id_dict, db)
    tcid_source = set()
    for clause in clauses:
        q = (db.session.query(db.TextRef.id, db.TextContent.id,
                              db.TextContent.source).filter(
                                  db.TextContent.text_ref_id == db.TextRef.id,
                                  clause))
        id_set = set(q.all())
        logger.debug("Got %d more ids." % len(id_set))
        tcid_source |= id_set
    logger.debug("Got %d id's total." % len(tcid_source))
    tr_best = {}
    for trid, tcid, source in tcid_source:
        if trid not in tr_best or is_better(source, tr_best[trid][0]):
            tr_best[trid] = (source, tcid)
        if always_add is not None and source in always_add:
            tcids.add(tcid)
    tcids |= {tcid for _, tcid in tr_best.values()}
    return tcids
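The `is_better` helper ranks sources by their position in the `priorities` list (earlier is better), and a source absent from the list never wins. Concretely, with the priorities used in `produce_readings` below:

priorities = ['pmc_oa', 'manuscripts', 'elsevier']

def is_better(new, old):
    if new in priorities and old in priorities:
        return priorities.index(new) < priorities.index(old)
    return False

assert is_better('pmc_oa', 'elsevier')       # earlier in the list wins
assert not is_better('elsevier', 'pmc_oa')
assert not is_better('pubmed', 'elsevier')   # unlisted sources never win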
Example 5
def get_id_dict(id_str_list):
    """Parse the list of id string into a dict."""
    id_types = get_primary_db().TextRef.__table__.columns.keys()
    id_types.remove('id')
    id_types += ['trid', 'tcid']
    id_dict = {id_type: [] for id_type in id_types}
    for id_entry in id_str_list:
        id_type, id_val = _convert_id_entry(id_entry, id_types)
        if id_type in ['trid', 'tcid']:
            id_dict[id_type].append(int(id_val))
        else:
            id_dict[id_type].append(id_val)
    return id_dict
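`_convert_id_entry` is not shown in this module; assuming it splits entries of the form '<id_type>:<value>', a call would look roughly like:

# Hypothetical input format; _convert_id_entry is defined elsewhere.
id_dict = get_id_dict(['pmid:12345', 'tcid:42'])
# -> {'pmid': ['12345'], ..., 'trid': [], 'tcid': [42]}
# 'trid' and 'tcid' values are cast to int; all other ids stay strings.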
Example 6
def upload_statements(stmt_data_list, db=None):
    """Upload the statements to the database."""
    if db is None:
        db = get_primary_db()

    logger.info("Uploading %d statements to the database." %
                len(stmt_data_list))
    db.copy('statements', [s.make_tuple() for s in stmt_data_list],
            StatementData.get_cols())

    logger.info("Uploading agents to the database.")
    reading_id_set = set([sd.reading_id for sd in stmt_data_list])
    if len(reading_id_set):
        insert_agents(db, [sd.statement for sd in stmt_data_list],
                      db.Statements.reader_ref.in_(reading_id_set))
    return
Example 7
def upload_statements(stmt_data_list, db=None):
    """Upload the statements to the database."""
    if db is None:
        db = get_primary_db()

    logger.info("Uploading %d statements to the database." %
                len(stmt_data_list))
    db.copy('raw_statements', [s.make_tuple() for s in stmt_data_list],
            StatementData.get_cols())

    logger.info("Uploading agents to the database.")
    reading_id_set = set([sd.reading_id for sd in stmt_data_list])
    if len(reading_id_set):
        db_stmts = (db.select_one(db.RawStatements,
                                  db.RawStatements.uuid.like(s.uuid))
                    for s in stmt_data_list)
        insert_agents(db, 'raw', db_stmts, verbose=True)
    return
Example 8
def _enrich_reading_data(reading_data_iter, db=None):
    """Get db ids for all ReadingData objects that correspond to a db ref.

    Note that the objects are modified IN PLACE, so nothing is returned, and if
    a copy of the objects is passed as an argument, this function will have no
    effect. This does nothing if the readings are not in the database.
    """
    logger.debug("Enriching the reading data with database refs.")
    if db is None:
        db = get_primary_db()
    possible_matches = db.select_all(
        'readings',
        db.Readings.text_content_id.in_(
            [rd.tcid for rd in reading_data_iter if rd.reading_id is None]))
    for rdata in reading_data_iter:
        for reading in possible_matches:
            if rdata.matches(reading):
                rdata.reading_id = reading.id
                break
    return
Example 9
def upload_statements(stmt_data_list, db=None):
    """Upload the statements to the database."""
    if db is None:
        db = get_primary_db()

    logger.info("Uploading %d statements to the database." %
                len(stmt_data_list))
    db.copy('raw_statements', [s.make_tuple() for s in stmt_data_list],
            StatementData.get_cols())

    logger.info("Uploading agents to the database.")
    reading_id_set = set([sd.reading_id for sd in stmt_data_list])
    if len(reading_id_set):
        uuid_set = {s.statement.uuid for s in stmt_data_list}
        insert_agents(db,
                      'raw',
                      db.RawStatements.uuid.in_(uuid_set),
                      verbose=True,
                      override_default_query=True)
    return
Example 10
def get_db_readings(id_dict, readers, force_fulltext=False, batch_size=1000,
                    db=None):
    """Get readings from the database."""
    if db is None:
        db = get_primary_db()

    # Get any previous readings. Note that we do this BEFORE posting the new
    # readings. Otherwise we would have duplicates.
    previous_readings_query = get_readings_query(
        id_dict,
        readers,
        db=db,
        force_fulltext=force_fulltext
        )
    if previous_readings_query is not None:
        prev_readings = [
            ReadingData.from_db_reading(r)
            for r in previous_readings_query.yield_per(batch_size)
            ]
    else:
        prev_readings = []
    return prev_readings
Example 11
if __name__ == '__main__':
    # Load the statements.
    if args.source == 'from-pickle':
        logger.info("Getting statements from pickle file.")
        results = load_file(args.file_path)
        all_stmts = [stmt for reader_stmts in results.values()
                     for paper_stmts in reader_stmts
                     for stmt in paper_stmts]
    if args.source == 'from-db':
        logger.info("Getting statements from the database.")
        from indra.db import get_primary_db
        from indra.db.util import get_raw_stmts_frm_db_list, \
            _get_statement_object, _set_evidence_text_ref

        db = get_primary_db()
        clauses = []
        if args.indra_version:
            clauses.append(db.RawStatements.indra_version == args.indra_version)
        if args.date_range:
            min_date_str, max_date_str = args.date_range.split(':')
            if min_date_str:
                min_date = datetime.strptime(min_date_str, '%Y%m%d%H%M%S')
                clauses.add(db.RawStatements.create_date > min_date)
            if max_date_str:
                max_date = datetime.strptime(max_date_str, '%Y%m%d%H%M%S')
                clauses.add(db.RawStatements.create_date < max_date)

        all_stmts, results = load_stmts_from_db(clauses, db)

    report_stmt_counts(results['reach'], plot_prefix='raw_reach')
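For reference, the date-range argument is parsed as '<min>:<max>' timestamps in the compact format used above:

from datetime import datetime

min_date_str, max_date_str = '20180101000000:20180701120000'.split(':')
min_date = datetime.strptime(min_date_str, '%Y%m%d%H%M%S')
# -> datetime.datetime(2018, 1, 1, 0, 0)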
Example 12
def produce_readings(id_dict,
                     reader_list,
                     verbose=False,
                     read_mode='unread-all',
                     force_fulltext=False,
                     batch_size=1000,
                     no_upload=False,
                     pickle_file=None,
                     db=None,
                     log_readers=True,
                     prioritize=False):
    """Produce the reading output for the given ids, and upload them to db.

    This function will also retrieve pre-existing readings from the database,
    thus improving performance.

    Parameters
    ----------
    id_dict : dict {<id_type>:[<id value>, ...]}
        A dict of lists of the id's to be read, keyed by id_type.
    reader_list : list [Reader]
        A list of Reader descendents to be used in reading.
    verbose : bool
        Optional, default False - If True, log and print the output of the
        command line reader utilities; if False, don't.
    read_mode : str : 'all', 'unread-all', 'unread-unread', or 'none'
        Optional, default 'unread-all' - If 'all', read everything (generally
        slow); if 'unread-all', only read things that were unread, but use the
        cache of old readings to get everything (as fast as you can be while
        still getting everything); if 'unread-unread', just like 'unread-all',
        but only return the unread content; if 'none', don't read, and only get
        existing readings.
    force_fulltext : bool
        Optional, default False - If True, only read fulltext article, ignoring
        abstracts.
    batch_size : int
        Optional, default 1000 - The number of text content entries to be
        yielded by the database at a given time.
    no_upload : bool
        Optional, default False - If True, do not upload content to the
        database.
    pickle_file : str or None
        Optional, default None - otherwise the path to a file in which the
        reading data will be saved.
    db : indra.db.DatabaseManager instance
        Optional, default is None, in which case the primary database provided
        by the `get_primary_db` function is used. Used to interface with a
        different database.
    log_readers : bool
        Optional, default True - If True, the readers are run with their logs
        retained.
    prioritize : bool
        Optional, default False - If True, use `get_priority_tcids` to select
        only the best content available for each text ref before reading.

    Returns
    -------
    outputs : list [ReadingData]
        A list of the outputs of the readings in the form of ReadingData
        instances.
    """
    logger.debug("Producing readings in %s mode." % read_mode)
    if db is None:
        db = get_primary_db()

    # Sort out our priorities
    if prioritize:
        logger.debug("Prioritizing...")
        tcids = get_priority_tcids(id_dict,
                                   ['pmc_oa', 'manuscripts', 'elsevier'],
                                   always_add=['pubmed'],
                                   db=db)
        id_dict = {'tcid': list(tcids)}

    prev_readings = []
    skip_reader_tcid_dict = None
    if read_mode not in ['all', 'unread-unread']:
        prev_readings = get_db_readings(id_dict,
                                        reader_list,
                                        force_fulltext,
                                        batch_size,
                                        db=db)
        skip_reader_tcid_dict = {r.name: [] for r in reader_list}
        logger.info("Found %d pre-existing readings." % len(prev_readings))
        if read_mode != 'none':
            for rd in prev_readings:
                skip_reader_tcid_dict[rd.reader].append(rd.tcid)
    outputs = []
    if read_mode != 'none':
        outputs = make_db_readings(id_dict,
                                   reader_list,
                                   verbose=verbose,
                                   skip_dict=skip_reader_tcid_dict,
                                   db=db,
                                   force_fulltext=force_fulltext,
                                   force_read=(read_mode == 'all'),
                                   batch_size=batch_size,
                                   log=log_readers)
        logger.info("Made %d new readings." % len(outputs))

    if not no_upload:
        try:
            upload_readings(outputs, db=db)
        except Exception as e:
            logger.exception(e)
            if pickle_file is None:
                pickle_file = ("failure_reading_dump_%s.pkl" %
                               datetime.now().strftime('%Y%m%d_%H%M%S'))
            logger.error(
                "Cound not upload readings. Results are pickled in: " +
                pickle_file)

    outputs += prev_readings

    if pickle_file is not None:
        with open(pickle_file, 'wb') as f:
            pickle.dump([output.make_tuple() for output in outputs], f)
        print("Reading outputs stored in %s." % pickle_file)

    return outputs
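A typical invocation might look like the sketch below; `get_readers` is illustrative and not defined in this module:

# Hypothetical usage: read two PMIDs with REACH, reusing cached readings
# ('unread-all'), and keep a pickled backup of the outputs.
id_dict = get_id_dict(['pmid:12345', 'pmid:67890'])
readings = produce_readings(id_dict, get_readers('reach'),
                            read_mode='unread-all',
                            pickle_file='readings_backup.pkl')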
Example 13
def make_db_readings(id_dict,
                     readers,
                     batch_size=1000,
                     force_fulltext=False,
                     force_read=False,
                     skip_dict=None,
                     db=None,
                     **kwargs):
    """Read contents retrieved from the database.

    The content will be retrieved in batches, whose size is given by the
    `batch_size` argument. This prevents the system RAM from being overloaded.

    Parameters
    ----------
    id_dict : dict {<id_type>:[<id value>, ...]}
        A dict of lists of the id's to be read, keyed by id_type.
    readers : list of reader objects
        A list of the readers that will be used, for example a list containing
        an instance of the REACH reader.
    batch_size : int
        The number of content entries read for each batch. Default 1000.
    force_fulltext : bool
        If True, only get fulltext content from the database. Default False.
    force_read : bool
        If True, read even if text_content id is found in skip_dict.
    skip_dict : dict {<reader> : list [int]}
        A dict containing text content id's to be skipped.
    db : indra.db.DatabaseManager instance
        A handle to a database. Default None; if None, a handle to the primary
        database (see indra.db) is retrieved.

    Other keyword arguments are passed to the `read` methods of the readers.

    Returns
    -------
    outputs : list of ReadingData instances
        The results of the readings with relevant metadata.
    """
    if db is None:
        db = get_primary_db()

    # Get the iterator.
    logger.debug("Getting iterator.")
    tc_read_q = get_content_query(id_dict,
                                  readers,
                                  db=db,
                                  force_fulltext=force_fulltext,
                                  force_read=force_read)
    logger.debug("Begginning to iterate.")
    batch_list_dict = {r.name: [] for r in readers}
    new_outputs = []
    if tc_read_q is not None:
        for text_content in tc_read_q.yield_per(batch_size):
            # The get_content_query function returns a query whose yield_per
            # iterator yields results in batches, so as not to overwhelm RAM.
            # We need to read in batches for much the same reason.
            for r in readers:
                if not force_read:
                    if skip_dict is not None:
                        if text_content.id in skip_dict[r.name]:
                            continue
                    else:
                        # Try to get a previous reading from this reader.
                        reading = db.select_one(
                            db.Readings,
                            db.Readings.text_content_id == text_content.id,
                            r.matches_clause(db))
                        if reading is not None:
                            continue
                processed_content = process_content(text_content)
                if processed_content is not None:
                    batch_list_dict[r.name].append(processed_content)

                if len(batch_list_dict[r.name]) == batch_size:
                    # TODO: this is a bit cludgy...maybe do this better?
                    # Perhaps refactor read_content.
                    logger.debug("Reading batch of files for %s." % r.name)
                    results = r.read(batch_list_dict[r.name], **kwargs)
                    if results is not None:
                        new_outputs += results
                    batch_list_dict[r.name] = []
        logger.debug("Finished iteration.")
        # Pick up any stragglers.
        for r in readers:
            if len(batch_list_dict[r.name]) > 0:
                logger.debug("Reading remaining files for %s." % r.name)
                results = r.read(batch_list_dict[r.name], **kwargs)
                if results is not None:
                    new_outputs += results
    return new_outputs
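The accumulate-and-flush batching in the loop above, including the final pass for stragglers, can be isolated into a small generator. A sketch, not part of the module:

def read_in_batches(reader, content_iter, batch_size, **kwargs):
    """Feed content to reader.read in fixed-size batches, yielding results."""
    batch = []
    for content in content_iter:
        batch.append(content)
        if len(batch) == batch_size:
            results = reader.read(batch, **kwargs)
            if results is not None:
                yield from results
            batch = []
    if batch:  # pick up any stragglers
        results = reader.read(batch, **kwargs)
        if results is not None:
            yield from results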
Example 14
def get_readings_query(ids, readers, db=None, force_fulltext=False):
    """Create a query to access all the relevant existing readings.

    Note that if `ids` is not 'all' and is a dict with no ids in it, this
    function returns None.

    Parameters
    ----------
    ids : 'all' or dict {<id_type> : [str/int]}
        If 'all', then all possible readings in the database matching the given
        readers and other conditions will be returned. Otherwise, only those
        that correspond to one of the ids in the ids dict will be included. If
        the ids dict has no ids in it, None is returned.
    readers : list [Reader child instances]
        A list of the readers whose names and versions you wish to match in the
        readings queried from the database.
    db : indra.db.DatabaseManager instance
        Optional, default None, in which case the primary database is used. If
        specified, the alternative database will be used. This function should
        not alter the database.
    force_fulltext : bool
        Optional, default False - If True, only readings corresponding to
        fulltext content will be read, as opposed to including readings created
        from abstracts.

    Returns
    -------
    readings_query : sql query instance or None
        Returns a query that can be used to access the specified content, or
        else None if no content was specified.
    """
    if db is None:
        db = get_primary_db()
    clauses = [
        # Bind conditions on readings to conditions on content.
        db.Readings.text_content_id == db.TextContent.id,

        # Bind text content to text refs
        db.TextContent.text_ref_id == db.TextRef.id,

        # Check if at least one of the readers has read the content
        sql.or_(*[reader.matches_clause(db) for reader in readers])
    ]
    if force_fulltext:
        clauses.append(db.TextContent.text_type == texttypes.FULLTEXT)

    if ids == 'all' or any([id_list for id_list in ids.values()]):
        if ids != 'all':
            sub_clauses = get_clauses(ids, db)
            if len(sub_clauses) > 1:
                clauses.append(sql.or_(*sub_clauses))
            else:
                clauses.append(*sub_clauses)

        # Apply all the clauses, including those generated from the list of
        # ids; these include a text-ref/text-content binding to connect with
        # the id data.
        readings_query = db.filter_query(db.Readings, *clauses)
    else:
        return None

    return readings_query.distinct()
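The id-derived sub-clauses are combined into a single disjunction with `sql.or_`, so content matches if any one of its ids does. Schematically (the column names are assumptions about the TextRef table):

from sqlalchemy import sql

# Schematic only: db is a DatabaseManager handle as above, and pmid/pmcid
# are assumed TextRef columns. Combine per-id-type conditions into one OR,
# as get_readings_query does with the output of get_clauses.
sub_clauses = [db.TextRef.pmid.in_(['12345', '67890']),
               db.TextRef.pmcid.in_(['PMC1234'])]
combined = sql.or_(*sub_clauses)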
Example 15
def get_content_query(ids,
                      readers,
                      db=None,
                      force_fulltext=False,
                      force_read=False,
                      debug=False,
                      print_summary=False):
    """Construct a query to access all the content that will be read.

    If ids is not 'all', and does not contain any ids, None is returned.

    Parameters
    ----------
    ids : 'all' or dict {<id type> : [str/int]}
        If 'all', then all the content will be included in the query. Otherwise
        the content will be constrained to that corresponding to the ids given,
        which are matched using text refs.
    readers : list [Reader child instances]
        A list of the reader objects, which contain the required metadata (name
        and version of the reader) used to find content that needs to be read.
    db : indra.db.DatabaseManager instance
        Optional, default None, in which case the primary database is used. If
        specified, the alternative database will be used. This function should
        not alter the database.
    force_fulltext : bool
        Optional, default False - If True, only fulltext content will be read,
        as opposed to including abstracts.
    force_read : bool
        Optional, default False - If True, all content will be returned,
        whether it has been read or not.

    Returns
    -------
    tc_tbr_query : sqlalchemy query object or None
        The query of the text content to be read (tc_tbr). If `ids` is not
        'all' and contains no ids, None is returned.
    """
    if debug:
        logger.setLevel(logging.DEBUG)
    if db is None:
        db = get_primary_db()
    logger.debug("Got db handle.")

    # These allow conditions on different tables to equal conditions on the
    # dependent tables.
    tc_tr_binding = db.TextContent.text_ref_id == db.TextRef.id
    rd_tc_binding = db.Readings.text_content_id == db.TextContent.id

    # Begin the list of clauses with the binding between text content and
    # text refs.
    clauses = [tc_tr_binding]

    # Add a fulltext requirement, if applicable.
    if force_fulltext:
        clauses.append(db.TextContent.text_type == texttypes.FULLTEXT)

    # Proceed only if we are actually getting something; otherwise return None.
    if ids == 'all' or any([len(id_list) > 0 for id_list in ids.values()]):
        if ids != 'all':
            sub_clauses = get_clauses(ids, db)
            if len(sub_clauses) > 1:
                clauses.append(sql.or_(*sub_clauses))
            else:
                clauses.append(*sub_clauses)

        # Get the text content query object
        tc_query = db.filter_query(db.TextContent, *clauses).distinct()

        if not force_read:
            logger.debug("Getting content to be read.")
            # Each sub query is a set of content that has been read by one of
            # the readers.
            tc_q_subs = [
                tc_query.filter(rd_tc_binding, r.matches_clause(db))
                for r in readers
            ]
            tc_tbr_query = tc_query.except_(sql.intersect(*tc_q_subs))
        else:
            logger.debug('All content will be read (force_read).')
            tc_tbr_query = tc_query

        if print_summary:
            try:
                logger.debug("Going to try to make a nice summary...")
                logger.info(get_text_content_summary_string(tc_tbr_query, db))
            except Exception:
                logger.debug("Could not print summary of results.")
    else:
        logger.debug("No ids in id_dict, so no query formed.")
        return None

    return tc_tbr_query.distinct()
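The `except_`/`intersect` combination selects content that at least one reader still has not read: content read by every reader is the intersection of the per-reader subqueries, and removing it from the full content set leaves the work to do. A pure-Python analogue of that set logic:

all_content = {1, 2, 3, 4, 5}
read_by = [{1, 2, 3}, {2, 3, 4}]          # content ids read by each reader
fully_read = set.intersection(*read_by)   # read by ALL readers: {2, 3}
to_read = all_content - fully_read        # still needs reading: {1, 4, 5}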
Example 16
def dump_statement_batch(stmts, fname):
    """Pickle a batch of statements to the given file."""
    with open(fname, 'wb') as fh:
        pickle.dump(stmts, fh)


def load_statements():
    fnames = glob.glob(data_path + 'pa_stmts_*.pkl')
    all_stmts = []
    for fname in fnames:
        print('Loading %s' % fname)
        with open(fname, 'rb') as fh:
            stmts = pickle.load(fh)
            all_stmts += stmts
    return all_stmts


def assemble_cx(statements):
    cxa = CxAssembler(statements)
    model = cxa.make_model()
    cxa.save_model('model.cx')

if __name__ == '__main__':
    db = get_primary_db()
    res = db.filter_query(db.PAStatements).yield_per(20000)
    stmts = []
    for idx, r in enumerate(res):
        stmt = make_stmts_from_db_list([r])
        stmts.append(stmt[0])
        if idx > 0 and idx % 20000 == 0:
            dump_statement_batch(stmts, data_path + 'pa_stmts_%d.pkl' % idx)
            stmts = []
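The dump loop writes `pa_stmts_<idx>.pkl` files under `data_path`, which `load_statements` later globs back up; from there the CX export is one call:

stmts = load_statements()
assemble_cx(stmts)  # writes model.cx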