def get_db_agent_mod_stmts(filename, cached=True):
    """Collect raw statements whose agents carry a site-bearing modification.

    Parameters
    ----------
    filename : str
        Path to the pickle file used as a cache. When `cached` is True the
        statements are loaded from this file; otherwise the database is
        queried and the results are dumped to this file.
    cached : bool
        If True (default), load and return the previously pickled statements
        instead of querying the database.

    Returns
    -------
    list
        Statements with at least one agent modification that has both a
        residue and a position.
    """
    if cached:
        # NOTE(review): pickle.load must only be used on trusted files.
        with open(filename, 'rb') as fh:
            site_stmts = pickle.load(fh)
        return site_stmts

    def has_mod_agents(stmt):
        # True if any agent of the statement has a modification with both
        # residue and position set.
        for agent in stmt.agent_list():
            if agent is not None:
                for mc in agent.mods:
                    if has_site_pos(mc):
                        return True
        return False

    def has_site_pos(mc):
        # A ModCondition counts as a "site" only when both position and
        # residue are known.
        return mc.position is not None and mc.residue is not None

    batch_size = 100000
    db = get_primary_db()
    site_stmts = []
    # Iterate over all reading-derived raw statements in batches.
    for idx, db_stmt_batch in db.select_all_batched(
            batch_size, db.RawStatements,
            db.RawStatements.reading_id.isnot(None)):
        stmt_tuples = get_raw_stmts_frm_db_list(db, db_stmt_batch,
                                                fix_refs=False)
        stmts = [s[1] for s in stmt_tuples]
        for stmt in stmts:
            if has_mod_agents(stmt):
                site_stmts.append(stmt)
        print('Finished batch %d' % idx)
        print('Currently have %d site statements' % len(site_stmts))
    # Cache the result for subsequent cached=True calls.
    with open(filename, 'wb') as f:
        pickle.dump(site_stmts, f)
    return site_stmts
def _process_pa_statement_res_wev(db, stmt_iterable, count=1000, fix_refs=True):
    """Build PA Statement objects from (pa, raw) row pairs, with evidence.

    Iterates over `stmt_iterable` in batches of `count` pairs, instantiating
    each distinct pre-assembled statement exactly once, loading the linked
    raw statements, and attaching one Evidence object per raw statement.

    Returns a dict mapping mk_hash -> Statement (evidence populated).
    """
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    stmt_dict = {}
    ev_dict = {}
    raw_stmt_dict = {}
    total_ev = 0
    for batch in batch_iter(stmt_iterable, count):
        # Create each distinct PA statement object once, and record the ids
        # of the raw statements that support it.
        batch_raw_objs = []
        for pa_obj, raw_obj in batch:
            mk_hash = pa_obj.mk_hash
            if mk_hash not in stmt_dict:
                stmt_dict[mk_hash] = get_statement_object(pa_obj)
            ev_dict.setdefault(mk_hash, []).append(raw_obj.id)
            batch_raw_objs.append(raw_obj)
        total_ev += len(batch_raw_objs)
        logger.info("Up to %d pa statements, with %d pieces of "
                    "evidence in all." % (len(stmt_dict), total_ev))

        # Instantiate the raw statements of this batch, keyed by their sid.
        sid_stmt_pairs = get_raw_stmts_frm_db_list(db, batch_raw_objs,
                                                   fix_refs, with_sids=True)
        raw_stmt_dict.update(dict(sid_stmt_pairs))
        logger.info("Processed %d raw statements." % len(sid_stmt_pairs))

    # Attach the evidence: each raw statement carries exactly one Evidence,
    # which is transferred onto its PA statement.
    logger.info("Inserting evidence.")
    for mk_hash, sid_list in ev_dict.items():
        stmt_dict[mk_hash].evidence = [raw_stmt_dict[sid].evidence[0]
                                       for sid in sid_list]
    return stmt_dict
def get_evidence(pa_stmt_list, db=None, fix_refs=True, use_views=True):
    """Fill in the evidence for a list of pre-assembled statements.

    Parameters
    ----------
    pa_stmt_list : list[Statement]
        A list of unique statements, generally drawn from the database
        pa_statement table (via `get_statements`).
    db : DatabaseManager instance or None
        An instance of a database manager. If None, defaults to the
        "primary" database, as defined in the db_config.ini file in
        .config/indra.
    fix_refs : bool
        The paper refs within the evidence objects are not populated in the
        database, and thus must be filled using the relations in the
        database. If True (default), the `pmid` field of each Statement
        Evidence object is set to the correct PMIDs, or None if no PMID is
        available. If False, the `pmid` field defaults to the value
        populated by the reading system.
    use_views : bool
        If True (default), read the raw-to-PA links from the FastRawPaLink
        view; otherwise join the underlying statement tables directly.

    Returns
    -------
    None - modifications are made to the Statements "in-place".
    """
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    if db is None:
        db = get_primary_db()

    # Turn the list into a dict keyed by the (shallow) statement hash.
    stmt_dict = {s.get_hash(shallow=True): s for s in pa_stmt_list}

    if use_views:
        if fix_refs:
            raw_links = db.select_all(
                [db.FastRawPaLink.mk_hash, db.FastRawPaLink.raw_json,
                 db.FastRawPaLink.reading_id],
                db.FastRawPaLink.mk_hash.in_(stmt_dict.keys()))
            rel_refs = ['pmid', 'rid']
            ref_cols = [getattr(db.ReadingRefLink, k) for k in rel_refs]
        else:
            raw_links = db.select_all(
                [db.FastRawPaLink.mk_hash, db.FastRawPaLink.raw_json],
                db.FastRawPaLink.mk_hash.in_(stmt_dict.keys()))

        # rid_ref_dict maps reading id -> pmid for refs already resolved;
        # myst_rid_rs_dict maps reading id -> evidence objects still waiting
        # for their pmid.
        rid_ref_dict = {}
        myst_rid_rs_dict = defaultdict(list)

        def _resolve_mystery_rids():
            # Look up the pmids for all pending reading ids in one bulk
            # query and patch the waiting evidence objects.
            ref_data_list = db.select_all(
                ref_cols,
                db.ReadingRefLink.rid.in_(myst_rid_rs_dict.keys()))
            for pmid, rid in ref_data_list:
                rid_ref_dict[rid] = pmid
                for pending_ev in myst_rid_rs_dict[rid]:
                    pending_ev.pmid = pmid
            myst_rid_rs_dict.clear()

        for info in raw_links:
            if fix_refs:
                mk_hash, raw_json, rid = info
            else:
                mk_hash, raw_json = info
                rid = None
            json_dict = json.loads(raw_json.decode('utf-8'))
            ev_json = json_dict.get('evidence', [])
            assert len(ev_json) == 1, \
                "Raw statements must have one evidence, got %d." % len(ev_json)
            ev = Evidence._from_json(ev_json[0])
            stmt_dict[mk_hash].evidence.append(ev)

            if fix_refs:
                ref_dict = rid_ref_dict.get(rid)
                if ref_dict is None:
                    # Defer the ref lookup; resolve in bulk once 1000
                    # distinct reading ids have accumulated.
                    myst_rid_rs_dict[rid].append(ev)
                    if len(myst_rid_rs_dict) >= 1000:
                        _resolve_mystery_rids()
                else:
                    ev.pmid = rid_ref_dict[rid]

        # BUG FIX: reading ids still pending after the loop (fewer than 1000
        # accumulated) were previously never resolved, leaving their
        # evidence pmids unfixed. Flush them here.
        if fix_refs and myst_rid_rs_dict:
            _resolve_mystery_rids()
    else:
        # Get the data from the database.
        raw_list = db.select_all(
            [db.PAStatements.mk_hash, db.RawStatements],
            db.PAStatements.mk_hash.in_(stmt_dict.keys()),
            db.PAStatements.mk_hash == db.RawUniqueLinks.pa_stmt_mk_hash,
            db.RawUniqueLinks.raw_stmt_id == db.RawStatements.id)

        # Note that this step depends on the ordering being maintained.
        mk_hashes, raw_stmt_objs = zip(*raw_list)
        raw_stmts = get_raw_stmts_frm_db_list(db, raw_stmt_objs, fix_refs,
                                              with_sids=False)
        raw_stmt_mk_pairs = zip(mk_hashes, raw_stmts)

        # Now attach the evidence.
        for mk_hash, raw_stmt in raw_stmt_mk_pairs:
            # Each raw statement can have just one piece of evidence.
            stmt_dict[mk_hash].evidence.append(raw_stmt.evidence[0])

    return
def get_statements(clauses, count=1000, do_stmt_count=False, db=None,
                   preassembled=True, with_support=False, fix_refs=True,
                   with_evidence=True):
    """Select statements according to a given set of clauses.

    Parameters
    ----------
    clauses : list
        list of sqlalchemy WHERE clauses to pass to the filter query.
    count : int
        Number of statements to retrieve and process in each batch.
    do_stmt_count : bool
        Whether or not to perform an initial statement counting step to give
        more meaningful progress messages.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.
    preassembled : bool
        If true, statements will be selected from the table of pre-assembled
        statements. Otherwise, they will be selected from the raw statements.
        Default is True.
    with_support : bool
        Choose whether to populate the supports and supported_by list
        attributes of the Statement objects. Generally results in slower
        queries.
    with_evidence : bool
        Choose whether or not to populate the evidence list attribute of the
        Statements. As with `with_support`, setting this to True will take
        longer.
    fix_refs : bool
        The paper refs within the evidence objects are not populated in the
        database, and thus must be filled using the relations in the
        database. If True (default), the `pmid` field of each Statement
        Evidence object is set to the correct PMIDs, or None if no PMID is
        available. If False, the `pmid` field defaults to the value
        populated by the reading system.

    Returns
    -------
    list of Statements from the database corresponding to the query.
    """
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    cnt = count
    if db is None:
        db = get_primary_db()

    stmts_tblname = 'pa_statements' if preassembled else 'raw_statements'

    if not preassembled:
        stmts = []
        q = db.filter_query(stmts_tblname, *clauses)
        if do_stmt_count:
            logger.info("Counting statements...")
            num_stmts = q.count()
            logger.info("Total of %d statements" % num_stmts)
        db_stmts = q.yield_per(cnt)
        for subset in batch_iter(db_stmts, cnt):
            stmts.extend(get_raw_stmts_frm_db_list(db, subset, with_sids=False,
                                                   fix_refs=fix_refs))
            if do_stmt_count:
                logger.info("%d of %d statements" % (len(stmts), num_stmts))
            else:
                logger.info("%d statements" % len(stmts))
    else:
        # BUG FIX: this message was previously logged twice in a row when
        # with_evidence was set; log it once here.
        logger.info("Getting preassembled statements.")
        if with_evidence:
            # Get pairs of pa statements with their linked raw statements.
            # BUG FIX: build a new list rather than `clauses += [...]`, which
            # extended the caller's list in place.
            clauses = clauses + [
                db.PAStatements.mk_hash == db.RawUniqueLinks.pa_stmt_mk_hash,
                db.RawStatements.id == db.RawUniqueLinks.raw_stmt_id
            ]
            pa_raw_stmt_pairs = \
                db.select_all([db.PAStatements, db.RawStatements],
                              *clauses, yield_per=cnt)
            stmt_dict = _process_pa_statement_res_wev(db, pa_raw_stmt_pairs,
                                                      count=cnt,
                                                      fix_refs=fix_refs)
        else:
            # Get just pa statements without their supporting raw
            # statement(s).
            pa_stmts = db.select_all(db.PAStatements, *clauses, yield_per=cnt)
            stmt_dict = _process_pa_statement_res_nev(pa_stmts, count=cnt)

        # Populate the supports/supported by fields.
        if with_support:
            get_support(stmt_dict, db=db)

        stmts = list(stmt_dict.values())
        logger.info("In all, there are %d pa statements." % len(stmts))

    return stmts