def process_args(self, args_json):
    """Convert JSON-serialized arguments into their runtime values in place.

    Known keys are replaced with the objects they name (statement classes,
    pipeline functions, curation tuples, belief scorers, ontologies, or
    modification-tuple mappings); all other keys are left untouched. The
    same dict is returned for convenience.
    """
    for key in args_json:
        val = args_json[key]
        if key == 'stmt_type':
            # A statement type is given by name; resolve to the class.
            args_json[key] = get_statement_by_name(val)
        elif key in ('matches_fun', 'refinement_fun'):
            # Pipeline functions are registered by name.
            args_json[key] = pipeline_functions[val]
        elif key == 'curations':
            # Re-pack each curation dict as a lightweight named tuple.
            Curation = namedtuple('Curation',
                                  ['pa_hash', 'source_hash', 'tag'])
            args_json[key] = [
                Curation(cur['pa_hash'], cur['source_hash'], cur['tag'])
                for cur in val]
        elif key == 'belief_scorer':
            # Only the world-modelers ('wm') scorer is currently supported;
            # anything else falls back to the default (None).
            args_json[key] = get_eidos_scorer() if val == 'wm' else None
        elif key == 'ontology':
            # 'wm' selects the world ontology, anything else the bio one.
            args_json[key] = world_ontology if val == 'wm' else bio_ontology
        elif key in ('whitelist', 'mutations'):
            # JSON lists of modifications become hashable tuples.
            args_json[key] = {
                gene: [tuple(mod) for mod in mods]
                for gene, mods in val.items()}
    return args_json
def make_stmt_from_sort_key(key, verb, agents=None):
    """Make a Statement from the sort key.

    Specifically, the sort key used by `group_and_sort_statements`.

    Parameters
    ----------
    key : tuple
        A sort key whose second element (`key[1]`) holds the agent name
        inputs for the statement.
    verb : str
        The name of the Statement class to instantiate.
    agents : list or None
        Optional list of pre-made Agents. NOTE: if a caller passes a list,
        it is extended in place with the agents created here (a visible
        side effect).
    """
    def make_agent(name):
        # The sort key encodes a missing agent as the string 'None'.
        if name == 'None' or name is None:
            return None
        return Agent(name)

    StmtClass = get_statement_by_name(verb)
    inps = list(key[1])
    if agents is None:
        agents = []
    if verb == 'Complex':
        # Complex takes one list of member agents; pass a copy so the
        # statement does not alias the (possibly caller-owned) list.
        agents.extend([make_agent(name) for name in inps])
        stmt = StmtClass(agents[:])
    elif verb == 'Conversion':
        # Conversion: subject, then from- and to-object lists.
        names_from = [make_agent(name) for name in inps[1]]
        names_to = [make_agent(name) for name in inps[2]]
        agents.extend(names_from + names_to)
        stmt = StmtClass(make_agent(inps[0]), names_from, names_to)
    elif verb == 'ActiveForm' or verb == 'HasActivity':
        # Single agent plus activity type and polarity flags.
        agents.extend([make_agent(inps[0])])
        stmt = StmtClass(agents[0], inps[1], inps[2])
    elif verb == 'Influence':
        # Influence wraps each agent in an Event.
        agents.extend([make_agent(inp) for inp in inps[:2]])
        stmt = Influence(*[Event(ag) for ag in agents])
    elif verb == 'Association':
        # Association takes a list of Events.
        agents.extend([make_agent(inp) for inp in inps])
        stmt = StmtClass([Event(ag) for ag in agents])
    else:
        # Default: positional agents in sort-key order.
        agents.extend([make_agent(name) for name in inps])
        stmt = StmtClass(*agents)
    return stmt
def _english_from_agents_type(agA_name, agB_name, stmt_type):
    """Assemble an English sentence for a two-agent statement type."""
    stmt_class = get_statement_by_name(stmt_type)
    first_agent = Agent(agA_name)
    second_agent = Agent(agB_name)
    # Complex takes a single member list rather than positional agents.
    if stmt_type.lower() == 'complex':
        stmt = stmt_class([first_agent, second_agent])
    else:
        stmt = stmt_class(first_agent, second_agent)
    return EnglishAssembler([stmt]).make_model()
def _make_query(query_dict, use_grouding_service=True):
    """Build a PathProperty query from a form-style query dict.

    NOTE(review): the parameter name `use_grouding_service` (sic, missing
    an 'n') is kept as-is because it is part of the public interface.
    """
    stmt_class = get_statement_by_name(query_dict['typeSelection'])
    # Ground both agents from their free-text selections.
    subj = get_agent_from_text(query_dict['subjectSelection'],
                               use_grouding_service)
    obj = get_agent_from_text(query_dict['objectSelection'],
                              use_grouding_service)
    path_stmt = stmt_class(subj, obj)
    return PathProperty(path_stmt=path_stmt)
def _run(self, subject=None, object=None, agents=None, stmt_type=None,
         use_exact_type=False, persist=True, strict_stop=False,
         **api_params):
    """Reset the paging state and run the queries in a background thread.

    Blocks for up to `api_params['timeout']` seconds (forever if no
    timeout is given); with `strict_stop=False` the timeout only bounds
    how long this call blocks, while the thread keeps loading results.
    """
    # Reset per-run paging/bookkeeping state.
    self.__started = False
    self.__done_dict = defaultdict(lambda: False)
    self.__page_dict = defaultdict(lambda: 0)
    self.__th = None
    self.__quota = api_params['max_stmts']

    # Make sure we got at least SOME agents (the remote API will error if
    # we proceed with no arguments).
    if subject is None and object is None and not agents:
        raise ValueError("At least one agent must be specified, or else "
                         "the scope will be too large.")

    # Make timeouts apply differently in this case: when strict_stop is
    # False, the timeout is popped so it is NOT forwarded to the API (it
    # only limits how long we join the thread below); when True, it stays
    # in api_params so the queries themselves also observe it.
    if not strict_stop:
        timeout = api_params.pop('timeout', None)
    else:
        timeout = api_params.get('timeout', None)

    # Formulate inputs for the agents..
    key_val_list = [('subject', subject), ('object', object)]
    params = {param_key: param_val for param_key, param_val in key_val_list
              if param_val is not None}
    params.update(api_params)
    agent_strs = [] if agents is None else ['agent%d=%s' % (i, ag)
                                            for i, ag in enumerate(agents)]

    # Handle the type(s): unless an exact type was requested, also query
    # all subclasses of the given statement type.
    stmt_types = [stmt_type] if stmt_type else []
    if stmt_type is not None and not use_exact_type:
        stmt_class = get_statement_by_name(stmt_type)
        descendant_classes = get_all_descendants(stmt_class)
        stmt_types += [cls.__name__ for cls in descendant_classes]

    # Handle the content if we were limited.
    args = [agent_strs, stmt_types, params, persist]
    logger.debug("The remainder of the query will be performed in a "
                 "thread...")
    self.__th = Thread(target=self._run_queries, args=args)
    self.__th.start()

    if timeout is None:
        # Block until the worker thread has finished all queries.
        logger.debug("Waiting for thread to complete...")
        self.__th.join()
    elif timeout:  # is not 0
        # Block at most `timeout` seconds; the thread keeps running after.
        logger.debug("Waiting at most %d seconds for thread to complete..."
                     % timeout)
        self.__th.join(timeout)
    return
def __init__(self, subject=None, object=None, agents=None, stmt_type=None,
             use_exact_type=False, persist=True, timeout=None, ev_limit=10,
             best_first=True, tries=2, max_stmts=None):
    """Validate the inputs and start loading statements in a thread.

    Blocks until the thread completes, or for up to `timeout` seconds if
    a timeout is given (results keep loading in the background after).
    """
    # Result containers and paging/bookkeeping state.
    self.statements = []
    self.statements_sample = None
    self.__statement_jsons = {}
    self.__done_dict = defaultdict(lambda: False)
    self.__evidence_counts = {}
    self.__started = False
    self.__page_dict = defaultdict(lambda: 0)
    self.__th = None
    self.__quota = max_stmts

    # Make sure we got at least SOME agents (the remote API will error if
    # we proceed with no arguments).
    if subject is None and object is None and not agents:
        raise ValueError("At least one agent must be specified, or else "
                         "the scope will be too large.")

    # Formulate inputs for the agents..
    agent_strs = [] if agents is None else ['agent%d=%s' % (i, ag)
                                            for i, ag in enumerate(agents)]
    key_val_list = [('subject', subject), ('object', object)]
    params = {param_key: param_val for param_key, param_val in key_val_list
              if param_val is not None}
    params['best_first'] = best_first
    params['ev_limit'] = ev_limit
    params['tries'] = tries

    # Handle the type(s): unless an exact type was requested, also query
    # all subclasses of the given statement type.
    stmt_types = [stmt_type] if stmt_type else []
    if stmt_type is not None and not use_exact_type:
        stmt_class = get_statement_by_name(stmt_type)
        descendant_classes = get_all_descendants(stmt_class)
        stmt_types += [cls.__name__ for cls in descendant_classes]

    # Handle the content if we were limited.
    args = [agent_strs, stmt_types, params, persist]
    logger.info("The remainder of the query will be performed in a "
                "thread...")
    self.__th = Thread(target=self._run_queries, args=args)
    self.__th.start()

    if timeout is None:
        # Block until the worker thread has finished all queries.
        logger.info("Waiting for thread to complete...")
        self.__th.join()
    elif timeout:  # is not 0
        # Block at most `timeout` seconds; the thread keeps running after.
        logger.info("Waiting at most %d seconds for thread to complete..."
                    % timeout)
        self.__th.join(timeout)
    return
def get_argument_value(self, arg_json):
    """Get a value of an argument from its json version."""
    # A function argument: return either the function object itself
    # (when marked no_run) or the result of running it.
    if self.is_function(arg_json, 'function'):
        if arg_json.get('no_run', False):
            return self.get_function_from_name(arg_json['function'])
        return self.run_function(arg_json)
    # A statement type given by name: resolve to the statement class.
    if self.is_function(arg_json, 'stmt_type'):
        return get_statement_by_name(arg_json.get('stmt_type'))
    # Otherwise a simple value (str, int, boolean, etc.) used as-is.
    return arg_json
def _make_query(query_dict): if 'typeSelection' in query_dict.keys(): stmt_type = query_dict['typeSelection'] stmt_class = get_statement_by_name(stmt_type) subj = get_agent_from_text(query_dict['subjectSelection']) obj = get_agent_from_text(query_dict['objectSelection']) stmt = stmt_class(subj, obj) query = PathProperty(path_stmt=stmt) tab = 'static' elif 'agentSelection' in query_dict.keys(): agent = get_agent_from_trips(query_dict['agentSelection']) value = query_dict['valueSelection'] if not value: value = None pattern = query_dict['patternSelection'] query = DynamicProperty(agent, pattern, value) tab = 'dynamic' return query, tab
def test_has_type():
    """Check HasType queries, without and with subclass expansion."""
    ro = get_db('primary')

    # Exact-type query: every returned statement must be of a listed type.
    query = HasType(['Phosphorylation', 'Activation'])
    result = query.get_statements(ro, limit=5, ev_limit=8)
    for stmt in result.statements():
        assert stmt.__class__.__name__ in ('Phosphorylation', 'Activation')

    # Subclass query: any descendant of the listed types is acceptable.
    type_list = ['SelfModification', 'RegulateAmount', 'Translocation']
    query = HasType(type_list, include_subclasses=True)
    result = query.get_statements(ro, limit=5, ev_limit=8)
    allowed_types = set()
    for type_name in type_list:
        base_class = get_statement_by_name(type_name)
        allowed_types.add(base_class)
        allowed_types.update(get_all_descendants(base_class))
    for stmt in result.statements():
        assert type(stmt) in allowed_types
def process_args(self, args_json):
    """Replace JSON-serialized argument values with runtime objects.

    The dict is modified in place and also returned; unrecognized keys
    are passed through untouched.
    """
    for name in args_json:
        if name == 'stmt_type':
            # Resolve the statement class from its name.
            args_json[name] = get_statement_by_name(args_json[name])
        elif name in ('matches_fun', 'refinement_fun'):
            # Look up registered pipeline functions by name.
            args_json[name] = pipeline_functions[args_json[name]]
        elif name == 'belief_scorer':
            # Here we could handle various string values of the argument
            # but there currently aren't any specific options.
            args_json[name] = None
        elif name == 'ontology':
            # Here we could handle various string values of the argument
            # but there currently aren't any specific options.
            args_json[name] = bio_ontology
        elif name in ('whitelist', 'mutations'):
            # Convert JSON lists of modifications into hashable tuples.
            args_json[name] = {
                gene: [tuple(mod) for mod in mods]
                for gene, mods in args_json[name].items()}
    return args_json
def stmt_from_interaction(interaction):
    """Get a shell statement from an interaction."""
    stmt_type = interaction['type']
    stmt_class = get_statement_by_name(stmt_type)
    agent_info = interaction['agents']
    if stmt_type == 'Complex':
        # A Complex takes a single list of member agents.
        members = [Agent(name) for name in agent_info.values()]
        return stmt_class(members)
    if stmt_type == 'ActiveForm':
        # ActiveForm carries the activity type and its polarity.
        agent = Agent(agent_info[0])
        return stmt_class(agent, interaction['activity'],
                          interaction['is_active'])
    # Generic case: fill every agent slot in class order, leaving gaps
    # as None where no agent was provided.
    slots = []
    for idx in range(len(stmt_class._agent_order)):
        slots.append(Agent(agent_info[idx]) if agent_info.get(idx) else None)
    return stmt_class(*slots)
def make_stmt_from_sort_key(key, verb):
    """Make a Statement from the sort key.

    Specifically, the sort key used by `group_and_sort_statements`.
    """
    def make_agent(name):
        # The sort key encodes a missing agent as the string 'None'.
        return None if name is None or name == 'None' else Agent(name)

    stmt_class = get_statement_by_name(verb)
    parts = list(key[1])
    if verb == 'Complex':
        # A Complex takes a single list of member agents.
        return stmt_class([make_agent(p) for p in parts])
    if verb == 'Conversion':
        # Conversion: subject, then from- and to-object lists.
        subj = make_agent(parts[0])
        objs_from = [make_agent(p) for p in parts[1]]
        objs_to = [make_agent(p) for p in parts[2]]
        return stmt_class(subj, objs_from, objs_to)
    if verb in ('ActiveForm', 'HasActivity'):
        # Agent plus activity type and polarity flags.
        return stmt_class(make_agent(parts[0]), parts[1], parts[2])
    # Default: positional agents in sort-key order.
    return stmt_class(*[make_agent(p) for p in parts])
def make_stmt_from_sort_key(key, verb):
    """Make a Statement from the sort key.

    Specifically, the sort key used by `group_and_sort_statements`.
    """
    def _agent_or_none(agent_name):
        # 'None' (the string) marks an absent agent in the sort key.
        if agent_name is None or agent_name == 'None':
            return None
        return Agent(agent_name)

    cls = get_statement_by_name(verb)
    inputs = list(key[1])
    if verb == 'Complex':
        # One list of member agents.
        stmt = cls([_agent_or_none(n) for n in inputs])
    elif verb == 'Conversion':
        # Subject followed by from- and to-object lists.
        stmt = cls(_agent_or_none(inputs[0]),
                   [_agent_or_none(n) for n in inputs[1]],
                   [_agent_or_none(n) for n in inputs[2]])
    elif verb in ('ActiveForm', 'HasActivity'):
        # Agent plus activity type and polarity.
        stmt = cls(_agent_or_none(inputs[0]), inputs[1], inputs[2])
    else:
        # Positional agents in sort-key order.
        stmt = cls(*[_agent_or_none(n) for n in inputs])
    return stmt
def get_statements(subject=None, object=None, agents=None, stmt_type=None, use_exact_type=False, persist=True, timeout=None, simple_response=True, ev_limit=10, best_first=True, tries=2, max_stmts=None): """Get Statements from the INDRA DB web API matching given agents and type. There are two types of response available. You can just get a list of INDRA Statements, or you can get an IndraRestResponse object, which allows Statements to be loaded in a background thread, providing a sample of the best* content available promptly in the sample_statements attribute, and populates the statements attribute when the paged load is complete. *In the sense of having the most supporting evidence. Parameters ---------- subject/object : str Optionally specify the subject and/or object of the statements in you wish to get from the database. By default, the namespace is assumed to be HGNC gene names, however you may specify another namespace by including `@<namespace>` at the end of the name string. For example, if you want to specify an agent by chebi, you could use `CHEBI:6801@CHEBI`, or if you wanted to use the HGNC id, you could use `6871@HGNC`. agents : list[str] A list of agents, specified in the same manner as subject and object, but without specifying their grammatical position. stmt_type : str Specify the types of interactions you are interested in, as indicated by the sub-classes of INDRA's Statements. This argument is *not* case sensitive. If the statement class given has sub-classes (e.g. RegulateAmount has IncreaseAmount and DecreaseAmount), then both the class itself, and its subclasses, will be queried, by default. If you do not want this behavior, set use_exact_type=True. Note that if max_stmts is set, it is possible only the exact statement type will be returned, as this is the first searched. The processor then cycles through the types, getting a page of results for each type and adding it to the quota, until the max number of statements is reached. 
use_exact_type : bool If stmt_type is given, and you only want to search for that specific statement type, set this to True. Default is False. persist : bool Default is True. When False, if a query comes back limited (not all results returned), just give up and pass along what was returned. Otherwise, make further queries to get the rest of the data (which may take some time). timeout : positive int or None If an int, block until the work is done and statements are retrieved, or until the timeout has expired, in which case the results so far will be returned in the response object, and further results will be added in a separate thread as they become available. If simple_response is True, all statements available will be returned. Otherwise (if None), block indefinitely until all statements are retrieved. Default is None. simple_response : bool If True, a simple list of statements is returned (thus block should also be True). If block is False, only the original sample will be returned (as though persist was False), until the statements are done loading, in which case the rest should appear in the list. This behavior is not encouraged. Default is True (for the sake of backwards compatibility). ev_limit : int or None Limit the amount of evidence returned per Statement. Default is 10. best_first : bool If True, the preassembled statements will be sorted by the amount of evidence they have, and those with the most evidence will be prioritized. When using `max_stmts`, this means you will get the "best" statements. If False, statements will be queried in arbitrary order. tries : int > 0 Set the number of times to try the query. The database often caches results, so if a query times out the first time, trying again after a timeout will often succeed fast enough to avoid a timeout. This can also help gracefully handle an unreliable connection, if you're willing to wait. Default is 2. max_stmts : int or None Select the maximum number of statements to return. 
When set less than 1000 the effect is much the same as setting persist to false, and will guarantee a faster response. Default is None. Returns ------- stmts : list[:py:class:`indra.statements.Statement`] A list of INDRA Statement instances. Note that if a supporting or supported Statement was not included in your query, it will simply be instantiated as an `Unresolved` statement, with `uuid` of the statement. """ # Make sure we got at least SOME agents (the remote API will error if we # we proceed with no arguments. if subject is None and object is None and agents is None: raise ValueError("At least one agent must be specified, or else " "the scope will be too large.") # Formulate inputs for the agents.. agent_strs = [] if agents is None else [ 'agent%d=%s' % (i, ag) for i, ag in enumerate(agents) ] key_val_list = [('subject', subject), ('object', object)] params = { param_key: param_val for param_key, param_val in key_val_list if param_val is not None } params['best_first'] = best_first params['ev_limit'] = ev_limit params['tries'] = tries # Handle the type(s). stmt_types = [stmt_type] if stmt_type else [] if stmt_type is not None and not use_exact_type: stmt_class = get_statement_by_name(stmt_type) descendant_classes = get_all_descendants(stmt_class) stmt_types += [cls.__name__ for cls in descendant_classes] # Get the response object resp = IndraDBRestResponse(max_stmts=max_stmts) resp.make_stmts_queries(agent_strs, stmt_types, params, persist, timeout) # Format the result appropriately. if simple_response: ret = resp.statements else: ret = resp return ret
def get_statements(subject=None, object=None, agents=None, stmt_type=None, use_exact_type=False, on_limit='sample'): """Get statements from INDRA's database using the web api. Parameters ---------- subject/object : str Optionally specify the subject and/or object of the statements in you wish to get from the database. By default, the namespace is assumed to be HGNC gene names, however you may specify another namespace by including `@<namespace>` at the end of the name string. For example, if you want to specify an agent by chebi, you could use `CHEBI:6801@CHEBI`, or if you wanted to use the HGNC id, you could use `6871@HGNC`. agents : list[str] A list of agents, specified in the same manner as subject and object, but without specifying their grammatical position. stmt_type : str Specify the types of interactions you are interested in, as indicated by the sub-classes of INDRA's Statements. This argument is *not* case sensitive. If the statement class given has sub-classes (e.g. RegulateAmount has IncreaseAmount and DecreaseAmount), then both the class itself, and its subclasses, will be queried, by default. If you do not want this behavior, set use_exact_type=True. use_exact_type : bool If stmt_type is given, and you only want to search for that specific statement type, set this to True. Default is False. on_limit : str There are four options for handling the a query that is to large: `sample` - (default) take a sample of statements from the result, `truncate` - simply return the first 10,000 statements of the result, `error` - raise an error if the query is too large, or `persist` - perform as many queries as needed to get all the statements. Note that this last option generally takes much much longer to execute. Returns ------- stmts : list[:py:class:`indra.statements.Statement`] A list of INDRA Statement instances. 
Note that if a supporting or supported Statement was not included in your query, it will simply be instantiated as an `Unresolved` statement, with `uuid` of the statement. """ # Make sure we got at least SOME agents (the remote API will error if we # we proceed with no arguments. if subject is None and object is None and agents is None: raise ValueError("At least one agent must be specified, or else " "the scope will be too large.") # Formulate inputs for the agents.. agent_strs = [] if agents is None else ['agent=%s' % ag for ag in agents] key_val_list = [('subject', subject), ('object', object)] params = {param_key: param_val for param_key, param_val in key_val_list if param_val is not None} params['on_limit'] = on_limit # Handle the type(s). if stmt_type is not None: if use_exact_type: params['type'] = stmt_type stmts = _make_stmts_query(agent_strs, params) else: stmt_class = get_statement_by_name(stmt_type) descendant_classes = get_all_descendants(stmt_class) stmt_types = [cls.__name__ for cls in descendant_classes] \ + [stmt_type] stmts = _query_stmt_types(agent_strs, params, stmt_types) else: stmts = _make_stmts_query(agent_strs, params) return stmts
def _get_pa_stmt_jsons_w_mkhash_subquery(db, mk_hashes_q, best_first=True,
                                         max_stmts=None, offset=None,
                                         ev_limit=None):
    """Load statement JSONs for the hashes selected by `mk_hashes_q`.

    Given a query selecting (mk_hash, ev_count) pairs, lateral-join the
    raw/preassembled statement JSON and reading-reference data, then fold
    the rows into a JSON-serializable dict keyed by statement hash, with
    evidence lists, per-statement evidence totals, and overall counts.
    """
    # Handle limiting.
    mk_hashes_q = mk_hashes_q.distinct()
    if best_first:
        # Most-evidenced statements first.
        mk_hashes_q = mk_hashes_q.order_by(desc(db.PaMeta.ev_count))
    if max_stmts is not None:
        mk_hashes_q = mk_hashes_q.limit(max_stmts)
    if offset is not None:
        mk_hashes_q = mk_hashes_q.offset(offset)

    # Create the link between the hash subquery and the JSON content.
    mk_hashes_al = mk_hashes_q.subquery('mk_hashes')
    raw_json_c = db.FastRawPaLink.raw_json.label('raw_json')
    pa_json_c = db.FastRawPaLink.pa_json.label('pa_json')
    reading_id_c = db.FastRawPaLink.reading_id.label('rid')
    cont_q = db.session.query(raw_json_c, pa_json_c, reading_id_c)
    cont_q = cont_q.filter(db.FastRawPaLink.mk_hash == mk_hashes_al.c.mk_hash)
    if ev_limit is not None:
        # The lateral join applies this limit per statement hash.
        cont_q = cont_q.limit(ev_limit)

    # TODO: Only make a lateral-joined query when evidence is limited.
    json_content_al = cont_q.subquery().lateral('json_content')
    stmts_q = (mk_hashes_al
               .outerjoin(json_content_al, true())
               .outerjoin(db.ReadingRefLink,
                          db.ReadingRefLink.rid == json_content_al.c.rid))
    # Select all public columns of ReadingRefLink alongside the JSON.
    ref_link_keys = [k for k in db.ReadingRefLink.__dict__.keys()
                     if not k.startswith('_')]
    selection = (select([mk_hashes_al.c.mk_hash, mk_hashes_al.c.ev_count,
                         json_content_al.c.raw_json,
                         json_content_al.c.pa_json]
                        + [getattr(db.ReadingRefLink, k)
                           for k in ref_link_keys])
                 .select_from(stmts_q))
    logger.debug("Executing sql to get statements:\n%s" % str(selection))

    proxy = db.session.connection().execute(selection)
    res = proxy.fetchall()

    # Accumulators for the result dict below.
    stmts_dict = OrderedDict()
    ev_totals = OrderedDict()
    total_evidence = 0
    returned_evidence = 0
    if res:
        logger.debug("res is %d row by %d cols." % (len(res), len(res[0])))
    else:
        logger.debug("res is empty.")

    for row in res:
        # Unpack the fixed leading columns, then the ref-link columns.
        mk_hash, ev_count, raw_json_bts, pa_json_bts = row[:4]
        ref_dict = {ref_link_keys[i]: row[4 + i]
                    for i in range(len(ref_link_keys))}
        returned_evidence += 1
        raw_json = json.loads(raw_json_bts.decode('utf-8'))
        # Each raw statement carries exactly one evidence entry here.
        ev_json = raw_json['evidence'][0]

        # Add a new statements if the hash is new
        if mk_hash not in stmts_dict.keys():
            total_evidence += ev_count
            ev_totals[mk_hash] = ev_count
            stmts_dict[mk_hash] = json.loads(pa_json_bts.decode('utf-8'))
            stmts_dict[mk_hash]['evidence'] = []

        # Fix the pmid
        if ref_dict['pmid']:
            ev_json['pmid'] = ref_dict['pmid']

        # Add agents' raw text to annotations.
        raw_text = []
        for ag_name in get_statement_by_name(raw_json['type'])._agent_order:
            ag_value = raw_json.get(ag_name, None)
            if isinstance(ag_value, dict):
                # Single agent slot.
                raw_text.append(ag_value['db_refs'].get('TEXT'))
            elif ag_value is None:
                # Empty agent slot.
                raw_text.append(None)
            else:
                # A list-valued slot (e.g. Complex members).
                for ag in ag_value:
                    raw_text.append(ag['db_refs'].get('TEXT'))
        if 'annotations' not in ev_json.keys():
            ev_json['annotations'] = {}
        ev_json['annotations']['agents'] = {'raw_text': raw_text}
        if 'prior_uuids' not in ev_json['annotations'].keys():
            ev_json['annotations']['prior_uuids'] = []
        ev_json['annotations']['prior_uuids'].append(raw_json['id'])
        if 'text_refs' not in ev_json.keys():
            ev_json['text_refs'] = {}
        ev_json['text_refs'].update({k.upper(): v
                                     for k, v in ref_dict.items()
                                     if v is not None})
        if ref_dict['source']:
            ev_json['annotations']['content_source'] = ref_dict['source']

        # TODO: Remove this eventually. This is a patch!
        if 'source_hash' not in ev_json.keys():
            # Synthesize a stable-ish source hash from available fields.
            s = str(ev_json.get('source_api')) + str(ev_json.get('source_id'))
            if ev_json.get('text') and isinstance(ev_json['text'], str):
                s += ev_json['text']
            elif ev_json.get('pmid') and isinstance(ev_json['pmid'], str):
                s += ev_json['pmid']
            ev_json['source_hash'] = _make_hash(s, 16)
        stmts_dict[mk_hash]['evidence'].append(ev_json)

    ret = {'statements': stmts_dict,
           'evidence_totals': ev_totals,
           'total_evidence': total_evidence,
           'evidence_returned': returned_evidence}
    return ret
def expand_signed(df: pd.DataFrame, sign_dict: Dict[str, int],
                  stmt_types: List[str],
                  use_descendants: bool = True) \
        -> pd.DataFrame:
    """Expands out which statements should be added to the signed graph

    The statements types provided in 'stmt_types' will be added for both
    signs. To add more statement types of just one sign, add it to
    'sign_dict'.

    Parameters
    ----------
    df : pd.DataFrame
        The statement DataFrame; must have a 'stmt_type' column.
    sign_dict : Dict[str, int]
        A dictionary mapping a Statement type to a sign to be used for the
        edge. By default only Activation and IncreaseAmount are added as
        positive edges and Inhibition and DecreaseAmount are added as
        negative edges, but a user can pass any other Statement types in a
        dictionary.
    stmt_types : List[str]
        The statement types to match to expand signs to. The rows matching
        these types will be duplicated and each copy gets a distinct sign.
    use_descendants : bool
        If True, also match descendants of the statements provided in
        'stmt_types' when adding the extended signs.

    Returns
    -------
    pd.DataFrame
    """
    if use_descendants:
        logger.info('Getting descendants to match for expanded signed graph')
        # Expand the given type names with all their statement-class
        # descendants (renamed loop variables to avoid shadowing).
        more_stmt_types = set(stmt_types)
        for type_name in stmt_types:
            more_stmt_types.update({
                desc.__name__ for desc in
                get_all_descendants(get_statement_by_name(type_name))
            })
        stmt_types = list(more_stmt_types)

    # Add new sign column, set to None. Using 'initial_sign' allows usage of
    # IndraNet.to_signed_graph
    df['initial_sign'] = None

    # Locate relevant rows
    standard_sign = df.stmt_type.isin(sign_dict.keys())
    expand_sign = df.stmt_type.isin(stmt_types)
    assert sum(standard_sign) + sum(expand_sign) > 0, \
        'All rows filtered out from DataFrame. Check that statement types ' \
        'in sign_dict and stmt_types exist in the DataFrame.'
    if sum(expand_sign) == 0:
        logger.warning('No rows can be used for expanded signed edges. '
                       'Check that statement types in stmt_types exist in '
                       'the DataFrame.')

    # Add sign for signed statements
    logger.info('Setting initial sign for signed types')
    df.loc[standard_sign, 'initial_sign'] = \
        df.loc[standard_sign, 'stmt_type'].apply(lambda st: sign_dict.get(st))

    # Add positive sign to the rows with types in stmt_types
    df.loc[expand_sign, 'initial_sign'] = INT_PLUS

    # Copy rows for expand sign and switch sign
    logger.info('Setting initial sign for expanded signed types')
    add_rows = []
    for _, expand_row in df[expand_sign].iterrows():
        exp_row = [INT_MINUS if col == 'initial_sign' else val
                   for col, val in expand_row.items()]
        add_rows.append(exp_row)

    logger.info('Appending extended signed rows')
    extra_df = pd.DataFrame(add_rows, columns=df.columns.values)
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the direct, behavior-equivalent replacement.
    df = pd.concat([df, extra_df])

    # Remove all rows without assigned sign
    logger.info('Removing rows without signed')
    df = df[~df.initial_sign.isna()]

    # Re-cast sign column as int
    try:
        df.initial_sign = df.initial_sign.astype(pd.Int32Dtype())
    except Exception:
        link = 'https://pandas.pydata.org/pandas-docs/stable/user_guide' \
               '/integer_na.html'
        logger.warning(f'Could not set sign column as Nullable Integer '
                       f'Data Type. Make sure to use pandas v0.24+. '
                       f'See {link}')
    return df
def _build_test_set():
    """Build a temporary readonly-style database populated with test rows.

    Generates a combinatorial set of statements over a few agents,
    statement types, sources and MeSH ids, then creates and fills the
    readonly meta tables in a temp database, which is returned.
    """
    # Fixture agents, statement types, (source, category) pairs and
    # MeSH ids to combine.
    agents = [{'NAME': 'ERK', 'FPLX': 'ERK', 'TEXT': 'MAPK'},
              {'NAME': 'TP53', 'HGNC': '11998'},
              {'NAME': 'MEK', 'FPLX': 'MEK'},
              {'NAME': 'Vemurafenib', 'CHEBI': 'CHEBI:63637'}]
    stypes = ['Phosphorylation', 'Activation', 'Inhibition', 'Complex']
    sources = [('medscan', 'rd'), ('reach', 'rd'), ('pc11', 'db'),
               ('signor', 'db')]
    mesh_ids = ['D000225', 'D002352', 'D015536']

    # All groups of 0, 1 or 2 MeSH ids, shuffled for variety.
    mesh_combos = []
    for num_mesh in range(0, 3):
        if num_mesh == 1:
            mesh_groups = [[mid] for mid in mesh_ids]
        else:
            mesh_groups = combinations(mesh_ids, num_mesh)
        mesh_combos.extend(list(mesh_groups))
    random.shuffle(mesh_combos)

    # All non-empty source combinations, each with random evidence counts;
    # only reading ('rd') sources get MeSH annotations.
    source_data = []
    for num_srcs in range(1, 5):
        if num_srcs == 1:
            src_iter = [[src] for src in sources]
        else:
            src_iter = combinations(sources, num_srcs)
        for src_list in src_iter:
            only_src = None if len(src_list) > 1 else src_list[0][0]
            has_rd = any(t == 'rd' for _, t in src_list)
            if has_rd:
                mesh_ids = mesh_combos[len(source_data) % len(mesh_combos)]
            else:
                mesh_ids = []
            source_data.append({
                'sources': {src: random.randint(1, 50)
                            for src, _ in src_list},
                'has_rd': any(t == 'rd' for _, t in src_list),
                'has_db': any(t == 'db' for _, t in src_list),
                'only_src': only_src,
                'mesh_ids': mesh_ids})
    random.shuffle(source_data)

    # Statement specs: (type, agent refs, activity, is_active). Two-agent
    # types get every ordered agent pair; ActiveForm gets each agent with
    # each activity/polarity combination.
    stmts = [tuple(tpl) + (None, None)
             for tpl in product(stypes, permutations(agents, 2))]
    stmts += [('ActiveForm', (ref, ), activity, is_active)
              for activity, is_active, ref in product(
                  ['transcription', 'activity'], [True, False], agents)]

    # Track unordered Complex member pairs to skip duplicates.
    complex_pairs = []

    # Row buffers and column layouts for each readonly meta table.
    name_meta_rows = []
    name_meta_cols = ('mk_hash', 'ag_num', 'db_id', 'role_num', 'type_num',
                      'ev_count', 'activity', 'is_active', 'agent_count')

    text_meta_rows = []
    text_meta_cols = ('mk_hash', 'ag_num', 'db_id', 'role_num', 'type_num',
                      'ev_count', 'activity', 'is_active', 'agent_count')

    other_meta_rows = []
    other_meta_cols = ('mk_hash', 'ag_num', 'db_name', 'db_id', 'role_num',
                       'type_num', 'ev_count', 'activity', 'is_active',
                       'agent_count')

    source_meta_rows = []
    source_meta_cols = ('mk_hash', 'reach', 'medscan', 'pc11', 'signor',
                        'ev_count', 'type_num', 'activity', 'is_active',
                        'agent_count', 'num_srcs', 'src_json', 'only_src',
                        'has_rd', 'has_db')

    mesh_meta_rows = []
    mesh_meta_cols = ('mk_hash', 'ev_count', 'mesh_num', 'type_num',
                      'activity', 'is_active', 'agent_count')

    for stype, refs, activity, is_active in stmts:
        # Extract agents, and make a Statement.
        StmtClass = get_statement_by_name(stype)
        if stype == 'ActiveForm':
            ag = make_agent_from_ref(refs[0])
            stmt = StmtClass(ag, activity=activity, is_active=is_active)
        else:
            ag1 = make_agent_from_ref(refs[0])
            ag2 = make_agent_from_ref(refs[1])
            if stype == 'Complex':
                # Skip unordered duplicates of the same member pair.
                if {ag1.name, ag2.name} in complex_pairs:
                    continue
                stmt = StmtClass([ag1, ag2])
                complex_pairs.append({ag1.name, ag2.name})
            else:
                stmt = StmtClass(ag1, ag2)

        # Connect with a source (cycled from the shuffled source data).
        source_dict = source_data[len(source_meta_rows) % len(source_data)]
        ev_count = sum(source_dict['sources'].values())
        src_row = (stmt.get_hash(), )
        for src_name in ['reach', 'medscan', 'pc11', 'signor']:
            src_row += (source_dict['sources'].get(src_name), )
        src_row += (ev_count, ro_type_map.get_int(stype), activity,
                    is_active, len(refs), len(source_dict['sources']),
                    json.dumps(source_dict['sources']),
                    source_dict['only_src'], source_dict['has_rd'],
                    source_dict['has_db'])
        source_meta_rows.append(src_row)

        # Add mesh rows
        for mesh_id in source_dict['mesh_ids']:
            # Strip the 'D' prefix to store the numeric part of the id.
            mesh_meta_rows.append(
                (stmt.get_hash(), ev_count, int(mesh_id[1:]),
                 ro_type_map.get_int(stype), activity, is_active,
                 len(refs)))

        # Generate agent rows, routed by namespace: NAME and TEXT rows go
        # to their dedicated tables (dropping the db_name column), the
        # rest to the "other" table.
        ref_rows, _, _ = extract_agent_data(stmt, stmt.get_hash())
        for row in ref_rows:
            row = row[:4] + (ro_role_map.get_int(row[4]),
                             ro_type_map.get_int(stype), ev_count,
                             activity, is_active, len(refs))
            if row[2] == 'NAME':
                row = row[:2] + row[3:]
                name_meta_rows.append(row)
            elif row[2] == 'TEXT':
                row = row[:2] + row[3:]
                text_meta_rows.append(row)
            else:
                other_meta_rows.append(row)

    # Create the temp database, declare the source columns, create the
    # meta tables, and bulk-load all generated rows.
    db = get_temp_db(clear=True)
    src_meta_cols = [{'name': col} for col, _ in sources]
    db.SourceMeta.load_cols(db.engine, src_meta_cols)
    for tbl in [db.SourceMeta, db.MeshMeta, db.NameMeta, db.TextMeta,
                db.OtherMeta]:
        tbl.__table__.create(db.engine)
    db.copy('readonly.source_meta', source_meta_rows, source_meta_cols)
    db.copy('readonly.mesh_meta', mesh_meta_rows, mesh_meta_cols)
    db.copy('readonly.name_meta', name_meta_rows, name_meta_cols)
    db.copy('readonly.text_meta', text_meta_rows, text_meta_cols)
    db.copy('readonly.other_meta', other_meta_rows, other_meta_cols)

    return db