def get_related(self, stmt, possibly_related=None,
                direction='less_specific'):
    # Corner case: if this is a new statement that wasn't part of the
    # initialization, it is possible that it has a type that we've not
    # seen during initialization at all. In this case, we can assume
    # there are no refinements for it.
    stmt_type = indra_stmt_type(stmt)
    if stmt_type not in self.shared_data:
        return {}

    # Step 1. Recover relevant parts of the initialized data
    hash_to_agent_key = self.shared_data[stmt_type]['hash_to_agent_key']
    agent_key_to_hash = self.shared_data[stmt_type]['agent_key_to_hash']
    all_keys_by_role = self.shared_data[stmt_type]['all_keys_by_role']

    # Step 2. We iterate over all statements and find ones that this one
    # can refine
    stmt_hash = stmt.get_hash()
    relevants = possibly_related
    # We now iterate over all the agent roles in the given statement
    # type
    for role, hash_to_agent_key_for_role in hash_to_agent_key.items():
        # If we have seen this statement before during initialization then
        # we can use its precalculated agent keys, otherwise we
        # calculate new agent keys for it.
        if stmt_hash in hash_to_agent_key_for_role:
            agent_keys = hash_to_agent_key_for_role[stmt_hash]
        else:
            agent_keys = self._agent_keys_for_stmt_role(stmt, role)

        # We get all the agent keys in all other statements that the
        # agent in this given role in this statement can refine.
        for agent_key in agent_keys:
            relevant_keys = get_relevant_keys(
                agent_key,
                all_keys_by_role[role],
                self.ontology,
                direction=direction)
            # We now get the actual statement hashes that these other
            # potentially refined agent keys appear in, in the given role
            role_relevant_stmt_hashes = set.union(
                *[agent_key_to_hash[role][rel]
                  for rel in relevant_keys]) - {stmt_hash}
            # In the first iteration, we initialize the set with the
            # relevant statement hashes
            if relevants is None:
                relevants = role_relevant_stmt_hashes
            # In subsequent iterations, we take the intersection of
            # the relevant sets per role
            else:
                relevants &= role_relevant_stmt_hashes
    # These hashes are now the ones that this statement needs
    # to be compared against. Importantly, the relationship is in
    # a well-defined direction so we don't need to test both ways.
    return relevants
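# Illustrative sketch (not part of the original module): the core of
# get_related is a per-role intersection of candidate statement-hash sets.
# The toy below reproduces just that logic with plain dicts and sets; the
# names (toy_get_related, candidates_by_role) are hypothetical.
def toy_get_related(candidates_by_role):
    # candidates_by_role maps each agent role to the set of statement
    # hashes relevant for that role; a statement is a potential
    # refinement target only if it is relevant in every role.
    relevants = None
    for role, hashes in candidates_by_role.items():
        relevants = hashes if relevants is None else relevants & hashes
    return relevants or set()

# A statement relevant in both the 'subj' and 'obj' roles survives; one
# relevant only as 'subj' does not.
assert toy_get_related({'subj': {1, 2}, 'obj': {2, 3}}) == {2}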
def ontology_refinement_filter(stmts_by_hash, stmts_to_compare, ontology):
    """Return possible refinement relationships based on an ontology.

    Parameters
    ----------
    stmts_by_hash : dict
        A dict whose keys are statement hashes that point to the
        (deduplicated) statement with that hash as a value.
    stmts_to_compare : dict or None
        A dict of existing statements to compare that will be further
        filtered down in this function and then returned.
    ontology : indra.ontology.IndraOntology
        An IndraOntology instance with respect to which this filter
        is applied.

    Returns
    -------
    dict
        A dict whose keys are statement hashes and values are sets of
        statement hashes that can potentially be refined by the statement
        identified by the key.
    """
    ts = time.time()
    stmts_by_type = collections.defaultdict(set)
    for stmt_hash, stmt in stmts_by_hash.items():
        stmts_by_type[indra_stmt_type(stmt)].add(stmt_hash)
    stmts_by_type = dict(stmts_by_type)

    first_filter = stmts_to_compare is None
    if first_filter:
        stmts_to_compare = collections.defaultdict(set)
    for stmt_type, stmt_hashes in stmts_by_type.items():
        logger.info('Finding ontology-based refinements for %d %s statements'
                    % (len(stmts_by_type[stmt_type]), stmt_type.__name__))
        stmts_by_hash_this_type = {
            stmt_hash: stmts_by_hash[stmt_hash]
            for stmt_hash in stmt_hashes
        }
        stmts_to_compare_by_type = \
            ontology_refinement_filter_by_stmt_type(stmts_by_hash_this_type,
                                                    ontology)
        if first_filter:
            stmts_to_compare.update(stmts_to_compare_by_type)
        else:
            for k, v in stmts_to_compare_by_type.items():
                stmts_to_compare[k] = stmts_to_compare[k] & v

    te = time.time()
    logger.debug('Identified ontology-based possible refinements in %.2fs'
                 % (te - ts))
    # Make an empty dict to make sure we don't return a None
    if stmts_to_compare is None:
        stmts_to_compare = {}
    return stmts_to_compare
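# Illustrative sketch (not part of the original module): refinement filters
# like ontology_refinement_filter compose by intersection. The first filter
# in a chain builds the candidate map from scratch; each later filter only
# narrows the sets it is handed. The helper below mimics that contract with
# plain data; toy_filter and its arguments are hypothetical names.
def toy_filter(candidates, stmts_to_compare):
    first_filter = stmts_to_compare is None
    if first_filter:
        # First pass: adopt this filter's candidates wholesale
        return dict(candidates)
    # Later passes: keep only hashes both filters consider possible
    return {k: stmts_to_compare.get(k, set()) & v
            for k, v in candidates.items()}

first = toy_filter({'h1': {10, 11}}, None)
second = toy_filter({'h1': {11, 12}}, first)
assert second == {'h1': {11}}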
def find_contradicts(self):
    """Return pairs of contradicting Statements.

    Returns
    -------
    contradicts : list(tuple(Statement, Statement))
        A list of Statement pairs that are contradicting.
    """
    # Make a dict of Statement by type
    stmts_by_type = collections.defaultdict(lambda: [])
    for idx, stmt in enumerate(self.stmts):
        stmts_by_type[indra_stmt_type(stmt)].append((idx, stmt))

    # Handle Statements with polarity first
    pos_stmts = AddModification.__subclasses__()
    neg_stmts = [modclass_to_inverse[c] for c in pos_stmts]

    pos_stmts += [Activation, IncreaseAmount]
    neg_stmts += [Inhibition, DecreaseAmount]

    contradicts = []
    for pst, nst in zip(pos_stmts, neg_stmts):
        poss = stmts_by_type.get(pst, [])
        negs = stmts_by_type.get(nst, [])

        pos_stmt_by_group = self._get_stmt_by_group(pst, poss,
                                                    self.ontology)
        neg_stmt_by_group = self._get_stmt_by_group(nst, negs,
                                                    self.ontology)
        for key, pg in pos_stmt_by_group.items():
            ng = neg_stmt_by_group.get(key, [])
            for (_, st1), (_, st2) in itertools.product(pg, ng):
                if st1.contradicts(st2, self.ontology):
                    contradicts.append((st1, st2))

    # Handle neutral Statements next
    neu_stmts = [Influence, ActiveForm]
    for stt in neu_stmts:
        stmts = stmts_by_type.get(stt, [])
        for (_, st1), (_, st2) in itertools.combinations(stmts, 2):
            if st1.contradicts(st2, self.ontology):
                contradicts.append((st1, st2))

    return contradicts
def find_contradicts(self):
    """Return pairs of contradicting Statements.

    Returns
    -------
    contradicts : list(tuple(Statement, Statement))
        A list of Statement pairs that are contradicting.
    """
    eh = self.hierarchies['entity']

    # Make a dict of Statement by type
    stmts_by_type = collections.defaultdict(lambda: [])
    for idx, stmt in enumerate(self.stmts):
        stmts_by_type[indra_stmt_type(stmt)].append((idx, stmt))

    # Handle Statements with polarity first
    pos_stmts = AddModification.__subclasses__()
    neg_stmts = [modclass_to_inverse[c] for c in pos_stmts]

    pos_stmts += [Activation, IncreaseAmount]
    neg_stmts += [Inhibition, DecreaseAmount]

    contradicts = []
    for pst, nst in zip(pos_stmts, neg_stmts):
        poss = stmts_by_type.get(pst, [])
        negs = stmts_by_type.get(nst, [])

        pos_stmt_by_group = self._get_stmt_by_group(pst, poss, eh)
        neg_stmt_by_group = self._get_stmt_by_group(nst, negs, eh)
        for key, pg in pos_stmt_by_group.items():
            ng = neg_stmt_by_group.get(key, [])
            for (_, st1), (_, st2) in itertools.product(pg, ng):
                if st1.contradicts(st2, self.hierarchies):
                    contradicts.append((st1, st2))

    # Handle neutral Statements next
    neu_stmts = [Influence, ActiveForm]
    for stt in neu_stmts:
        stmts = stmts_by_type.get(stt, [])
        for (_, st1), (_, st2) in itertools.combinations(stmts, 2):
            if st1.contradicts(st2, self.hierarchies):
                contradicts.append((st1, st2))

    return contradicts
def find_contradicts(self):
    """Return pairs of contradicting Statements.

    Returns
    -------
    contradicts : list(tuple(Statement, Statement))
        A list of Statement pairs that are contradicting.
    """
    # Make a dict of Statement by type
    stmts_by_type = collections.defaultdict(list)
    for stmt in self.stmts:
        stmts_by_type[indra_stmt_type(stmt)].append(stmt)
    stmts_by_type = dict(stmts_by_type)

    # Handle Statements with polarity first
    pos_stmts = AddModification.__subclasses__()
    neg_stmts = [modclass_to_inverse[c] for c in pos_stmts]

    pos_stmts += [Activation, IncreaseAmount]
    neg_stmts += [Inhibition, DecreaseAmount]

    contradicts = []
    # TODO: we could probably do some optimization here
    # to not have to check statements combinatorially
    for pst, nst in zip(pos_stmts, neg_stmts):
        poss = stmts_by_type.get(pst, [])
        negs = stmts_by_type.get(nst, [])
        for ps, ns in itertools.product(poss, negs):
            if ps.contradicts(ns, self.ontology):
                contradicts.append((ps, ns))

    # Handle neutral Statements next
    neu_stmts = [Influence, ActiveForm]
    for stt in neu_stmts:
        stmts = stmts_by_type.get(stt, [])
        for st1, st2 in itertools.combinations(stmts, 2):
            if st1.contradicts(st2, self.ontology):
                contradicts.append((st1, st2))

    return contradicts
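# Illustrative sketch (not part of the original module): contradictions are
# found by pairing each positive statement class with its inverse and then
# comparing statements of opposite polarity pairwise. The toy below mimics
# that with namedtuples; Pos/Neg and same_target are hypothetical stand-ins
# for Statement classes and Statement.contradicts.
import collections
import itertools

Pos = collections.namedtuple('Pos', 'target')
Neg = collections.namedtuple('Neg', 'target')

def same_target(a, b):
    return a.target == b.target

poss = [Pos('MAPK1'), Pos('AKT1')]
negs = [Neg('MAPK1')]
found = [(p, n) for p, n in itertools.product(poss, negs)
         if same_target(p, n)]
assert found == [(Pos('MAPK1'), Neg('MAPK1'))]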
def extend(self, stmts_by_hash):
    self.shared_data['stmts_by_hash'].update(stmts_by_hash)
    # Build up data structure of statement hashes by
    # statement type
    stmts_by_type = collections.defaultdict(set)
    for stmt_hash, stmt in stmts_by_hash.items():
        stmts_by_type[indra_stmt_type(stmt)].add(stmt_hash)
    stmts_by_type = dict(stmts_by_type)

    # Now iterate over each statement type and build up
    # data structures for quick filtering
    for stmt_type, stmts_this_type in stmts_by_type.items():

        # Step 1. Initialize data structures
        # noinspection PyProtectedMember
        roles = stmts_by_hash[next(iter(stmts_this_type))]._agent_order
        if stmt_type not in self.shared_data:
            self.shared_data[stmt_type] = {}
            # Mapping agent keys to statement hashes
            self.shared_data[stmt_type]['agent_key_to_hash'] = \
                {role: collections.defaultdict(set) for role in roles}
            # Mapping statement hashes to agent keys
            self.shared_data[stmt_type]['hash_to_agent_key'] = \
                {role: collections.defaultdict(set) for role in roles}
            # All agent keys for a given agent role
            self.shared_data[stmt_type]['all_keys_by_role'] = {}

        # Step 2. Fill up the initial data structures in preparation
        # for identifying potential refinements
        for sh in stmts_this_type:
            for role in roles:
                agent_keys = self._agent_keys_for_stmt_role(
                    stmts_by_hash[sh], role)
                for agent_key in agent_keys:
                    self.shared_data[stmt_type]['agent_key_to_hash'][
                        role][agent_key].add(sh)
                    self.shared_data[stmt_type]['hash_to_agent_key'][
                        role][sh].add(agent_key)

        for role in roles:
            self.shared_data[stmt_type]['all_keys_by_role'][role] = \
                set(self.shared_data[stmt_type]['agent_key_to_hash'][role])
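# Illustrative sketch (not part of the original module): after extend()
# runs, shared_data holds, per statement type, two mirrored indexes plus
# the full key set per role. The toy below builds the same three structures
# for a single role by hand; all data here is made up.
import collections

agent_key_to_hash = {'subj': collections.defaultdict(set)}
hash_to_agent_key = {'subj': collections.defaultdict(set)}
# One statement (hash 42) whose subj agent grounds to ('HGNC', '6871')
agent_key_to_hash['subj'][('HGNC', '6871')].add(42)
hash_to_agent_key['subj'][42].add(('HGNC', '6871'))
# all_keys_by_role is just the set of agent keys seen in each role
all_keys_by_role = {'subj': set(agent_key_to_hash['subj'])}
assert all_keys_by_role == {'subj': {('HGNC', '6871')}}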
def _generate_id_maps(self, unique_stmts, *args, **kwargs):
    """Connect statements using their refinement relationship."""
    stmt_to_idx = {
        stmt.get_hash(matches_fun=self.matches_fun): idx
        for idx, stmt in enumerate(unique_stmts)
    }
    # Make a dict of Statements by type
    stmts_by_type = collections.defaultdict(list)
    for stmt in unique_stmts:
        stmts_by_type[indra_stmt_type(stmt)].append(stmt)
    stmts_by_type = dict(stmts_by_type)
    # Here we handle split_idx to allow finding refinements between
    # two distinct groups of statements (identified by an index at which we
    # split the unique_statements list) rather than globally across
    # all unique statements.
    split_idx = kwargs.pop('split_idx', None)
    if split_idx:
        # This dict maps statement hashes to a bool value based on which
        # of the two groups the statement belongs to.
        hash_to_split_group = {
            sh: (idx <= split_idx)
            for sh, idx in stmt_to_idx.items()
        }
    else:
        hash_to_split_group = None
    maps = []
    for stmt_type, stmts in stmts_by_type.items():
        logger.info('Finding refinements for %d %s statements'
                    % (len(stmts), stmt_type.__name__))
        maps += self._generate_hash_maps_by_stmt_type(
            stmts, stmts[0]._agent_order,
            split_groups=hash_to_split_group)
    idx_maps = [(stmt_to_idx[refinement], stmt_to_idx[refined])
                for refinement, refined in maps]
    return idx_maps
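# Illustrative sketch (not part of the original module): split_idx divides
# the unique statement list into two groups so refinements are only sought
# across the split rather than within either group. The toy mapping below
# mirrors how hash_to_split_group is built above; the hashes are made up.
stmt_to_idx = {'hash_a': 0, 'hash_b': 1, 'hash_c': 2}
split_idx = 0
hash_to_split_group = {sh: (idx <= split_idx)
                       for sh, idx in stmt_to_idx.items()}
# hash_a falls in the first group, hash_b and hash_c in the second
assert hash_to_split_group == {'hash_a': True,
                               'hash_b': False,
                               'hash_c': False}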
def _generate_id_maps(self, unique_stmts, poolsize=None,
                      size_cutoff=100, split_idx=None):
    """Connect statements using their refinement relationships."""
    if not self.ontology._initialized:
        self.ontology.initialize()
    if len(unique_stmts) > 10000:
        self.ontology._build_transitive_closure()
    # Check arguments relating to multiprocessing
    if poolsize is None:
        logger.debug('combine_related: poolsize not set, '
                     'not using multiprocessing.')
        use_mp = False
    elif sys.version_info[0] >= 3 and sys.version_info[1] >= 4:
        use_mp = True
        logger.info('combine_related: Python >= 3.4 detected, '
                    'using multiprocessing with poolsize %d, '
                    'size_cutoff %d' % (poolsize, size_cutoff))
    else:
        use_mp = False
        logger.info('combine_related: Python < 3.4 detected, '
                    'not using multiprocessing.')

    # Make a list of Statement types
    stmts_by_type = collections.defaultdict(lambda: [])
    for idx, stmt in enumerate(unique_stmts):
        stmts_by_type[indra_stmt_type(stmt)].append((idx, stmt))

    child_proc_groups = []
    parent_proc_groups = []
    skipped_groups = 0
    # Each Statement type can be preassembled independently
    for stmt_type, stmts_this_type in stmts_by_type.items():
        logger.info('Grouping %s (%s)' % (stmt_type.__name__,
                                          len(stmts_this_type)))
        stmt_by_group = self._get_stmt_by_group(stmt_type, stmts_this_type,
                                                self.ontology)
        # Divide statements by group size
        # If we're not using multiprocessing, then all groups are local
        for g_name, g in stmt_by_group.items():
            if len(g) < 2:
                skipped_groups += 1
                continue
            if use_mp and len(g) >= size_cutoff:
                child_proc_groups.append(g)
            else:
                parent_proc_groups.append(g)

    # Now run preassembly!
    logger.debug("Groups: %d parent, %d worker, %d skipped." %
                 (len(parent_proc_groups), len(child_proc_groups),
                  skipped_groups))

    supports_func = functools.partial(_set_supports_stmt_pairs,
                                      ontology=self.ontology,
                                      split_idx=split_idx,
                                      check_entities_match=False,
                                      refinement_fun=self.refinement_fun)

    # Check if we are running any groups in child processes; note that if
    # use_mp is False, child_proc_groups will be empty
    if child_proc_groups:
        # Get a multiprocessing context
        ctx = mp.get_context('spawn')
        pool = ctx.Pool(poolsize)
        # Run the large groups remotely
        logger.debug("Running %d groups in child processes" %
                     len(child_proc_groups))
        res = pool.map_async(supports_func, child_proc_groups)
        workers_ready = False
    else:
        workers_ready = True

    # Run the small groups locally
    logger.debug("Running %d groups in parent process" %
                 len(parent_proc_groups))
    stmt_ix_map = [supports_func(stmt_tuples)
                   for stmt_tuples in parent_proc_groups]
    logger.debug("Done running parent process groups")

    while not workers_ready:
        logger.debug("Checking child processes")
        if res.ready():
            workers_ready = True
            logger.debug('Child process group comparisons successful? %s' %
                         res.successful())
            if not res.successful():
                # The get method re-raises the underlying error that we can
                # now catch and print.
                try:
                    res.get()
                except Exception as e:
                    raise Exception("Sorry, there was a problem with "
                                    "preassembly in the child processes: %s"
                                    % e)
            else:
                stmt_ix_map += res.get()
            logger.debug("Closing pool...")
            pool.close()
            logger.debug("Joining pool...")
            pool.join()
            logger.debug("Pool closed and joined.")
        time.sleep(1)
    logger.debug("Done.")
    # Combine all redundant map edges
    stmt_ix_map_set = set([])
    for group_ix_map in stmt_ix_map:
        for ix_pair in group_ix_map:
            stmt_ix_map_set.add(ix_pair)
    return stmt_ix_map_set
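# Illustrative sketch (not part of the original module): the dispatch logic
# above ships only groups at or above size_cutoff to a worker pool and keeps
# the rest in the parent process. The standalone helper below reproduces
# just that routing decision; its name, the group contents, and the cutoff
# are made up.
def _route_groups(groups, poolsize, size_cutoff):
    use_mp = poolsize is not None
    child, parent, skipped = [], [], 0
    for g in groups:
        if len(g) < 2:
            skipped += 1          # nothing to compare within the group
        elif use_mp and len(g) >= size_cutoff:
            child.append(g)       # big enough to ship to a worker
        else:
            parent.append(g)      # small: cheaper to run locally
    return child, parent, skipped

child, parent, skipped = _route_groups(
    [[1], [1, 2], [1, 2, 3, 4]], poolsize=4, size_cutoff=4)
assert (len(child), len(parent), skipped) == (1, 1, 1)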
def _generate_id_maps(self, unique_stmts, poolsize=None,
                      size_cutoff=100, split_idx=None):
    """Connect statements using their refinement relationships."""
    # Check arguments relating to multiprocessing
    if poolsize is None:
        logger.debug('combine_related: poolsize not set, '
                     'not using multiprocessing.')
        use_mp = False
    elif sys.version_info[0] >= 3 and sys.version_info[1] >= 4:
        use_mp = True
        logger.info('combine_related: Python >= 3.4 detected, '
                    'using multiprocessing with poolsize %d, '
                    'size_cutoff %d' % (poolsize, size_cutoff))
    else:
        use_mp = False
        logger.info('combine_related: Python < 3.4 detected, '
                    'not using multiprocessing.')

    eh = self.hierarchies['entity']

    # Make a list of Statement types
    stmts_by_type = collections.defaultdict(lambda: [])
    for idx, stmt in enumerate(unique_stmts):
        stmts_by_type[indra_stmt_type(stmt)].append((idx, stmt))

    child_proc_groups = []
    parent_proc_groups = []
    skipped_groups = 0
    # Each Statement type can be preassembled independently
    for stmt_type, stmts_this_type in stmts_by_type.items():
        logger.info('Grouping %s (%s)' % (stmt_type.__name__,
                                          len(stmts_this_type)))
        stmt_by_group = self._get_stmt_by_group(stmt_type, stmts_this_type,
                                                eh)
        # Divide statements by group size
        # If we're not using multiprocessing, then all groups are local
        for g_name, g in stmt_by_group.items():
            if len(g) < 2:
                skipped_groups += 1
                continue
            if use_mp and len(g) >= size_cutoff:
                child_proc_groups.append(g)
            else:
                parent_proc_groups.append(g)

    # Now run preassembly!
    logger.debug("Groups: %d parent, %d worker, %d skipped." %
                 (len(parent_proc_groups), len(child_proc_groups),
                  skipped_groups))

    supports_func = functools.partial(_set_supports_stmt_pairs,
                                      hierarchies=self.hierarchies,
                                      split_idx=split_idx,
                                      check_entities_match=False)

    # Check if we are running any groups in child processes; note that if
    # use_mp is False, child_proc_groups will be empty
    if child_proc_groups:
        # Get a multiprocessing context
        ctx = mp.get_context('spawn')
        pool = ctx.Pool(poolsize)
        # Run the large groups remotely
        logger.debug("Running %d groups in child processes" %
                     len(child_proc_groups))
        res = pool.map_async(supports_func, child_proc_groups)
        workers_ready = False
    else:
        workers_ready = True

    # Run the small groups locally
    logger.debug("Running %d groups in parent process" %
                 len(parent_proc_groups))
    stmt_ix_map = [supports_func(stmt_tuples)
                   for stmt_tuples in parent_proc_groups]
    logger.debug("Done running parent process groups")

    while not workers_ready:
        logger.debug("Checking child processes")
        if res.ready():
            workers_ready = True
            logger.debug('Child process group comparisons successful? %s' %
                         res.successful())
            if not res.successful():
                raise Exception("Sorry, there was a problem with "
                                "preassembly in the child processes.")
            else:
                stmt_ix_map += res.get()
            logger.debug("Closing pool...")
            pool.close()
            logger.debug("Joining pool...")
            pool.join()
            logger.debug("Pool closed and joined.")
        time.sleep(1)
    logger.debug("Done.")
    # Combine all redundant map edges
    stmt_ix_map_set = set([])
    for group_ix_map in stmt_ix_map:
        for ix_pair in group_ix_map:
            stmt_ix_map_set.add(ix_pair)
    return stmt_ix_map_set