def filter_ud(graph, blacklist):
    not_words = ["no", "not", "nicht", "kein"]
    edges = [edge for edge in graph.edges(data=True)]
    cond_nodes = []
    for in_node, out_node, t in edges:
        # collect heads of nmod relations whose case marker is blacklisted
        if t["color"] == "case" and out_node.split("_")[0] in blacklist:
            for in_, out_, t_ in edges:
                if t_["color"] == "nmod" and (in_ == in_node or out_ == in_node):
                    cond_nodes.append(in_node)
        # also collect nodes adjacent to negation words
        if in_node.split("_")[0] in not_words or out_node.split("_")[0] in not_words:
            cond_nodes.append(in_node)
    # drop every node reachable from a collected node
    to_delete = []
    for cond_node in cond_nodes:
        for node in graph.nodes():
            if cond_node in graph and node in graph:
                if algorithms.has_path(graph, cond_node, node):
                    to_delete.append(node)
    for node in to_delete:
        if node in graph.nodes(default=None):
            graph.remove_node(node)
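# A minimal usage sketch for filter_ud (hedged: the toy graph, the node labels and the
# "without" blacklist entry are illustrative, not from the source). It assumes
# `from networkx import algorithms` is available at module level, since the function
# body refers to `algorithms.has_path`.
import networkx as nx
from networkx import algorithms

G = nx.DiGraph()
G.add_edge("walk_1", "rain_2", color="nmod")     # nmod dependency
G.add_edge("rain_2", "without_3", color="case")  # blacklisted case marker
filter_ud(G, ["without"])
print(list(G.nodes()))  # ['walk_1'] -- 'rain_2' and everything reachable from it is removed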
def get_contigs(G, start_node_list, end_node_list):
    """Get all paths from input to output nodes: returns a list of contigs and their size"""
    contigs = []
    for source in start_node_list:
        for target in end_node_list:
            if algorithms.has_path(G, source, target):
                path = algorithms.shortest_path(G, source, target)
                contig = path[0]
                for i in range(len(path) - 1):
                    contig += path[i + 1][-1]
                contigs.append((contig, len(contig)))
    return contigs
def get_contigs(graph, list_start_node, list_end_node):
    '''Takes a graph, a list of entry nodes and a list of exit nodes and
    returns a list of tuples (contig, contig_size).'''
    contigs = []
    for source in list_start_node:
        for target in list_end_node:
            if algorithms.has_path(graph, source, target):
                path = algorithms.shortest_path(graph, source, target)
                contig = path[0]
                for i in range(len(path) - 1):
                    contig += path[i + 1][-1]
                contigs.append((contig, len(contig)))
    return contigs
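# A minimal usage sketch for get_contigs, assuming `from networkx import algorithms`
# at module level and a k-mer overlap graph where each edge extends the walk by one
# character (the toy 3-mers below are illustrative).
import networkx as nx
from networkx import algorithms

G = nx.DiGraph()
G.add_edge("ATG", "TGC")
G.add_edge("TGC", "GCA")
print(get_contigs(G, ["ATG"], ["GCA"]))  # [('ATGCA', 5)]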
def asim_jac_nodes_with_backup(self, graph_premise, graph_hypothesis):
    """
    Asymmetric Jaccard similarity between the nodes of the definition graphs.
    If the score is not 1, it calculates the asymmetric Jaccard similarity
    between the edges without the hypothesis root node.
    :param graph_premise: the definition graph of the premise
    :param graph_hypothesis: the definition graph of the hypothesis
    :return: the ratio of overlapping nodes per the length of the hypothesis definition
    """
    node_score = self.asim_jac_nodes(graph_premise, graph_hypothesis)
    edge_score = 0
    if 0.0 < node_score < 1.0:
        root = graph_hypothesis.d_clean(graph_hypothesis.root).split("_")[0]
        if root in graph_premise.get_nodes():
            root_id = [node for node in graph_premise.G.nodes()
                       if self.clear_node(node) == root][0]
            graph_premise_only_zero = copy.deepcopy(graph_premise)
            delete_list = []
            # strip every edge with a non-zero "color", keeping only the zero edges
            for edge in graph_premise_only_zero.G.adj.items():
                for output_node in edge[1].items():
                    inner_delete_list = []
                    for edge_type in output_node[1].items():
                        if edge_type[1]["color"]:
                            inner_delete_list.append(edge_type[0])
                    for inner_del in inner_delete_list:
                        del output_node[1]._atlas[inner_del]
                    if len(output_node[1]) < 1:
                        delete_list.append(output_node[0])
                for to_del in delete_list:
                    if to_del in edge[1]._atlas:
                        del edge[1]._atlas[to_del]
            try:
                if algorithms.has_path(graph_premise_only_zero.G,
                                       graph_premise.root, root_id):
                    return 1.0
            except Exception as e:
                print("Error occurred:", e)
        graph_hypothesis_wo_root = copy.deepcopy(graph_hypothesis)
        graph_hypothesis_wo_root.G.remove_node(graph_hypothesis_wo_root.root)
        # edge_score = self.asim_jac_edges(graph_premise, graph_hypothesis_wo_root)
        return self.asim_jac_edges(graph_premise, graph_hypothesis_wo_root)
        # return max([node_score, edge_score])
    return node_score
def filter_graph(self, condition):
    nodes = self.G.nodes(default=None)
    cond_nodes = []
    to_delete = []
    for node in nodes:
        cl = self.d_clean(node)
        if condition == cl.split("_")[0]:
            cond_nodes.append(node)
    for cond_node in cond_nodes:
        for node in nodes:
            if cond_node in self.G and node in self.G:
                if algorithms.has_path(self.G, cond_node, node):
                    to_delete.append(node)
    for node in to_delete:
        if node in self.G.nodes(default=None):
            self.G.remove_node(node)
def blacklisting(self, graph):
    one_two_blacklist = ["A", "a", "b", "B"]
    for adj in graph.G._adj.values():
        for a in adj.items():
            if {'color': 2} in a[1].values():
                new_blacklist_item = a[0]
                for node in graph.G.nodes:
                    if algorithms.has_path(graph.G, new_blacklist_item, node):
                        blacklist_node = graph.d_clean(node)
                        if blacklist_node != graph.root:
                            one_two_blacklist.append(blacklist_node.split('_')[0])
                new_blacklist_item = graph.d_clean(new_blacklist_item)
                if new_blacklist_item != graph.root:
                    one_two_blacklist.append(new_blacklist_item.split('_')[0])
    return one_two_blacklist
def whitelisting(self, graph):
    whitelist = [graph.root]
    zero_graph = copy.deepcopy(graph)
    delete_list = []
    # remove every edge with a non-zero "color" so that only zero edges remain
    for edge in zero_graph.G.adj.items():
        for output_node in edge[1].items():
            inner_delete_list = []
            for edge_type in output_node[1].items():
                if edge_type[1]["color"]:
                    inner_delete_list.append(edge_type[0])
            for inner_del in inner_delete_list:
                del output_node[1]._atlas[inner_del]
            if len(output_node[1]) < 1:
                delete_list.append(output_node[0])
        for to_del in delete_list:
            if to_del in edge[1]._atlas:
                del edge[1]._atlas[to_del]
    for node in zero_graph.G.nodes():
        if algorithms.has_path(zero_graph.G, graph.root, node):
            whitelist.append(node)
    whitelist.append(graph.root)
    return whitelist
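# The same idea sketched with only the public networkx API (an assumption-laden
# illustration, not the source's implementation): it presumes the graph is a
# MultiDiGraph and that a falsy "color" attribute marks the "zero" edges kept above.
import networkx as nx

def whitelist_nodes(G, root):
    zero = nx.MultiDiGraph()
    zero.add_nodes_from(G.nodes())
    zero.add_edges_from((u, v, k, d) for u, v, k, d in G.edges(keys=True, data=True)
                        if not d.get("color"))
    # every node reachable from the root over zero-colored edges is whitelisted
    return [n for n in zero.nodes() if nx.has_path(zero, root, n)]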
def find_complexSV(self):
    pool = []
    nodes = self.graph.nodes()
    for n1 in nodes:
        for n2 in nodes:
            if n1 == n2:
                continue
            pre = has_path(self.graph, n1, n2)
            if pre:
                # do not consider edge weight
                paths = list(all_shortest_paths(self.graph, n1, n2, weight=None))
                for p in paths:
                    if not self._containloop(p):
                        pool.append((len(p), p))
    pool.sort(reverse=True)

    # so far, the candidate paths contain no self-loops, but are still redundant
    # check distance-decay for each pair of regions
    queue = [(self.clr, self._change_format(p[1]), self.span, self.balance_type,
              p[1], self.protocol) for p in pool]
    log.info('Filtering {0} redundant candidates ...'.format(len(queue)))
    jobs = Parallel(n_jobs=self.n_jobs, verbose=10)(delayed(filterAssembly)(*i) for i in queue)
    pre_alleles = []
    for ck, p in jobs:
        if ck:
            pre_alleles.append(p)

    # these assemblies should exist within the same allele
    alleles = []
    for p in pre_alleles:
        for v in alleles:
            if self._issubset(p, v) or self._issubset(p, self._getreverse(v)):
                break
        else:
            alleles.append(p)

    self.alleles = alleles
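# Minimal sketch of the candidate-path enumeration step above on a toy graph
# (illustrative only; the real method additionally drops paths through self-loops
# and filters the pool by distance decay in parallel).
import networkx as nx
from networkx import has_path, all_shortest_paths

G = nx.Graph([("a", "b"), ("b", "c"), ("a", "c"), ("c", "d")])
pool = []
for n1 in G.nodes():
    for n2 in G.nodes():
        if n1 != n2 and has_path(G, n1, n2):
            for p in all_shortest_paths(G, n1, n2, weight=None):
                pool.append((len(p), p))
pool.sort(reverse=True)  # longest candidates first
print(pool[0])           # (3, ['d', 'c', 'b']) on this toy graph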
def answer_quest(q, talker):
    '''
    given question q, interacts with talker and returns its best answers
    '''
    max_answers = talker.params.max_answers
    db = talker.db
    sent_data, l2occ = db
    unknowns = []
    q_lemmas = []
    if talker.params.with_answerer:
        answerer = Talker(from_text=q)
        q_sent_data, _ = answerer.db
        for j, q_lemma in enumerate(q_sent_data[0][LEMMA]):
            q_sent_data, q_l2occ = answerer.db
            q_tag = q_sent_data[0][TAG][j]
            if q_tag[0] not in "NVJ":
                continue
            # ppp(q_lemma,q_tag)
            q_lemmas.append((q_lemma, wn_tag(q_tag)))
    else:
        answerer = None
        from nltk.tokenize import word_tokenize
        from nltk.stem import WordNetLemmatizer
        wnl = WordNetLemmatizer()
        toks = word_tokenize(q)
        tag = None
        for t in toks:
            # try noun, verb, then adjective lemmatization until the form changes
            tag = 'n'
            l = wnl.lemmatize(t, tag)
            if l == t:
                tag = 'v'
                l = wnl.lemmatize(t, tag)
            if l == t:
                tag = 'a'
                l = wnl.lemmatize(t, tag)
            l = l.lower()
            q_lemmas.append((l, tag))

    matches = []
    nears = []
    sharesDict = defaultdict(set)
    count = defaultdict(int)
    for q_lemma, wn_q_tag in q_lemmas:
        if not good_word(q_lemma) or q_lemma in ".?":
            continue
        # actual QA starts here
        ys = l2occ.get(q_lemma)
        if ys:
            matches.append(q_lemma)
            for sent, _pos in ys:
                sharesDict[sent].add(q_lemma)
                count[q_lemma] += 1
        else:
            if talker.params.expand_query > 0:
                related = wn_all(talker.params.expand_query, 3, q_lemma, wn_q_tag)
                for r_lemma in related:
                    if not good_word(q_lemma):
                        continue
                    zs = l2occ.get(r_lemma)
                    if not zs:
                        tprint("UNKNOWNS:", q_lemma, '\n')
                        continue
                    nears.append(r_lemma)
                    tprint('EXPANDED:', q_lemma, '-->', r_lemma)
                    # record where the related lemma occurs
                    # (assumption: walk zs here, mirroring the ys branch above)
                    for sent, _pos in zs:
                        sharesDict[sent].add(r_lemma)
                        count[r_lemma] += 1
    print('count:', count)

    # lemmas occurring too often are downweighted later
    ignored = []
    for lemma in count:
        if count[lemma] > 3:
            ignored.append(lemma)
    print('ignored:', ignored)

    lavg = talker.avg_len
    best = []
    for id in sharesDict:
        sent = sent_data[id][SENT]
        lsent = len(sent)
        if lsent > 2 * lavg:
            # skip overly long sentences that share only a single, ignored lemma
            sharedNum = len(sharesDict[id])
            if sharedNum == 1:
                shares = list(sharesDict[id])
                if shares[0] in ignored:
                    continue
        r = 0
        for key in matches:
            if key in ignored:
                if key in sharesDict[id]:
                    r += 1.0
                continue
            if nxAlg.has_path(talker.g, key, id):
                nodes = nxAlg.shortest_path(talker.g, key, id)
                if len(nodes) < 6:
                    n = math.pow(2, len(nodes) - 1)
                    r += 16.0 / n
        for key in nears:
            if key in ignored:
                if key in sharesDict[id]:
                    r += 0.5
                continue
            if nxAlg.has_path(talker.g, key, id):
                nodes = nxAlg.shortest_path(talker.g, key, id)
                print('****************nears, key:id=', key, ':', id,
                      ', get nodes, length:', len(nodes), 'nodes:', nodes)
                if len(nodes) < 6:
                    n = math.pow(2, len(nodes) - 1)
                    r += 8.0 / n
        best.append((r, id, sharesDict[id], sent))
    best.sort(reverse=True)

    answers = []
    last_rank = 0
    for i, b in enumerate(best):
        if i >= max_answers:
            break
        # ppp(i,b)
        rank, id, shared, sent = b
        if last_rank != 0:
            if rank / last_rank < 0.70:
                break
        last_rank = rank
        answers.append((id, sent, round(rank, 4), shared))

    return answers, answerer
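# Quick check of the path-distance scoring above: a matched lemma reaching the
# sentence node over a path of `len(nodes)` nodes contributes 16 / 2**(len(nodes) - 1)
# (and half of that, 8 / 2**(len(nodes) - 1), for expansion-based "near" matches).
import math
for path_len in range(2, 6):
    print(path_len, 16.0 / math.pow(2, path_len - 1))  # 8.0, 4.0, 2.0, 1.0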
def hasPath(self, nodA, nodB):
    '''
    See if a path exists between two nodes.
    '''
    return alg.has_path(self.graph, nodA, nodB)
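# Minimal usage sketch: the enclosing class is not shown in the source, so a tiny
# illustrative host class (hypothetical) is used here, with `alg` assumed to be
# `networkx.algorithms`.
import networkx as nx
from networkx import algorithms as alg

class _GraphDemo:
    def __init__(self, graph):
        self.graph = graph
    hasPath = hasPath  # bind the module-level function above as a method

print(_GraphDemo(nx.path_graph(4)).hasPath(0, 3))  # True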