def exclude(edge):
    """Return True when edge should be excluded (non-edges always are)."""
    if not sym.is_edge(edge):
        return True
    rel = edge[0]
    # relations that are themselves edges are never excluded
    if sym.is_edge(rel):
        return False
    return rel in EXCLUDE_RELS
def rel_contains(full_edge, term):
    """Check whether the relation part of full_edge contains (or equals) term."""
    if not sym.is_edge(full_edge) or len(full_edge) <= 2:
        return False
    # require either arity > 3 or an edge in the third position
    if len(full_edge) <= 3 and not sym.is_edge(full_edge[2]):
        return False
    rel = full_edge[0]
    return term in rel if sym.is_edge(rel) else rel == term
def rel_has_term(self, edge):
    """Check whether the relation part of edge contains (or equals) self.term."""
    if not sym.is_edge(edge) or len(edge) <= 2:
        return False
    # require either arity > 3 or an edge in the third position
    if len(edge) <= 3 and not sym.is_edge(edge[2]):
        return False
    rel = edge[0]
    return self.term in rel if sym.is_edge(rel) else rel == self.term
def edge2label(edge):
    """Render an edge as a human-readable label string.

    A leading '+' relation and a leading '+' character on the first
    symbol are stripped before the parts are joined with spaces.
    """
    if not sym.is_edge(edge):
        return str(edge)
    parts = list(edge)
    if parts[0] == '+':
        parts = parts[1:]
    if not sym.is_edge(parts[0]) and parts[0][0] == '+':
        parts[0] = parts[0][1:]
    return ' '.join(edge2label(part) for part in parts)
def get_label(self, edge):
    """Return the stored label for edge, falling back to its string form."""
    label_edges = self.pattern2edges([const.has_label, edge, None])
    if label_edges:
        symbol = label_edges.pop()[2]
        # only non-edge label symbols are usable as labels
        if not sym.is_edge(symbol):
            return sym.symbol2str(symbol)
    return sym.symbol2str(edge)
def add_edge(self, edge_ns):
    """Register a namespaced edge (or symbol) in the meronomy.

    Returns False when a non-edge symbol is a too-common word
    (prob above MAX_PROB); otherwise records the vertex, its depth,
    recurses into sub-edges and, for concepts, links parts to the
    whole. Returns True when the edge was added.
    """
    is_edge = sym.is_edge(edge_ns)
    edge = ed.without_namespaces(edge_ns)
    # discard common words
    if not is_edge:
        word = self.parser.make_word(edge)
        if word.prob > MAX_PROB:
            return False
    orig = edge2str(edge)
    # add to edge_map
    if orig not in self.edge_map:
        self.edge_map[orig] = set()
    self.edge_map[orig].add(edge_ns)
    concept = is_concept(edge)
    self.vertices.add(orig)
    self.atoms[orig] = ed.depth(edge)
    if is_edge:
        # recurse into components; only concepts produce part->whole links
        for e in edge_ns:
            # NOTE(review): e is still namespaced here while orig was built
            # from the stripped edge — confirm edge2str handles both forms
            targ = edge2str(e)
            if targ:
                if self.add_edge(e):
                    if concept:
                        self.add_link(orig, targ)
    return True
def enrich_edge(parser, edge):
    """Recursively annotate an edge (or symbol) with word statistics.

    For edges, returns {'edge', 'eedge', 'words', 'prob', 'word_count',
    'mean_prob'} aggregated over the enriched sub-items; for symbols,
    returns {'symbol', 'words', 'prob', 'word_count', 'mean_prob'}
    built from the symbol's tokens.
    """
    if sym.is_edge(edge):
        eedge = [enrich_edge(parser, item) for item in edge]
        prob = 1.
        total_prob = 0.
        word_count = 0
        words = []
        for item in eedge:
            word_count += item['word_count']
            prob *= item['prob']
            total_prob += item['prob'] * item['word_count']
            words += item['words']
        # guard against word_count == 0 (an edge of empty symbols);
        # the symbol branch below already handles this case the same way
        if word_count > 0:
            mean_prob = total_prob / word_count
        else:
            mean_prob = 1.
        return {'edge': edge, 'eedge': eedge, 'words': words, 'prob': prob,
                'word_count': word_count, 'mean_prob': mean_prob}
    ngram = sym.symbol2str(edge)
    tokens = [token for token in ngram.split(' ') if len(token) > 0]
    # strip leading '+' markers and drop tokens that become empty
    for i in range(len(tokens)):
        if tokens[i][0] == '+':
            tokens[i] = tokens[i][1:]
    tokens = [token for token in tokens if len(token) > 0]
    words = [parser.make_word(token) for token in tokens]
    prob = 1.
    total_prob = 0.
    for word in words:
        p = math.exp(word.prob)  # word.prob is exponentiated, so a log-prob
        prob *= p
        total_prob += p
    word_count = len(words)
    if word_count > 0:
        mean_prob = total_prob / word_count
    else:
        mean_prob = 1.
    return {'symbol': edge, 'words': words, 'prob': prob,
            'word_count': word_count, 'mean_prob': mean_prob}
def infer_from_edge(self, edge, arity):
    """Infer claims and conflicts from an edge with a known predicate.

    edge[0] is the predicate, edge[1] the originating actor; remaining
    items are classified as target actors or concepts. Updates the
    claim/conflict counters and mention/conflict records.
    """
    pred = edge[0]
    # predicate must be enabled for this arity in the predicate table
    if not self.pred_table[pred][str(arity)]:
        return
    actor_orig = syn.main_synonym(self.hg, edge[1])
    if self.is_actor(actor_orig):
        actor_targs = set()
        concepts = set()
        # split the remaining entities into actors and concepts
        for entity in edge[2:]:
            syn_entity = syn.main_synonym(self.hg, entity)
            if self.is_actor(syn_entity):
                actor_targs.add(syn_entity)
            else:
                concepts |= self.get_concepts(entity)
        if self.pred_table[pred]['claim']:
            # a claim requires the third element to itself be an edge
            if len(edge) > 2 and sym.is_edge(edge[2]):
                self.claims += 1
                for concept in concepts:
                    self.add_mention(actor_orig, concept, edge)
        if self.pred_table[pred]['conflict']:
            # one conflict per target actor, plus per-concept topics
            for actor in actor_targs:
                self.conflicts += 1
                self.add_conflict(actor_orig, actor)
                for concept in concepts:
                    self.add_conflict_over(actor_orig, actor, concept, edge)
def is_concept(edge):
    """A concept is a plain symbol, or a multi-item edge rooted in '+'
    whose arguments are all themselves concepts."""
    if sym.is_edge(edge) and len(edge) > 1:
        return edge[0] == '+' and all(is_concept(item) for item in edge[1:])
    return True
def is_candidate(edge):
    """Reject edges whose second element is a possessive/preposition-like symbol."""
    stop_words = {"'s", 'in', 'of', 'with', 'and', 'a', 'on', 'for', 'to',
                  'from'}
    if sym.is_edge(edge) and len(edge) > 1:
        # discard possessives and similar function words
        return edge[1] not in stop_words
    return True
def post_assignments(self, edge):
    """Recursively re-register noun/proper-noun symbols in edge_map."""
    if sym.is_edge(edge):
        for part in edge:
            self.post_assignments(part)
        return
    term = self.edge2str(edge)
    # only symbols already known to the map, and only nouns
    if term in self.edge_map and edge.endswith(('noun', 'propn')):
        self.edge_map[term].add(edge)
def add_edges(self, edge):
    """Recursively count occurrences of edge and all of its components.

    Non-edge symbols are filtered: a leading '+' is stripped, and empty
    strings, non-alphanumeric-initial strings and too-common words are
    skipped.
    """
    if sym.is_edge(edge):
        for item in edge:
            self.add_edges(item)
    edge_str = ed.edge2str(edge, namespaces=False)
    if not sym.is_edge(edge):
        # check emptiness BEFORE indexing — edge2str can yield '',
        # and the original indexed edge_str[0] first (IndexError)
        if len(edge_str) == 0:
            return
        if edge_str[0] == '+':
            edge_str = edge_str[1:]
        if len(edge_str) == 0:
            return
        if not edge_str[0].isalnum():
            return
        # discard very common words
        if self.parser.make_word(edge_str).prob > MAX_PROB:
            return
    if edge_str not in self.edge_counts:
        self.edge_counts[edge_str] = 0
    self.edge_counts[edge_str] += 1
def recover_words(self, edge):
    """Recursively re-attach noun/proper-noun symbols to their edge_map entry."""
    if sym.is_edge(edge):
        for part in edge:
            self.recover_words(part)
        return
    term = edge2str(edge)
    # only symbols already known to the map, and only nouns
    if term in self.edge_map and edge.endswith(('noun', 'propn')):
        self.edge_map[term].add(edge)
def contains(edge, concept, deep=False):
    """Test whether edge contains concept (recursively when deep is True)."""
    if not sym.is_edge(edge):
        return edge == concept
    for item in edge:
        if item == concept:
            return True
        if deep and contains(item, concept, True):
            return True
    return False
def main_synonym(hg, edge, in_adp=False):
    """Finds the main synonym of an edge or symbol.

    The main synonym is usually a special type of symbol that all
    synonyms point to, used as an identifier for the synonym set.

    If in_adp is True, adpositional phrases such as
    (+/gb with/nlp.with.adp india/nlp.india.propn) are unwrapped and the
    main synonym of the non-adposition element is returned.

    When no main synonym exists, the edge or symbol itself is returned.
    """
    if in_adp and sym.is_edge(edge) and len(edge) == 3 and edge[0] == '+/gb':
        first, second = edge[1], edge[2]
        if not sym.is_edge(first) and first[-4:] == '.adp':
            return main_synonym(hg, second)
        if not sym.is_edge(second) and second[-4:] == '.adp':
            return main_synonym(hg, first)
    matches = hg.pattern2edges([cons.are_synonyms, edge, None])
    if len(matches) > 0:
        return matches.pop()[2]
    return edge
def add_claim(self, edge):
    """Add a claim edge: record its vertex/depth and link parts to the whole."""
    orig = self.edge2str(edge)
    if not orig:
        return
    self.vertices.add(orig)
    self.atoms[orig] = ed.depth(edge)
    if not sym.is_edge(edge):
        return
    for element in edge:
        targ = self.edge2str(element)
        if not targ:
            continue
        self.vertices.add(targ)
        self.atoms[targ] = ed.depth(element)
        self.add_link(orig, targ)
        self.add_claim(element)
def synonym_ids_in(self, edge):
    """Collect the synonym-set ids of edge and, recursively, its components."""
    sids = set()
    own_id = self.syn_id(self.edge2str(edge))
    if own_id:
        sids.add(own_id)
    if sym.is_edge(edge):
        for element in edge:
            element_id = self.syn_id(self.edge2str(element))
            if element_id:
                sids.add(element_id)
            # recursion also covers the element itself; the add above is
            # a harmless duplicate into a set
            sids |= self.synonym_ids_in(element)
    return sids
def lemmatize(self, edge):
    """Return the (recursive) lemma of edge, memoized in self.lemmas."""
    if edge in self.lemmas:
        return self.lemmas[edge]
    if sym.is_edge(edge):
        lemma = tuple(self.lemmatize(item) for item in edge)
    else:
        lemma = edge
        matches = self.hg.pattern2edges((const.have_same_lemma, edge, None))
        if len(matches) > 0:
            lemma = matches.pop()[2]
    self.lemmas[edge] = lemma
    return lemma
def find_co_synonyms(self, edge):
    """Return the synonym ids for edge and, recursively, its components.

    NOTE(review): non-edge symbols yield an empty set — atoms are only
    matched at the edge level; confirm this is intended.
    """
    co_syns = set()
    if sym.is_edge(edge):
        for item in edge:
            co_syns |= self.find_co_synonyms(item)
        edge_str = ed.edge2str(edge, namespaces=False)
        # direct membership test instead of scanning the whole atom_set
        # for an equality match (accidental O(n) in the original)
        if edge_str in self.atom_set:
            co_syns.add(self.synonym_map[edge_str])
    return co_syns
def get_concepts(self, edge):
    """Return the set of main-synonym concepts present in edge."""
    if not sym.is_edge(edge):
        # filter out very common words...
        word = self.parser.make_word(unidecode(ed.without_namespaces(edge)))
        if word.prob > MAX_PROB:
            return set()
        # ...and marker-prefixed symbols
        if edge[0] in {'`', '_', "'"}:
            return set()
        return {syn.main_synonym(self.hg, edge)}
    concepts = {syn.main_synonym(self.hg, edge)}
    if len(edge) > 1:
        for item in edge[1:]:
            concepts |= self.get_concepts(item)
    return concepts
def edge2str(self, edge):
    """String form of edge (no namespaces); None for discarded symbols.

    Symbols are discarded when they reduce to the empty string after
    stripping a leading '+', or when the corresponding word is too
    common (prob >= MAX_PROB).
    """
    s = ed.edge2str(edge, namespaces=False)
    if sym.is_edge(edge):
        return s
    # check emptiness BEFORE indexing — edge2str can yield '', and the
    # original indexed s[0] first (IndexError)
    if len(s) == 0:
        return None
    if s[0] == '+':
        s = s[1:]
    if len(s) == 0:
        return None
    word = self.parser.make_word(s)
    if word.prob < MAX_PROB:
        return s
    return None
def valid_symbol(s):
    """Keep edges and plain 'nlp'-namespace content-word symbols only."""
    if sym.is_edge(s):
        return True
    if sym.is_root(s):
        return False
    ns = sym.nspace(s)
    if ns == 'gb':
        return False
    if s[0] == '+':
        return False
    if ns[:3] != 'nlp':
        return False
    # drop function-word parts of speech
    return not any(ns.endswith(pos) for pos in ('adp', 'det', 'verb', 'pron'))
def add_claim(self, edge):
    """Add a claim edge: link each part to the whole, and peers pairwise."""
    orig = self.edge2syn(edge)
    if not orig:
        return
    self.vertices.add(orig)
    if not sym.is_edge(edge):
        return
    elements = []
    # links from part to whole
    for element in edge:
        targ = self.edge2syn(element)
        if targ:
            elements.append(targ)
            self.vertices.add(targ)
            self.add_link(orig, targ)
            self.add_claim(element)
    # links between peers
    for pair in itertools.combinations(elements, 2):
        self.add_link(*pair)
def json_str(hg, symbol):
    """Build the conflict-view data structure for symbol.

    NOTE(review): despite the name, this returns a dict, not a JSON
    string — presumably callers serialize it; confirm.

    Returns a dict with the symbol's labels, its conflicts (target +
    topics) and a node/link graph of conflicts among the involved actors.
    """
    labels = {symbol: hg.get_label(symbol)}
    actors = set()
    conflict_map = {}

    def _collect(pattern, targ_pos):
        # one direction of the conflict relation; targ_pos selects the
        # position of the other actor in the matched edge
        for edge in hg.pattern2edges(pattern):
            targ = edge[targ_pos]
            if not sym.is_edge(targ):
                actors.add(targ)
                labels[targ] = hg.get_label(targ)
                if targ not in conflict_map:
                    conflict_map[targ] = {'topics': set()}
                topic = edge[3]
                labels[topic] = hg.get_label(topic)
                conflict_map[targ]['topics'].add(topic)

    # conflicts initiated by symbol, then conflicts targeting symbol
    # (the original duplicated this loop body twice)
    _collect(('conflict/gb.inf', symbol, None, None), 2)
    _collect(('conflict/gb.inf', None, symbol, None), 1)

    conflict = [{'target': targ, 'topics': tuple(conflict_map[targ]['topics'])}
                for targ in conflict_map]

    nodes = []
    actor_id = {}
    for i, actor in enumerate(actors):
        actor_id[actor] = i
        nodes.append({'label': labels[actor], 'r': 3})

    links = []
    for actor in actors:
        for target in conflict_targets(hg, actors, actor):
            links.append({'source': actor_id[actor],
                          'target': actor_id[target]})
            # node radius grows with conflict degree
            nodes[actor_id[actor]]['r'] += 1
            nodes[actor_id[target]]['r'] += 1

    return {'entity': symbol,
            'labels': labels,
            'conflict': conflict,
            'conflict_graph': {'nodes': nodes, 'links': links}}
def generate(hg):
    """Build the synonym layer of hypergraph hg via a Meronomy.

    Passes: (1) feed every belief edge to the meronomy, (2) run post
    assignments over the same edges, (3) build/normalize the meronomy
    graph and derive synonym sets, (4) write one synonym symbol plus
    label edge per set back into hg.
    """
    print('starting parser...')
    parser = par.Parser()
    mer = Meronomy(hg, parser)

    print('reading edges...')
    total_edges = 0
    total_beliefs = 0
    # progress is measured over all vertices (symbols + edges)
    total_verts = hg.symbol_count() + hg.edge_count()
    i = 0
    with progressbar.ProgressBar(max_value=total_verts) as bar:
        for vertex in hg.all():
            if sym.is_edge(vertex):
                edge = vertex
                total_edges += 1
                if hg.is_belief(edge):
                    mer.add_edge(edge)
                    total_beliefs += 1
            i += 1
            if (i % 1000) == 0:
                bar.update(i)
    print('edges: %s; beliefs: %s' % (total_edges, total_beliefs))

    print('post assignments...')
    i = 0
    with progressbar.ProgressBar(max_value=total_verts) as bar:
        for vertex in hg.all():
            if sym.is_edge(vertex):
                edge = vertex
                if hg.is_belief(edge):
                    mer.post_assignments(edge)
            i += 1
            if (i % 1000) == 0:
                bar.update(i)

    print('generating meronomy graph...')
    mer.generate()
    print('normalizing meronomy graph...')
    mer.normalize_graph()
    print('generating synonyms...')
    mer.generate_synonyms()

    print('writing synonyms...')
    i = 0
    with progressbar.ProgressBar(max_value=len(mer.synonym_sets)) as bar:
        for syn_id in mer.synonym_sets:
            # gather all namespaced edges behind this synonym set
            edges = set()
            for atom in mer.synonym_sets[syn_id]:
                if atom in mer.edge_map:
                    edges |= mer.edge_map[atom]
            # pick the most frequently counted edge as the label source
            best_count = -1
            best_label_edge = None
            for edge in edges:
                if mer.edge_counts[edge] > best_count:
                    best_count = mer.edge_counts[edge]
                    best_label_edge = edge
            # NOTE(review): if edges is empty, best_label_edge stays None
            # and get_label receives None — confirm synonym sets are
            # always backed by at least one mapped edge
            label = hg.get_label(best_label_edge)
            syn_symbol = sym.build(label, 'syn%s' % syn_id)
            for edge in edges:
                syn_edge = (cons.are_synonyms, edge, syn_symbol)
                hg.add(syn_edge)
            label_symbol = sym.build(label, cons.label_namespace)
            label_edge = (cons.has_label, syn_symbol, label_symbol)
            hg.add(label_edge)
            i += 1
            if i % 1000 == 0:
                bar.update(i)
        bar.update(i)
    print('%s synonym sets created' % len(mer.synonym_sets))
    print('done.')
def is_concept(edge):
    """True when the relation of edge is a '+'-prefixed non-edge symbol.

    NOTE(review): unlike the sibling predicates, edge itself is indexed
    without an is_edge guard — callers presumably pass edges only.
    """
    rel = edge[0]
    return (not sym.is_edge(rel)) and rel[0] == '+'