def generate_synonyms(self):
    """Group atoms into synonym sets via unique parent-child containment.

    An atom's "parent" is a deeper atom whose edge contains it. Atoms with
    exactly one parent are folded into that parent's synonym set; atoms
    left in ``self.atoms`` afterwards become singleton sets.

    Side effects: rebuilds ``self.synonym_sets`` and ``self.synonym_map``
    and deletes grouped atoms from ``self.atoms``.
    """
    # find atoms and parent-child synonym relationships
    # children maps a parent atom string -> list of contained atom strings.
    children = {}
    for i in range(len(self.sorted_atoms)):
        parents = []
        satom1 = self.sorted_atoms[i][0]
        atom1 = ed.str2edge(satom1)
        cur_depth = self.sorted_atoms[i][1]
        # skip past all atoms at the same depth or shallower; sorted_atoms
        # is ordered by depth, so candidates deeper than atom1 start here
        start = i
        while start < len(self.sorted_atoms) and self.sorted_atoms[start][1] <= cur_depth:
            start += 1
        for j in range(start, len(self.sorted_atoms)):
            satom2 = self.sorted_atoms[j][0]
            atom2 = ed.str2edge(satom2)
            # tuple membership test: atom1 is a direct element of atom2
            if atom1 in atom2:
                parents.append(satom2)
        # only unambiguous (single-parent) containment creates a synonym link
        if len(parents) == 1:
            satom2 = parents[0]
            if satom2 not in children:
                children[satom2] = []
            children[satom2].append(satom1)
            # atom1's own children are promoted to the new parent too
            if satom1 in children:
                children[satom2] += children[satom1]
    # build synonym sets
    self.synonym_sets = []
    for satom in children:
        synonyms = [satom] + children[satom]
        count = 0
        for synonym in synonyms:
            count += self.edge_counts[synonym]
            # NOTE(review): raises KeyError if the same atom string ends up
            # in two synonym lists (deleted twice) — presumably the
            # single-parent rule prevents that; verify.
            del self.atoms[synonym]
        self.synonym_sets.append({
            'edges': synonyms,
            'count': count,
            'index': len(self.synonym_sets)
        })
        # NOTE(review): debug print left in; consider removing or logging
        print(synonyms)
    # remaining atoms (never grouped) become singleton synonym sets
    for atom in self.atoms:
        self.synonym_sets.append({
            'edges': [atom],
            'count': self.edge_counts[atom],
            'index': len(self.synonym_sets)
        })
    # build synonym map: atom string -> set index, and set index -> set dict
    self.synonym_map = {}
    for sset in self.synonym_sets:
        for synonym in sset['edges']:
            self.synonym_map[synonym] = sset['index']
        self.synonym_map[sset['index']] = sset
def similar_edges(self, targ_edge):
    """Return hypergraph edges similar to targ_edge, most similar first.

    Each result is a dict with keys 'edge' (string form), 'sim' (score)
    and 'text' (the edge's stored text attribute).
    """
    candidates = self.hg.all()
    target = enrich_edge(self.parser, targ_edge)
    scores = {}
    for candidate in candidates:
        if candidate == targ_edge or exclude(candidate):
            continue
        enriched = enrich_edge(self.parser, candidate)
        score = eedge_similarity(target, enriched)
        # keep only candidates above the configured similarity threshold
        if score >= self.sim_threshold:
            scores[ed.edge2str(candidate)] = score
    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    return [{'edge': edge_str,
             'sim': score,
             'text': self.hg.get_str_attribute(ed.str2edge(edge_str), 'text')}
            for edge_str, score in ranked]
def edges_with_similar_concepts(self, targ_edge):
    """Return edges whose concepts all match targ_edge's concepts.

    An edge qualifies when the concept matching is complete and its worst
    per-concept similarity clears the threshold. Results are sorted by
    (worst_sim, total_sim, matches), best first.
    """
    candidates = self.hg.all()
    target = enrich_edge(self.parser, targ_edge)
    scores = {}
    for candidate in candidates:
        if candidate == targ_edge or exclude(candidate):
            continue
        enriched = enrich_edge(self.parser, candidate)
        total_sim, worst_sim, complete, matches = edge_concepts_similarity(
            target, enriched)
        if complete and worst_sim >= self.sim_threshold:
            scores[ed.edge2str(candidate)] = (worst_sim, total_sim, matches)
    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    return [{'edge': edge_str,
             'worst_sim': worst,
             'sim': total,
             'matches': matched,
             'text': self.hg.get_str_attribute(ed.str2edge(edge_str), 'text')}
            for edge_str, (worst, total, matched) in ranked]
def write_edge_data(edge_data, file_path):
    """Write edge records to file_path, three lines per record.

    For each record: its similarity score, its source text, and the edge
    string with namespaces stripped.

    :param edge_data: iterable of dicts with keys 'sim', 'text', 'edge'
    :param file_path: destination file (overwritten)
    """
    # with-statement guarantees the file is closed even if a record is
    # malformed (the original leaked the handle on exception)
    with open(file_path, 'w') as f:
        for e in edge_data:
            f.write('%s\n' % str(e['sim']))
            f.write('%s\n' % e['text'])
            f.write('%s\n' % ed.edge2str(ed.without_namespaces(ed.str2edge(e['edge']))))
def all(self):
    """Returns a lazy sequence of all the vertices in the hypergraph."""
    # vertex keys are stored under the 'v' prefix; str_plus_1 gives the
    # exclusive upper bound for the prefix scan
    lower = 'v'.encode('utf-8')
    upper = str_plus_1('v').encode('utf-8')
    for key, _value in self.db.iterator(start=lower, stop=upper):
        yield ed.str2edge(key.decode('utf-8')[1:])
def f_all(self, f):
    """Returns a lazy sequence resulting from applying f to every vertex
    map (including non-atomic) in the hypergraph.
    A vertex map contains the keys vertex and degree."""
    cursor = self.open_cursor()
    cursor.execute('SELECT id, degree FROM vertices')
    for vertex_id, degree in cursor:
        yield f({'vertex': ed.str2edge(vertex_id), 'degree': degree})
    # cursor is only released once the generator is fully consumed
    self.close_cursor(cursor, local=True, commit=False)
def perm2edge(perm_str):
    """Transforms a permutation string from a database query into an edge.

    Returns None when the string cannot be parsed (logging on ValueError).
    """
    try:
        tokens = ed.split_edge_str(perm_str[1:])
        if tokens is None:
            # split_edge_str signals malformed input with None; without this
            # guard tokens[-1] raises TypeError, which the ValueError handler
            # below does not catch (the other perm2edge variant in this
            # codebase already has this check)
            return None
        nper = int(tokens[-1])
        tokens = tokens[:-1]
        tokens = unpermutate(tokens, nper)
        return ed.str2edge(' '.join(tokens))
    except ValueError as v:
        print(u'VALUE ERROR! %s perm2edge %s' % (v, perm_str))
def generate_synonyms(self):
    """Assign synonym ids to atoms by following strongest incoming graph
    edges, then materialize ``self.synonym_sets``.

    Side effects: mutates ``self.syn_ids`` and ``self.synonym_sets``.
    """
    # process atoms shallow-to-deep (sorted by the value in self.atoms)
    sorted_atoms = sorted(self.atoms.items(), key=operator.itemgetter(1), reverse=False)
    for atom_pair in sorted_atoms:
        orig = self.graph.vs.find(atom_pair[0])
        edges = self.graph.incident(orig.index, mode='in')
        if len(edges) > 0:
            max_weight = max([self.graph.es[e]['weight'] for e in edges])
        else:
            max_weight = 0.
    # only link synonyms through sufficiently strong edges
        if max_weight > .1:
            for e in edges:
                edge = self.graph.es[e]
                # only the strongest incoming edge(s) create synonym links
                if edge['weight'] == max_weight:
                    source = self.graph.vs[edge.source]['name']
                    target = self.graph.vs[edge.target]['name']
                    source_syn_id = self.syn_id(source)
                    target_syn_id = self.syn_id(target)
                    # skip pairs where both sides already belong to sets
                    if not (source_syn_id and target_syn_id):
                        if self.valid_synonym_parent(source):
                            # merge into whichever side already has an id,
                            # otherwise mint a fresh id for both
                            if source_syn_id:
                                self.syn_ids[target] = source_syn_id
                            elif target_syn_id:
                                self.syn_ids[source] = target_syn_id
                            else:
                                syn_id = self.new_syn_id()
                                self.syn_ids[source] = syn_id
                                self.syn_ids[target] = syn_id
                        else:
                            # invalid parent: target still gets its own set
                            if not target_syn_id:
                                syn_id = self.new_syn_id()
                                self.syn_ids[target] = syn_id
    # filter out multiple synonyms
    # atoms whose edge spans more than one synonym set poison that set
    delete_synonyms = set()
    for atom in self.syn_ids:
        if len(self.synonym_ids_in(ed.str2edge(atom))) > 1:
            delete_synonyms.add(self.syn_ids[atom])
    # generate synonym sets
    for atom in self.atoms:
        syn_id = self.syn_id(atom)
        if syn_id:
            if syn_id in delete_synonyms:
                # poisoned set: give this atom a fresh singleton set
                new_id = self.new_syn_id()
                self.syn_ids[atom] = new_id
                self.synonym_sets[new_id] = {atom}
            else:
                if syn_id not in self.synonym_sets:
                    self.synonym_sets[syn_id] = set()
                self.synonym_sets[syn_id].add(atom)
        else:
            # atom never linked to anything: singleton set
            new_id = self.new_syn_id()
            self.syn_ids[atom] = new_id
            self.synonym_sets[new_id] = {atom}
def synonym_label(self, syn_id, short=False):
    """Render a label for the synonym set syn_id.

    With short=True, return the label of the largest edge in the set
    (double quotes replaced by spaces); otherwise a '{a, b, ...}' listing.
    """
    if short:
        best_edge = None
        best_size = 0
        for satom in self.synonym_sets[syn_id]:
            candidate = ed.str2edge(satom)
            candidate_size = ed.size(candidate)
            if candidate_size > best_size:
                best_edge, best_size = candidate, candidate_size
        return edge2label(best_edge).replace('"', ' ')
    return '{%s}' % ', '.join(self.synonym_sets[syn_id])
def symbols_with_root(self, root):
    """Find all symbols with the given root."""
    # scan every vertex key whose symbol starts with 'root/'
    prefix = '%s/' % root
    lower = ('v%s' % prefix).encode('utf-8')
    upper = ('v%s' % str_plus_1(prefix)).encode('utf-8')
    return {ed.str2edge(key.decode('utf-8')[1:])
            for key, _value in self.db.iterator(start=lower, stop=upper)}
def cur2edges(cur):
    """Transforms a cursor from a database query into a set of edges."""
    edges = set()
    for row in cur:
        tokens = ed.split_edge_str(row[0])
        # last token encodes the permutation number
        nper = int(tokens[-1])
        ordered = unpermutate(tokens[:-1], nper)
        edges.add(ed.str2edge(' '.join(ordered)))
    return edges
def perm2edge(perm_str):
    """Transforms a permutation string from a database query into an edge.

    Returns None when the string cannot be parsed.
    """
    try:
        tokens = ed.split_edge_str(perm_str[1:])
        if tokens is None:
            return None
        # trailing token is the permutation number
        nper = int(tokens[-1])
        ordered = unpermutate(tokens[:-1], nper)
        return ed.str2edge(' '.join(ordered))
    except ValueError:
        return None
def all_attributes(self):
    """Returns a lazy sequence with a tuple for each vertex in the
    hypergraph. The first element of the tuple is the vertex itself,
    the second is a dictionary of attribute values (as strings)."""
    lower = 'v'.encode('utf-8')
    upper = str_plus_1('v').encode('utf-8')
    for key, value in self.db.iterator(start=lower, stop=upper):
        vertex = ed.str2edge(key.decode('utf-8')[1:])
        yield (vertex, decode_attributes(value))
def test_str2edge(self):
    """str2edge handles flat, numeric and nested edge strings."""
    cases = [
        ('(is graphbrain/1 great/1)', ('is', 'graphbrain/1', 'great/1')),
        ('(size graphbrain/1 7)', ('size', 'graphbrain/1', 7)),
        ('(size graphbrain/1 7.0)', ('size', 'graphbrain/1', 7.)),
        ('(size graphbrain/1 -7)', ('size', 'graphbrain/1', -7)),
        ('(size graphbrain/1 -7.0)', ('size', 'graphbrain/1', -7.)),
        ('(src graphbrain/1 (is graphbrain/1 great/1))',
         ('src', 'graphbrain/1', ('is', 'graphbrain/1', 'great/1'))),
    ]
    for edge_str, expected in cases:
        self.assertEqual(ed.str2edge(edge_str), expected)
def generate_atoms(self):
    """Build the atom map, atom set and depth-sorted atom list.

    Atoms are edge strings seen more than once, mapped to edge depth.
    """
    # create atoms map -- edges with more than one occurrence
    self.atoms = {key: ed.depth(ed.str2edge(key))
                  for key, count in self.edge_counts.items() if count > 1}
    # build atom_set
    self.atom_set = set(self.atoms)
    # sorted by depth, shallowest first
    self.sorted_atoms = sorted(self.atoms.items(),
                               key=operator.itemgetter(1))
def html(hg, eid):
    """Render the HTML page for vertex eid of hypergraph hg."""
    vertex = ed.str2edge(eid)
    # edges get a rendered title; plain symbols get a simple heading
    if sym.sym_type(vertex) == sym.SymbolType.EDGE:
        title = edge_html(hg, vertex)
    else:
        title = '<h1>%s</h1>' % sym.symbol2str(eid)
    page = """
    <div class="container" role="main">
        <div class="page-header">
            %s
            <h4>%s</h4>
        </div>
        %s
    </div>
    """
    return page % (title, eid, edges_html(hg, vertex))
def test_str2edge(self):
    """str2edge handles flat, numeric, nested and degenerate inputs."""
    cases = [
        ('(is graphbrain/1 great/1)', ('is', 'graphbrain/1', 'great/1')),
        ('(size graphbrain/1 7)', ('size', 'graphbrain/1', 7)),
        ('(size graphbrain/1 7.0)', ('size', 'graphbrain/1', 7.)),
        ('(size graphbrain/1 -7)', ('size', 'graphbrain/1', -7)),
        ('(size graphbrain/1 -7.0)', ('size', 'graphbrain/1', -7.)),
        ('(src graphbrain/1 (is graphbrain/1 great/1))',
         ('src', 'graphbrain/1', ('is', 'graphbrain/1', 'great/1'))),
        ('((is my) graphbrain/1 (super great/1))',
         (('is', 'my'), 'graphbrain/1', ('super', 'great/1'))),
        ('.', '.'),
    ]
    for edge_str, expected in cases:
        self.assertEqual(ed.str2edge(edge_str), expected)
def print_atom_groups(self):
    """Print every atom group with more than three sentences."""
    group_number = 0
    for key in self.atom_groups:
        group = self.atom_groups[key]
        sentences = group['sentences']
        # only report groups with enough supporting sentences
        if len(sentences) > 3:
            group_number += 1
            print('ATOM_GROUP id: %s' % group_number)
            print('Base concepts: %s' % group['label'])
            print('size: %s' % len(sentences))
            print('sentences:')
            for sentence in sentences:
                print('* %s' % sentence)
            print('edges:')
            for edge in group['edges']:
                print('* %s' % ed.edge2str(ed.without_namespaces(ed.str2edge(edge))))
            print()
    # NOTE(review): this `return True` is the tail of a function whose
    # definition starts above this chunk — indentation reconstructed,
    # confirm against the full file.
    return True


if __name__ == '__main__':
    print('creating parser...')
    # NOTE(review): rebinding `par` shadows the imported parser module
    par = par.Parser()
    print('parser created.')
    # read data
    # edge_data = json_tools.read('edges_similar_concepts.json')
    edge_data = json_tools.read('all.json')
    # build full edges list
    full_edges = []
    for it in edge_data:
        full_edges.append(ed.without_namespaces(ed.str2edge(it['edge'])))
    # synonym_set
    # synset1/synset2 are immediately overwritten below; synset3 stays empty
    synset1 = []
    synset2 = []
    synset3 = []
    synset1 = ['trump', 'donald', '(+ donald trump)']
    # synset2 = ['ryan', '(+ paul ryan)', 'paul']
    synset2 = ['vladimir', '(+ vladimir putin)', 'putin']
    # parse each synonym string into an edge structure
    concepts1 = [ed.str2edge(x) for x in synset1]
    concepts2 = [ed.str2edge(x) for x in synset2]
    concepts3 = [ed.str2edge(x) for x in synset3]
    concept_sets = [concepts1, concepts2, concepts3]
                # NOTE(review): this dict is the tail of a method whose
                # beginning lies above this chunk — indentation
                # reconstructed, confirm against the full file.
                'sim': e[1][1],
                'matches': e[1][2],
                'text': self.hg.get_str_attribute(ed.str2edge(e[0]), 'text')
            }
            result.append(edge_data)
        return result

    def write_similar_edges(self, targ_edge, file_path):
        """Compute edges similar to targ_edge and write them to file_path."""
        edge_data = self.similar_edges(targ_edge)
        write_edge_data(edge_data, file_path)

    def write_edges_with_similar_concepts(self, targ_edge, file_path):
        """Compute edges with similar concepts and write them to file_path."""
        edge_data = self.edges_with_similar_concepts(targ_edge)
        write_edge_data(edge_data, file_path)


if __name__ == '__main__':
    hgr = hyperg.HyperGraph({'backend': 'leveldb', 'hg': 'reddit-politics.hg'})
    print('creating parser...')
    # NOTE(review): rebinding `par` shadows the imported parser module
    par = par.Parser()
    print('parser created.')
    te = '(clinches/nlp.clinch.verb clinton/nlp.clinton.noun ' \
         '(+/gb democratic/nlp.democratic.adj nomination/nlp.nomination.noun))'
    s = SimilarityFilter(hgr, par)
    s.write_edges_with_similar_concepts(ed.str2edge(te),
                                        'edges_similar_concepts.json')
    # s.write_similar_edges(ed.str2edge(te), 'similar_edges.json')
# Count edge parts that were NOT matched by the similar-concepts step.
import operator  # was missing: operator.itemgetter is used below

from sklearn.cluster import DBSCAN

import gb.tools.json as json_tools
import gb.hypergraph.edge as ed
import gb.nlp.parser as par
from gb.explore.similarity import edge_similarity


if __name__ == '__main__':
    print('creating parser...')
    # NOTE(review): rebinding `par` shadows the imported parser module
    par = par.Parser()
    print('parser created.')
    edge_data = json_tools.read('edges_similar_concepts.json')
    # extra_edges: string form of unmatched edge parts -> occurrence count
    extra_edges = {}
    for item in edge_data:
        edge = ed.str2edge(item['edge'])
        matched = [ed.str2edge(match[1]) for match in item['matches']]
        for part in edge[1:]:
            if part not in matched:
                key = ed.edge2str(part)
                if key in extra_edges:
                    extra_edges[key] += 1
                else:
                    extra_edges[key] = 1
    # least frequent first
    sorted_edges = sorted(extra_edges.items(), key=operator.itemgetter(1),
                          reverse=False)
    print(sorted_edges)
    print(len(sorted_edges))
                # NOTE(review): this dict is the tail of a method whose
                # beginning lies above this chunk — indentation
                # reconstructed, confirm against the full file.
                'worst_sim': e[1][0],
                'sim': e[1][1],
                'matches': e[1][2],
                'text': self.hg.get_str_attribute(ed.str2edge(e[0]), 'text')
            }
            result.append(edge_data)
        return result

    def write_similar_edges(self, targ_edge, file_path):
        """Compute edges similar to targ_edge and write them to file_path."""
        edge_data = self.similar_edges(targ_edge)
        write_edge_data(edge_data, file_path)

    def write_edges_with_similar_concepts(self, targ_edge, file_path):
        """Compute edges with similar concepts and write them to file_path."""
        edge_data = self.edges_with_similar_concepts(targ_edge)
        write_edge_data(edge_data, file_path)


if __name__ == '__main__':
    hgr = hyperg.HyperGraph({'backend': 'leveldb', 'hg': 'reddit-politics.hg'})
    print('creating parser...')
    # NOTE(review): rebinding `par` shadows the imported parser module
    par = par.Parser()
    print('parser created.')
    te = '(clinches/nlp.clinch.verb clinton/nlp.clinton.noun ' \
         '(+/gb democratic/nlp.democratic.adj nomination/nlp.nomination.noun))'
    s = Similarity(hgr, par)
    # s.write_edges_with_similar_concepts(ed.str2edge(te), 'edges_similar_concepts.json')
    s.write_similar_edges(ed.str2edge(te), 'similar_edges.json')
def generate_synonyms(self):
    """Assign synonym ids by walking each atom's strongest incoming graph
    edges, then materialize ``self.synonym_sets``.

    Side effects: resets and rebuilds ``self.syn_ids``,
    ``self.synonym_sets`` and ``self.cur_syn_id``; prints progress bars.
    """
    # init synonym data
    self.syn_ids = {}
    self.synonym_sets = {}
    self.cur_syn_id = 0
    total_atoms = len(self.atoms)
    # generate synonyms
    print('generating synonyms')
    i = 0
    with progressbar.ProgressBar(max_value=total_atoms) as bar:
        # process atoms ordered by the value stored in self.atoms
        sorted_atoms = sorted(self.atoms.items(),
                              key=operator.itemgetter(1), reverse=False)
        for atom_pair in sorted_atoms:
            orig = self.graph.vs.find(atom_pair[0])
            edges = self.graph.incident(orig.index, mode='in')
            edges = [self.graph.es[edge] for edge in edges]
            # flatten to (source name, target name, weight, norm_weight)
            edges = [(self.graph.vs[edge.source]['name'],
                      self.graph.vs[edge.target]['name'],
                      edge['weight'],
                      edge['norm_weight']) for edge in edges]
            # strongest normalized weight first
            edges = sorted(edges, key=operator.itemgetter(3), reverse=True)
            ambiguous = False
            for pos in range(len(edges)):
                is_synonym = False
                edge = edges[pos]
                source = edge[0]
                target = edge[1]
                weight = edge[2]
                norm_weight = edge[3]
                source_edge = ed.str2edge(source)
                if weight > WEIGHT_THRESHOLD:
                    # explicit semantic match always wins
                    if semantic_synonyms(source, target):
                        is_synonym = True
                    elif not ambiguous and norm_weight >= NORM_WEIGHT_THRESHOLD and is_candidate(
                            source_edge):
                        # accept only if the next candidate is clearly
                        # weaker; otherwise mark the atom as ambiguous
                        pos_next = next_candidate_pos(edges, pos)
                        if pos_next < 0:
                            is_synonym = True
                        else:
                            next_weight = edges[pos_next][3]
                            if next_weight < NORM_WEIGHT_THRESHOLD:
                                is_synonym = True
                            else:
                                ambiguous = True
                if is_synonym:
                    # merge into an existing synonym id when either side
                    # already has one, else mint a fresh id for both
                    source_syn_id = self.syn_id(source)
                    target_syn_id = self.syn_id(target)
                    if target_syn_id:
                        self.syn_ids[source] = target_syn_id
                    elif source_syn_id:
                        self.syn_ids[target] = source_syn_id
                    else:
                        syn_id = self.new_syn_id()
                        self.syn_ids[source] = syn_id
                        self.syn_ids[target] = syn_id
            i += 1
            if (i % 1000) == 0:
                bar.update(i)
        bar.update(i)
    # generate synonym sets
    print('generating synonym sets')
    i = 0
    with progressbar.ProgressBar(max_value=total_atoms) as bar:
        for atom in self.atoms:
            syn_id = self.syn_id(atom)
            if syn_id:
                if syn_id not in self.synonym_sets:
                    self.synonym_sets[syn_id] = set()
                self.synonym_sets[syn_id].add(atom)
            else:
                # unlinked atoms become singleton synonym sets
                new_id = self.new_syn_id()
                self.syn_ids[atom] = new_id
                self.synonym_sets[new_id] = {atom}
            i += 1
            if (i % 1000) == 0:
                bar.update(i)
        bar.update(i)
def next_candidate_pos(edges, pos):
    """Return the index of the first candidate edge after pos, or -1."""
    for offset, entry in enumerate(edges[pos + 1:], start=pos + 1):
        if is_candidate(ed.str2edge(entry[0])):
            return offset
    return -1
def contains_synonym(self, full_edge, syn_id):
    """True if full_edge deeply contains any atom of synonym set syn_id."""
    return any(ed.contains(full_edge, ed.str2edge(atom), deep=True)
               for atom in self.meronomy.synonym_sets[syn_id])
# build extra edges list # extra_edges = [] # full_edges = [] # for it in edge_data: # e = ed.str2edge(it['edge']) # full_edges.append(e) # matched = [ed.str2edge(match[1]) for match in it['matches']] # for part in e[1:]: # if part not in matched: # extra_edges.append(part) edge_data = json_tools.read('all.json') # build full edges list extra_edges = [] for it in edge_data: extra_edges.append(ed.without_namespaces(ed.str2edge(it['edge']))) full_edges = extra_edges ag = AtomGroups(par) print('set edges') ag.set_edges(extra_edges) print('generate_atoms') ag.generate_atoms() print('generate synonyms') ag.generate_synonyms() print('generate atom groups') ag.generate_atom_groups() ag.print_atom_groups() print('generate atom group clusters') ag.generate_atom_group_clusters(full_edges) ag.print_atom_group_clusters()
def generate_atom_groups(self): nsyns = len(self.synonym_sets) # build coocurrence sparse matrix synonym_cooc = sps.lil_matrix((nsyns, nsyns)) for edge in extra_edges: co_synonyms = self.find_co_synonyms(edge) if len(co_synonyms) > 1: for pair in itertools.combinations(co_synonyms, 2): synonym_cooc[pair[0], pair[1]] += 1 synonym_cooc[pair[1], pair[0]] += 1 # normalize matrix synonym_cooc = normalize(synonym_cooc, norm='l1', axis=1, copy=False) # iterate matrix, build graph gedges = [] weights = [] cx = synonym_cooc.tocoo() for i, j, v in zip(cx.row, cx.col, cx.data): gedges.append((i, j)) weights.append(v) g = igraph.Graph() g.add_vertices(nsyns) g.add_edges(gedges) g.es['weight'] = weights # community detection comms = igraph.Graph.community_multilevel(g, weights='weight', return_levels=False) # build atom_groups self.atom_groups = {} for i in range(len(comms)): comm = comms[i] count = 0 syns = [] sentences = set() edges = [] for item in comm: edges += self.synonym_map[item]['edges'] for atom in self.synonym_map[item]['edges']: for edat in edge_data: if ed.contains(ed.str2edge( ed.edge2str(ed.str2edge(edat['edge']), namespaces=False)), ed.str2edge(atom), deep=True): if edat['text']: sentences.add(edat['text']) syns.append(self.synonym_map[item]) count += self.synonym_map[item]['count'] label = ', '.join(edges) atom_group = { 'label': label, 'syns': syns, 'count': count, 'sentences': sentences, 'edges': edges } self.atom_groups[i] = atom_group