def _wvResolve(self, proname, flatEntityList):
    """Resolve ``proname`` to an entity using word-vector similarity.

    Collects the vectors of the pronoun node's successors (and their
    successors) in ``self.G``, then returns the entity from
    ``flatEntityList`` whose harmonic similarity to that context exceeds
    0.7, or ``""`` when no candidate qualifies.
    """
    context_names = []
    context_vecs = []
    # Gather known vectors from the 1- and 2-hop successors of the pronoun.
    for succ in self.G.successors(proname):
        for node in [succ] + list(self.G.successors(succ)):
            normalized = preprocessText(node)
            if normalized in self.wv:
                context_names.append(normalized)
                context_vecs.append(self.wv[normalized])
    if not context_vecs:
        return ""
    best_entity = ""
    best_score = 0.
    for candidate in flatEntityList:
        normalized = preprocessText(candidate)
        # Skip entities already in the context or without a vector.
        if normalized in context_names or normalized not in self.wv:
            continue
        score = harmonicSim(context_vecs, self.wv[normalized])
        if score > best_score:
            best_score = score
            best_entity = candidate
    # Only accept a sufficiently confident match.
    return best_entity if best_score > 0.7 else ""
def _addAllMP(self, inps):
    """Parallel implementation of addAll function.

    Builds one worker job per input sentence (positions continue from
    ``self.pos``), dispatches them via the process pool, then merges the
    reduced worker results into the instance graph, entity list and
    pronoun list.

    Args:
        inps: sequence of raw sentence strings.

    Raises:
        ValueError: if the language or graph type is unsupported.
    """
    if self.lang != "ja":
        raise ValueError("Unsupported language: {0}".format(self.lang))
    # Each job carries its absolute sentence position so workers stay
    # position-aware; don't rebind the parameter while building jobs.
    if self.gtype == "d":
        jobs = [[self.pos + i, preprocessText(text)]
                for i, text in enumerate(inps)]
        results = self.pool.starmap(self._addMP_ja_d, jobs)
    elif self.gtype == "k":
        jobs = [[self.pos + i, preprocessText(text), self.autosub]
                for i, text in enumerate(inps)]
        results = self.pool.starmap(self._addMP_ja_k, jobs)
    else:
        raise ValueError("Unknown graph type: {0}".format(self.gtype))
    # Advance position only after all jobs were dispatched successfully.
    self.pos += len(inps)
    final = self._reduce(results)
    self.G = _mergeGraph(self.G, final[0])
    self.entityList = _mergeEntityList(self.entityList, final[1])
    self.proList = _mergeProList(self.proList, final[2])
def resolveSynonym(self):
    """Resolve synonyms in the given text.

    Builds an undirected candidate graph of likely-synonym entity pairs
    (string inclusion plus word-vector cosine similarity), then, for each
    connected component, records the shortest surface form as the
    canonical name and annotates every member node in ``self.G``.

    Returns:
        list: the flattened person/organization entity list used for
        matching (also consumed by coreference resolution).
    """
    # initialize a graph of synonym
    GS = nx.Graph()
    # Flatten person (index 1) and organization (index 3) entities.
    flatEntityList = []
    for i in [1, 3]:
        flatEntityList.extend(self.entityList[i].keys())
    if not self.wv:
        # Warn once, not once per entity pair as before.
        print("Word vector model is not set correctly. Skipping part of coreference resolution.")
    # Find syntactic synonyms among all entity pairs.
    for i in range(len(flatEntityList)):
        for j in range(i + 1, len(flatEntityList)):
            A = preprocessText(flatEntityList[i])
            B = preprocessText(flatEntityList[j])
            inc = inclusive(A, B)
            if self.wv and A in self.wv and B in self.wv:
                sim = cosSimilarity(self.wv[A], self.wv[B])
            else:
                # Without usable vectors, rely on inclusion alone.
                sim = 1.
            if inc == 1 and sim > 0.5:
                GS.add_edge(flatEntityList[i], flatEntityList[j])
                self.G.add_edge(flatEntityList[i], flatEntityList[j], weight=1, label="同義語候補", type="synonym")
            elif inc == -1 and sim > 0.5:
                GS.add_edge(flatEntityList[i], flatEntityList[j])
                self.G.add_edge(flatEntityList[j], flatEntityList[i], weight=1, label="同義語候補", type="synonym")
    # Within each component the shortest name is canonical (no arbitrary
    # 10000-char cap as in the previous implementation).
    for subG in nx.connected_components(GS):
        nshort = min(subG, key=len)
        self.synonymDict.add(nshort)
        for node in subG:
            self.G.nodes[node]['synonym'] = nshort
    return flatEntityList
def _processMeaningless(self):
    """Tag meaningless chunks with the meaning of their last child."""
    for chunk, children in zip(self.chunks, self.childrenList):
        if preprocessText(chunk.main) not in MeaninglessDict:
            continue
        if not children:
            continue
        # Borrow the last child's main as this chunk's meaning, and show
        # the child's surface form alongside the original main text.
        last_child = self.chunks[children[-1]]
        chunk.meaning = last_child.main
        chunk.main = "({0})\n{1}".format(last_child.surface, chunk.main)
def _processNegative(self):
    """Tag chunks that have a negative ("ない") parent as negative."""
    for idx, chunk in enumerate(self.chunks):
        if preprocessText(chunk.main) != "ない":
            continue
        children = self.childrenList[idx]
        if not children:
            continue
        # Mark the last child as negated and take over its main text
        # as this chunk's meaning (without the negation tag itself).
        target = self.chunks[children[-1]]
        target.main += "\n(否定)"
        target.negative = 1
        chunk.meaning = target.main
        chunk.main = chunk.main.replace("\n(否定)", "")
def add(self, inp):
    """Add a sentence to graph.

    Parses the sentence through ``self.core``, merges the core's scratch
    results into this instance, then optionally runs synonym and
    coreference resolution.

    Returns:
        list: the preprocessed sentence wrapped in a one-element list.
    """
    text = preprocessText(inp)
    if text == "":
        return [text]
    self.core.add(text, self.pos)
    self.pos += 1
    # Merge the core's scratch state into ours, then reset the core.
    self.G = _mergeGraph(self.G, self.core.G)
    self.core.G.clear()
    self.entityList = _mergeEntityList(self.entityList, self.core.entityList)
    self.core.entityList = [dict() for _ in NEList]
    self.proList = _mergeProList(self.proList, self.core.proList)
    self.core.proList = []
    flat = self.resolveSynonym() if self.synonym else None
    if self.coref:
        self.resolveCoref(flat)
    return [text]
def addUrls(self, urls):
    """Add the information from given urls to KSG.

    Fetches the text behind each URL, feeds it into the graph, and
    returns the preprocessed text of each fetched item.
    """
    texts = self._grabTextFromUrls(urls)
    self.addAll(texts)
    return list(map(preprocessText, texts))
def _addEntity(self, pid, chunks):
    """Add parent nodes that are nouns.

    Args:
        pid: index of the parent chunk within ``chunks``.
        chunks: sequence of parsed chunk objects (have ``children``,
            ``func``, ``main``, ``type``, ``id``, ``negative`` attributes).
    """
    parent = chunks[pid]
    sub = None
    # Find subject: the last child whose functional word is in SubDict.
    for i in range(len(parent.children)):
        child = chunks[parent.children[i]]
        if child.func in SubDict:
            sub = child
            if child.func == "では":
                # NOTE(review): networkx ``successors`` yields nodes, not
                # (key, val) pairs, and ``val.negative`` assumes node
                # objects carry a ``negative`` attribute — confirm this
                # unpacking actually works with the graph's node type.
                if child.negative != 0 or any([val.negative != 0 for key, val in self.G.successors(child.main)]):
                    pass
                else:
                    # Non-negative "では" does not count as a subject.
                    sub = None
    if sub:
        self._addNode(parent, sub=sub.main)
        self._addEdge(sub.main, parent.main, label="陳述", etype="stat")
    else:
        self._addNode(parent)
    # Loop through all children
    for i in range(len(parent.children)):
        child = chunks[parent.children[i]]
        # If child is noun
        if child.func in SubDict:
            if child.func == "では":
                # Same negativity check as above; a negative "では" child
                # was already linked as the subject, so skip it here.
                if child.negative != 0 or any([val.negative != 0 for key, val in self.G.successors(child.main)]):
                    pass
                else:
                    self._addNode(child)
                    self._addEdge(child.main, parent.main, label=child.func, etype="attr")
        elif child.type == 0 and child.func in ["と", "などと"] and child.id + 1 == parent.id and preprocessText(chunks[parent.parent].main) not in ["交代", "交換"]:
            # Adjacent "と/などと" noun: parallel (並列) relation, both
            # directions — unless the grandparent means swap/exchange.
            self._addNode(child)
            self._addEdge(child.main, parent.main, label="並列", etype="para")
            self._addEdge(parent.main, child.main, label="並列", etype="para")
            self.para.append([child.main, parent.main])
        elif child.type == 0 and child.func in ParallelDict and child.id + 1 == parent.id:
            # Other adjacent parallel markers: same bidirectional edges.
            self._addNode(child)
            self._addEdge(child.main, parent.main, label="並列", etype="para")
            self._addEdge(parent.main, child.main, label="並列", etype="para")
            self.para.append([child.main, parent.main])
        else:
            # Default: attach the child as an attribute of the parent,
            # labeled with its functional word.
            self._addNode(child)
            self._addEdge(child.main, parent.main, label=child.func, etype="attr")