def roleTag(vertexList, wordNetAll):
    """Build one list of (label, frequency) pairs per vertex for place tagging.

    :param vertexList: iterable of vertices to tag
    :param wordNetAll: not used by this function
    :return: list with one entry per vertex, in input order
    """
    tags = []
    for node in vertexList:
        if Nature.ns == node.getNature() and node.getAttribute().totalFrequency <= 1000:
            if len(node.realword) < 3:
                # Two-character place name: assume it can still take one
                # more suffix or prefix.
                labelled = EnumItem().init2(NS.H, NS.G)
            else:
                # Otherwise only a suffix can still be added.
                labelled = EnumItem().init2(NS.G)
            tags.append(labelled.labelMap.items())
            continue

        # Look up by the equivalent word here, which is more precise.
        candidates = PlaceDictionary.dictionary.get(node.word)
        if candidates is None:
            # Unknown word: fall back to the NS.Z label with its total frequency.
            candidates = EnumItem().init1(
                NS.Z,
                PlaceDictionary.transformMatrixDictionary.getTotalFrequency(NS.Z),
            ).labelMap.items()
        else:
            # Highest frequency first.
            candidates = sorted(candidates, key=itemgetter(1), reverse=True)
        tags.append(candidates)
    return tags
def onLoadValue(self, path):
    """Load the value array for the dictionary at ``path``.

    The binary cache ``path + '.value.dat'`` is tried first through
    ``self.loadDat1``. If that yields nothing, the text dictionary at
    ``path`` is parsed line by line and the parsed entries are handed to
    ``self.onSaveValue``.

    :param path: path of the text dictionary
    :return: the array returned by ``loadDat1`` when it is not None,
        otherwise None
    """
    valueArray = self.loadDat1(path + '.value.dat')
    if valueArray is not None:
        return valueArray

    valueList = []
    line = None
    try:
        # The context manager closes the file even when a line fails to parse.
        with open(path, 'r') as br:
            while True:
                line = br.readline().strip(' \n\t\r')
                # An empty (or whitespace-only) line ends the read, as does EOF.
                if not line:
                    break
                args = EnumItem.create(line)
                nrEnumItem = EnumItem()
                # list(...) keeps the indexing valid on both Python 2 lists
                # and Python 3 dict views.
                for entry in list(args.values())[0]:
                    nrEnumItem = nrEnumItem.init1(
                        NS.valueOf(list(entry.keys())[0]),
                        int(list(entry.values())[0]))
                valueList.append(nrEnumItem.labelMap.items())
        self.onSaveValue(valueList, path)
    except Exception as err:
        # Best effort: report the offending line and fall through to None.
        self.logger.error("读取%s失败[%s]\n该词典这一行格式不对:%s" % (path, str(err), line))
    # NOTE(review): the freshly parsed valueList is not returned, so callers
    # only see values once the .value.dat cache exists - confirm this is intended.
    return None
def loadDat1(self, path):
    """Decode the binary value file at ``path`` into a list of label items.

    The raw bytes are first looked for in a pickle cache at
    ``path + Predefine.PIC_EXT``. If the cache cannot be read, the bytes
    come from ``IOUtil().readBytes(path)`` and the cache is (re)written.

    Layout as read by the code: a 4-byte entry count, then for each entry
    a 4-byte pair count followed by that many (4-byte index into
    ``list(NT)``, 4-byte frequency) pairs.

    :param path: path of the binary value file
    :return: list with one ``labelMap.items()`` per entry, or None when no
        bytes could be obtained
    """
    cache_path = path + Predefine.PIC_EXT
    try:
        # NOTE(review): pickle.load executes arbitrary code from a tampered
        # file; the cache must only ever be written by this program.
        with open(cache_path, 'rb') as cached:
            raw = pickle.load(cached)
    except Exception:
        # Any cache failure (missing, corrupt, unreadable) falls back to the
        # raw data file; this is a deliberate best-effort path.
        raw = IOUtil().readBytes(path)
        # Do not cache a failed read: a pickled None would make every later
        # call return None even after the data file becomes available.
        if raw is not None:
            with open(cache_path, 'wb') as out:
                pickle.dump(raw, out)
    if raw is None:
        return None

    nsArray = list(NT)
    offset = 0
    size = ByteUtil.bytesHighFirstToInt(raw, offset)
    offset += 4
    valueArray = [None] * size
    for i in range(size):
        currentSize = ByteUtil.bytesHighFirstToInt(raw, offset)
        offset += 4
        item = EnumItem()
        for _ in range(currentSize):
            label = nsArray[ByteUtil.bytesHighFirstToInt(raw, offset)]
            offset += 4
            frequency = ByteUtil.bytesHighFirstToInt(raw, offset)
            offset += 4
            item = item.init1(label, frequency)
        valueArray[i] = item.labelMap.items()
    return valueArray
def roleTag(vertexList, wordNetAll):
    """Build one list of (label, frequency) pairs per vertex for organization tagging.

    :param vertexList: iterable of vertices to tag
    :param wordNetAll: not used by this function
    :return: list with one entry per vertex, in input order
    """
    tags = []
    for node in vertexList:
        guessed = node.guessNature()

        # Some natures get a fixed label set and skip the dictionary lookup.
        fixed = None
        if guessed == Nature.nrf:
            # Only low-frequency nrf words get the fixed NT.F label; the
            # others fall through to the dictionary lookup below.
            if node.getAttribute().totalFrequency <= 1000:
                fixed = EnumItem().init1(NT.F, 1000).labelMap.items()
        elif guessed in [Nature.ni, Nature.nic, Nature.nis, Nature.nit]:
            fixed = EnumItem().init4({NT.K: 1000, NT.D: 1000}).labelMap.items()
        elif guessed == Nature.m:
            fixed = EnumItem().init1(NT.M, 1000).labelMap.items()

        if fixed is not None:
            tags.append(fixed)
            continue

        # Look up by the equivalent word here, which is more precise.
        candidates = OrganizationDictionary.dictionary.get(node.word)
        if candidates is None:
            # Unknown word: fall back to the NT.Z label with its total frequency.
            candidates = EnumItem().init1(
                NT.Z,
                OrganizationDictionary.transformMatrixDictionary.getTotalFrequency(NT.Z),
            ).labelMap.items()
        else:
            # Highest frequency first.
            candidates = sorted(candidates, key=itemgetter(1), reverse=True)
        tags.append(candidates)
    return tags
def isBadCase(name):
    """Report whether ``name`` is a known bad case in the place dictionary.

    No algorithm solves 100% of the cases, so known bad cases are added to
    the dictionary by hand. Here a name counts as a bad case when its
    dictionary entry carries the ``NS.Z`` label.

    :param name: the word to look up
    :return: False when the name has no dictionary entry; otherwise the
        result of ``containsLabel(NS.Z)`` on that entry
    """
    entries = PlaceDictionary.dictionary.get(name)
    if entries is None:
        return False

    labels = EnumItem().init3(dict(entries))
    if labels is None:
        return False
    return labels.containsLabel(NS.Z)
def roleObserve(wordSegResult):
    """Role observation: collect the role labels for every word.

    Roles come from the person dictionary; words missing from it get
    labels from the rules below, chosen by the word's guessed nature.

    :param wordSegResult: the coarse segmentation result (iterable of vertices)
    :return: list with one label entry per vertex, in input order
    """
    def default_labels():
        # The NR.A label weighted by its total frequency.
        return EnumItem().init1(
            NR.A,
            PersonDictionary.transformMatrixDictionary.getTotalFrequency(NR.A),
        ).labelMap.items()

    observed = []
    for node in wordSegResult:
        labels = PersonDictionary.dictionary.get(node.realword)
        if labels is None:
            for matches in Switch(node.guessNature()):
                if matches(Nature.nr):
                    # Some two-character names can actually form longer
                    # three-character names.
                    rare_two_char = (node.getAttribute().totalFrequency <= 1000
                                     and len(node.realword) == 2)
                    if rare_two_char:
                        labels = EnumItem().init2(NR.X, NR.G).labelMap.items()
                    else:
                        labels = default_labels()
                    break
                if matches(Nature.nnt):
                    # Surname + job title.
                    labels = EnumItem().init2(NR.G, NR.K).labelMap.items()
                    break
                if matches():
                    labels = default_labels()
                    break
        observed.append(labels)
    return observed
def insert(vertexList, tagList, wordNetAll, line, ns):
    """Append the first vertex of a word-net line together with a fixed label.

    :param vertexList: list that receives the vertex (mutated in place)
    :param tagList: list that receives the label item (mutated in place)
    :param wordNetAll: object providing ``getFirst(line)``
    :param line: argument passed to ``wordNetAll.getFirst``
    :param ns: label to attach, with a frequency of 1000
    :raises AssertionError: when ``getFirst(line)`` returns None
    """
    head = wordNetAll.getFirst(line)
    assert head is not None
    vertexList.append(head)
    # NOTE(review): this appends the EnumItem itself, while the roleTag
    # functions append ``labelMap.items()`` - confirm the consumer accepts both.
    label_item = EnumItem().init1(ns, 1000)
    tagList.append(label_item)
except Exception, e: bytes = IOUtil().readBytes(path) out = file(path + Predefine.PIC_EXT, 'wb') cPickle.dump(bytes, out) if bytes is None: return None nsArray = list(NS) index = 0 size = ByteUtil.bytesHighFirstToInt(bytes, index) index += 4 valueArray = [None] * size item = None for i in range(size): currentSize = ByteUtil.bytesHighFirstToInt(bytes, index) index += 4 item = EnumItem() tm_dict = {} for j in range(currentSize): ns = nsArray[ByteUtil.bytesHighFirstToInt(bytes, index)] index += 4 frequency = ByteUtil.bytesHighFirstToInt(bytes, index) index += 4 item = item.init1(ns, frequency) valueArray[i] = item.labelMap.items() return valueArray def onSaveValue(self, valueArray, path): return self.saveDat(path + '.value.dat', valueArray) def saveDat(self, path, valueArray): try: