def addState(self, character):
    """
    Return the child state reached through `character`, creating it if needed.

    The character is UTF-8 encoded before the lookup. When no child exists,
    a new state one level deeper is stored in the goto table and the table
    is rebuilt through ``TreeMap(...).sort()``.

    :param character: the transition character
    :return: the existing or newly created child state
    """
    key = character.encode('utf-8')
    child = self.nextStateIgnoreRootState(key)
    if child is not None:
        return child

    child = State().init1(self.depth + 1)
    transitions = self.success.result
    transitions[key] = child
    self.success = TreeMap(inputDict=transitions).sort()
    return child
def init2(self, *args):
    """
    Build an entry in which every given label has a frequency of 1.

    :param args: the labels to register
    :return: self, so the call can be chained
    """
    counts = self.initdict
    for tag in args:
        counts[tag] = 1
    self.labelMap = TreeMap(counts)
    return self
def __init__(self):
    """Set up an empty state: depth 0, no emits, empty goto table, index 0, no failure link."""
    # Length of the pattern, which is also the depth of this state
    self.depth = 0
    # Patterns recorded for this state (set as soon as the state is reachable)
    self.emits = None
    # goto table, a.k.a. the transition function: maps the next character
    # of the string to the next state
    self.success = TreeMap({})
    # Corresponding index in the double array
    self.index = 0
    # fail function: the state to jump to when nothing matches
    self.failure = None
def load(self, path):
    """
    Load the core dictionary from a text file and write a binary cache.

    The cached form is tried first through ``self.loadDat(path)``. Otherwise
    each line is split on tabs into a word followed by (nature, frequency)
    pairs, the trie is built from the result, and the attributes plus the
    trie are written to ``self.path + Predefine.BIN_EXT``.

    Reading stops at end of file or at the first line that is empty after
    stripping whitespace.

    :param path: path of the dictionary text file
    :return: True on success (including a cache hit); False when the text
             file cannot be read or the cache cannot be written
    """
    self.logger.info("核心词典开始加载:%s" % path)
    print("核心词典开始加载:%s" % path)
    if self.loadDat(path):
        return True
    initdict = OrderedDict()
    try:
        # `with` closes the dictionary file even when a line fails to parse.
        with open(path, 'r') as f:
            MAX_FREQUENCY = 0
            start = time()
            while 1:
                line = f.readline().strip(' \n\t\r')
                if not line:
                    break
                param = line.split('\t')
                # param[0] is the word; the rest are (nature, frequency) pairs
                natureCount = (len(param) - 1) // 2
                attribute = CoreDictionary.Attribute().init1(natureCount)
                for i in range(natureCount):
                    attribute.nature[i] = Nature.valueOf(param[1 + 2 * i])
                    attribute.frequency[i] = int(param[2 + 2 * i])
                    attribute.totalFrequency += attribute.frequency[i]
                initdict[param[0]] = attribute
                MAX_FREQUENCY += attribute.totalFrequency
        wordMap = TreeMap(initdict)
        self.logger.info("核心词典读入词条%i,全部频次%i,耗时%fms" %
                         (wordMap.size(), MAX_FREQUENCY, (time() - start) * 1000))
        print("核心词典读入词条%i,全部频次%i,耗时%fms" %
              (wordMap.size(), MAX_FREQUENCY, (time() - start) * 1000))
        self.trie.build(wordMap)
        self.logger.info("核心词典加载成功:%i个词条,下面将写入缓存" % self.trie.size1())
        print("核心词典加载成功:%i个词条,下面将写入缓存" % self.trie.size1())
        try:
            # NOTE(review): the cache name is derived from self.path, not the
            # `path` argument - confirm this is intended.
            with open(self.path + Predefine.BIN_EXT, 'w+') as out:
                attributeList = wordMap.values()
                out.writelines(Convert.convert(len(attributeList)))
                for attribute in attributeList:
                    out.writelines(Convert.convert(attribute.totalFrequency))
                    out.writelines(Convert.convert(len(attribute.nature)))
                    for i in range(len(attribute.nature)):
                        out.writelines(Convert.convert(Nature.ordinal(attribute.nature[i])))
                        out.writelines(Convert.convert(attribute.frequency[i]))
                self.trie.save(out)
        except Exception as e:
            self.logger.warning("保存失败%s" % str(e))
            return False
    except IOError as e:
        self.logger.warning("核心词典%s不存在或读取错误!" % str(e))
        return False
    return True
def load(self, path):
    """
    Load a key/value dictionary whose values come from ``self.onLoadValue``.

    Values are loaded first. If the cached trie ``path + '.trie.dat'`` loads,
    nothing else is done. Otherwise the keys (first space-separated field of
    each line) are read from `path` and the trie is built from keys and
    values. If that build reports a non-zero code, the pairs are sorted
    through a TreeMap, the trie is rebuilt, and `valueArray` is rewritten in
    the sorted order. The trie is then saved to ``path + '.trie.dat'``.

    :param path: path of the dictionary text file
    :return: False when the values cannot be loaded, True otherwise
    """
    start = time()
    valueArray = self.onLoadValue(path)
    if valueArray is None:
        self.logger.warning("加载值%s.value.dat失败,耗时%fms" % (path, (time() - start) * 1000))
        return False
    self.logger.info("加载值%s.value.dat成功,耗时%fms" % (path, (time() - start) * 1000))
    print("加载值%s.value.dat成功,耗时%fms" % (path, (time() - start) * 1000))
    start = time()
    if self.loadDat(path + '.trie.dat', valueArray):
        self.logger.info("加载键%s.trie.dat成功,耗时%fms" % (path, (time() - start) * 1000))
        print("加载键%s.trie.dat成功,耗时%fms" % (path, (time() - start) * 1000))
        return True
    keyList = []
    try:
        # `with` closes the key file even if reading fails half-way.
        with open(path, 'r') as br:
            while 1:
                # Reading stops at EOF or at the first blank line.
                line = br.readline().encode('utf-8').strip(' \n\t\r')
                if not line:
                    break
                keyList.append(line.split(' ')[0])
    except Exception as e:
        # Best effort: a read failure is logged and the build continues
        # with whatever keys were collected.
        self.logger.warning("读取%s失败%s" % (path, str(e)))
    resultcode = self.trie.kvbuild(keyList, valueArray)
    if resultcode != 0:
        self.logger.warning("trie建立失败%i,正在尝试排序后重载" % resultcode)
        initdict = {}
        for i in range(len(valueArray)):
            initdict[keyList[i]] = valueArray[i]
        sortedMap = TreeMap(initdict).sort()
        self.trie.build(sortedMap)
        # Realign the values with the sorted key order.
        for i, v in enumerate(sortedMap.values()):
            valueArray[i] = v
    self.trie.save(path + '.trie.dat')
    self.logger.info(path + "加载成功")
    return True
def loadtxt(path, defaultNature, map, customNatureCollector):
    """
    Load a user dictionary text file.

    Each line is a word optionally followed by space-separated
    (nature, frequency) pairs; a word without pairs gets `defaultNature`.

    NOTE(review): the incoming `map` is replaced by a new TreeMap built from
    this file only, not extended - confirm whether appending was intended.

    :param path: dictionary path
    :param defaultNature: default part of speech
    :param map: map returned unchanged when loading fails
    :param customNatureCollector: collector for user-defined natures
                                  (not used by this function)
    :return: ``(True, new_map)`` on success, ``(False, map)`` on failure
    """
    try:
        initdict = OrderedDict()
        # `with` closes the file even when a line fails to parse.
        with open(path, 'r') as br:
            while 1:
                # Reading stops at EOF or at the first blank line.
                line = br.readline().encode().strip()
                if not line:
                    break
                param = line.split(" ")
                # Floor division keeps the count an int on Python 3 as well.
                natureCount = (len(param) - 1) // 2
                if natureCount == 0:
                    attribute = CoreDictionary.Attribute().init5(defaultNature)
                else:
                    attribute = CoreDictionary.Attribute().init1(natureCount)
                    for i in range(natureCount):
                        attribute.nature[i] = Nature.valueOf(param[1 + 2 * i])
                        attribute.frequency[i] = int(param[2 + 2 * i])
                        attribute.totalFrequency += attribute.frequency[i]
                initdict[param[0]] = attribute
        map = TreeMap(initdict)
    except Exception as e:
        Predefine.logger.warning("自定义词典%s读取错误%s" % (path, e))
        return False, map
    return True, map
class EnumItem(Enum):
    """
    A dictionary entry mapping labels to frequencies.

    The frequencies are kept in ``initdict`` and mirrored into ``labelMap``,
    a TreeMap rebuilt by every ``init*`` method.
    """

    def __init__(self):
        Enum.__init__(self)
        self.initdict = {}
        self.labelMap = TreeMap({})  # TreeMap object

    def getFrequency(self, label):
        """Return the frequency stored for `label`, or 0 when there is none."""
        frequency = self.labelMap.get(label)
        if frequency is None:
            return 0
        return frequency

    def init1(self, label, frequency):
        """Register a single label with its frequency and return self."""
        self.initdict[label] = frequency
        self.labelMap = TreeMap(self.initdict)
        return self

    def init2(self, *args):
        """
        Build an entry in which every given label has a frequency of 1.

        :param args: the labels to register
        :return: self
        """
        for label in args:
            self.initdict[label] = 1
        self.labelMap = TreeMap(self.initdict)
        return self

    def init3(self, initdict):
        """Adopt `initdict` as the label/frequency mapping and return self."""
        self.initdict = initdict
        self.labelMap = TreeMap(self.initdict)
        return self

    def init4(self, initdict):
        """Same behavior as init3; kept as a separate entry point for callers."""
        self.initdict = initdict
        self.labelMap = TreeMap(self.initdict)
        return self

    def containsLabel(self, label):
        """Return True when `label` is a key of the label map."""
        return label in self.labelMap.result

    @staticmethod
    def create(param):
        """
        Parse a space-separated line of the form "word label freq label freq ...".

        :param param: the line, or None
        :return: None for None input, otherwise the result of create1
        """
        if param is None:
            return None
        array = param.split(' ')
        return EnumItem.create1(array)

    @staticmethod
    def create1(param):
        """
        Build ``{word: [{label: freq}, ...]}`` from a token list.

        :param param: ``[word, label1, freq1, label2, freq2, ...]``
        :return: the mapping, or None when the token count is even (a label
                 without a frequency, or an empty list)
        """
        if len(param) % 2 == 0:
            return None
        # Floor division: `/` gives a float on Python 3, which cannot size a list.
        natureCount = (len(param) - 1) // 2
        entries = [{param[1 + 2 * i]: int(param[2 + 2 * i])}
                   for i in range(natureCount)]
        return {param[0]: entries}
def load(self):
    """
    Load the transliterated person-name dictionary.

    A fresh DoubleArrayTrie is installed on the class, then the cached form
    is tried through ``self.loadDat()``. Otherwise every line of
    ``TranslatedPersonDictionary.path`` (up to the first blank line) becomes
    a key mapped to True, the trie is built, and ``self.saveDat()`` is called
    to write the cache.

    :return: True
    """
    TranslatedPersonDictionary.trie = DoubleArrayTrie()
    if self.loadDat():
        return True
    initdict = OrderedDict()
    # `with` closes the dictionary file once reading is done or fails.
    with open(TranslatedPersonDictionary.path, 'r') as br:
        while 1:
            line = br.readline().encode().strip()
            if not line:
                break
            initdict[line] = True
    map = TreeMap(initdict)
    Predefine.logger.info("音译人名词典%s开始构建双数组..." % TranslatedPersonDictionary.path)
    print("音译人名词典%s开始构建双数组..." % TranslatedPersonDictionary.path)
    TranslatedPersonDictionary.trie.build(map)
    Predefine.logger.info("音译人名词典%s开始编译DAT文件..." % TranslatedPersonDictionary.path)
    print("音译人名词典%s开始编译DAT文件..." % TranslatedPersonDictionary.path)
    Predefine.logger.info(
        "音译人名词典%s编译结果:%s" % (TranslatedPersonDictionary.path, self.saveDat()))
    return True
def init(self):
    """
    Load the person-name dictionary, its transform matrix and the pattern trie.

    The process exits when the main dictionary cannot be loaded. Otherwise the
    transform matrix is initialised with NR and loaded, and a trie is built
    from every NRPattern keyed by its string form.
    """
    began = time()
    loaded = PersonDictionary.dictionary.load(Config.PersonDictionaryPath)
    if not loaded:
        self.logger.error("人名词典加载失败:%s" % Config.PersonDictionaryPath)
        # NOTE(review): exits with status 0 even though loading failed.
        sys.exit(0)

    PersonDictionary.transformMatrixDictionary.init1(NR)
    PersonDictionary.transformMatrixDictionary.load(Config.PersonDictionaryTrPath)

    patterns = dict((str(pattern), pattern) for pattern in NRPattern)
    sortedPatterns = TreeMap(patterns).sort()
    PersonDictionary.trie.build(sortedPatterns)

    self.logger.info("%s加载成功,耗时%fms" %
                     (Config.PersonDictionaryPath, (time() - began) * 1000))
def load(self):
    """
    Load the place-name dictionary, its transform matrix and the pattern trie.

    The result of the dictionary load is not checked; the success message is
    logged and printed unconditionally. The transform matrix is re-created
    through ``init1(NS)`` and loaded, and the trie is built from four fixed
    patterns.
    """
    start = time()
    PlaceDictionary.dictionary.load(Config.PlaceDictionaryPath)
    Predefine.logger.info("%s加载成功,耗时%fms" %
                          (Config.PlaceDictionaryPath, (time() - start) * 1000))
    # print() call form: valid on Python 2 and 3, matching the rest of the file.
    print("%s加载成功,耗时%fms" % (Config.PlaceDictionaryPath, (time() - start) * 1000))
    PlaceDictionary.transformMatrixDictionary = \
        PlaceDictionary.transformMatrixDictionary.init1(NS)
    PlaceDictionary.transformMatrixDictionary.load(Config.PlaceDictionaryTrPath)
    init_dict = {
        "CDEH": "CDEH",
        "CDH": "CDH",
        "CH": "CH",
        "GH": "GH",
    }
    PlaceDictionary.trie.build(TreeMap(init_dict))
class CustomNatureUtility(object):
    """Holder for user-defined natures (parts of speech)."""

    # Runs once, when the class body is executed.
    Predefine.logger.warning("已激活自定义词性功能,用户需对本地环境的兼容性和稳定性负责!\n")
    extraValueMap = TreeMap({})
    enumBuster = EnumBuster()

    def __init__(self):
        pass

    def addNature(self, name):
        """
        Look up the custom nature registered under `name`.

        Nothing is added to the map here: the lookup result is returned
        as-is, whether or not an entry was found.

        :param name: nature name
        :return: whatever ``extraValueMap.get(name)`` returns
        """
        return self.extraValueMap.get(name)
def load(self):
    """
    Load the Japanese person-name dictionary.

    A fresh DoubleArrayTrie is installed on the class, then the cached form
    is tried through ``self.loadDat()``. Otherwise each line of
    ``JapanesePersonDictionary.path`` (up to the first blank line) is split
    on spaces into a key and a value, the trie is built, and
    ``self.saveDat(map)`` is called to write the cache.

    :return: True
    """
    JapanesePersonDictionary.trie = DoubleArrayTrie()
    if self.loadDat():
        return True
    initdict = OrderedDict()
    # `with` closes the dictionary file even if a malformed line raises.
    with open(JapanesePersonDictionary.path, 'r') as br:
        while 1:
            line = br.readline().encode().strip()
            if not line:
                break
            param = line.split(" ")
            initdict[param[0]] = param[1]
    map = TreeMap(initdict)
    Predefine.logger.info("日本人名词典%s开始构建双数组..." % JapanesePersonDictionary.path)
    JapanesePersonDictionary.trie.build(map)
    Predefine.logger.info("日本人名词典%s开始编译DAT文件..." % JapanesePersonDictionary.path)
    Predefine.logger.info("日本人名词典%s编译结果:%s" %
                          (JapanesePersonDictionary.path, str(self.saveDat(map))))
    return True
def loadMainDictionary(mainPath):
    """
    Load the custom dictionaries listed in ``CustomDictionary.path``.

    The cached form is tried first through ``CustomDictionary.loadDat``.
    Otherwise a fresh trie is installed and every configured path is loaded
    with ``Nature.n`` as the default nature.

    NOTE(review): after the text files are loaded this definition falls off
    the end and returns None without building the trie - confirm whether
    this copy of the function is complete.

    :param mainPath: path of the main custom dictionary
    :return: True on a cache hit, otherwise None
    """
    Predefine.logger.info("自定义词典开始加载:%s" % mainPath)
    # print() call form and `except ... as` are valid on Python 2.6+ and 3.
    print("自定义词典开始加载:%s" % mainPath)
    if CustomDictionary.loadDat(mainPath):
        return True
    CustomDictionary.dat = DoubleArrayTrie()
    map = TreeMap({})
    customNatureCollector = set()
    try:
        for p in CustomDictionary.path:
            defaultNature = Nature.n
            Predefine.logger.info("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
            print("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
            success, map = CustomDictionary.loadtxt(
                p, defaultNature, map, customNatureCollector)
            if not success:
                Predefine.logger.warning("失败:%s" % p)
    except IOError as e:
        Predefine.logger.error("自定义词典%s不存在或读取错误!%s" % (mainPath, e))
def loadMainDictionary(mainPath):
    """
    Load the custom dictionaries, build the trie and write the binary cache.

    The cached form is tried first through ``CustomDictionary.loadDat``.
    Otherwise every path in ``CustomDictionary.path`` is loaded with
    ``Nature.n`` as the default nature, the DoubleArrayTrie is built, and the
    attributes plus the trie are written to ``mainPath + Predefine.BIN_EXT``.

    :param mainPath: path of the main custom dictionary
    :return: True
    """
    Predefine.logger.info("自定义词典开始加载:%s" % mainPath)
    print("自定义词典开始加载:%s" % mainPath)
    if CustomDictionary.loadDat(mainPath):
        return True
    CustomDictionary.dat = DoubleArrayTrie()
    map = TreeMap({})
    customNatureCollector = set()
    try:
        for p in CustomDictionary.path:
            defaultNature = Nature.n
            Predefine.logger.info("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
            print("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
            success, map = CustomDictionary.loadtxt(
                p, defaultNature, map, customNatureCollector)
            if not success:
                Predefine.logger.warning("失败:%s" % p)
    except IOError as e:
        Predefine.logger.error("自定义词典%s不存在或读取错误!%s" % (mainPath, e))
    except Exception as e:
        Predefine.logger.error("自定义词典%s缓存失败!%s\n" % (mainPath, e))
    if map.size() == 0:
        Predefine.logger.warning("没有加载到任何词条")
        # Used as a blank placeholder.
        # NOTE(review): the placeholder value is None, and the save loop below
        # calls attribute.save(out) on every value - confirm this path works.
        map.put(Predefine.TAG_OTHER, None)
    Predefine.logger.info("正在构建DoubleArrayTrie……")
    CustomDictionary.dat.build(map)
    # Cache as a dat file; the next load will be much faster.
    Predefine.logger.info("正在缓存词典为dat文件……")
    # Values to cache, in the map's iteration order
    attributeList = [value for key, value in map.items()]
    # `with` closes the cache file even if one of the writes fails.
    with open(mainPath + Predefine.BIN_EXT, 'w+') as out:
        # Caching of user-defined natures is disabled:
        # IOUtil.writeCustomNature(out, customNatureCollector)
        # Cache the body
        out.writelines(Convert.convert(len(attributeList)))
        for attribute in attributeList:
            attribute.save(out)
        CustomDictionary.dat.save1(out)
    return True
def init4(self, initdict):
    """
    Adopt `initdict` as the label/frequency mapping and rebuild the label map.

    :param initdict: mapping stored on the instance and handed to TreeMap
    :return: self, so the call can be chained
    """
    self.initdict = initdict
    rebuilt = TreeMap(self.initdict)
    self.labelMap = rebuilt
    return self
def load(self, path):
    """
    Load the bigram dictionary into the ``start`` / ``pair`` tables.

    The cached form is tried first through ``self.loadDat``. Otherwise each
    line "a@b freq" is resolved to word ids through ``CoreDictionary.trie``;
    lines with an unknown word are skipped. The frequencies are grouped per
    first-word id, sorted, and flattened: ``pair`` holds (idB, freq) couples
    and ``start[i + 1]`` holds the number of couples belonging to ids
    ``0..i``.

    :param path: path of the bigram text file
    :return: True on success (including a cache hit), False on an I/O error
    """
    if self.loadDat(CoreBiGramTableDictionary.datPath):
        return True
    # TreeMap: idA -> TreeMap(idB -> frequency)
    map = TreeMap({})
    try:
        total = 0
        # `with` closes the file even when a line fails to parse.
        with open(path, 'r') as br:
            maxWordId = CoreDictionary.trie.size1()
            while 1:
                # Reading stops at EOF or at the first blank line.
                line = br.readline().strip("\n\r\t ")
                if not line:
                    break
                params = line.split(' ')
                twoWord = params[0].split("@")
                idA = CoreDictionary.trie.exactMatchSearch(twoWord[0])
                if idA == -1:
                    continue
                idB = CoreDictionary.trie.exactMatchSearch(twoWord[1])
                if idB == -1:
                    continue
                freq = int(params[1])
                biMap = map.get(idA)
                if biMap is None:
                    biMap = TreeMap({})
                biMap.put(int(idB), freq)
                map.put(int(idA), biMap)
                total += 2
        for k, v in map.items():
            map.put(k, v.sort_long())
        map.sort_long()
        CoreBiGramTableDictionary.start = [0] * (maxWordId + 1)
        # total is the number of accepted lines * 2
        CoreBiGramTableDictionary.pair = [0] * total
        offset = 0
        for i in range(maxWordId):
            bMap = map.get(i)
            if bMap is not None:
                for k, v in bMap.items():
                    index = offset << 1
                    CoreBiGramTableDictionary.pair[index] = k
                    CoreBiGramTableDictionary.pair[index + 1] = v
                    offset += 1
            CoreBiGramTableDictionary.start[i + 1] = offset
        # The placeholder was previously logged verbatim; fill it with the path.
        self.logger.info("二元词典读取完毕:%s" % path)
    except IOError as e:
        # The logger object itself is not callable; use warning().
        self.logger.warning("二元词典%s不存在或读取错误!%s" % (path, e))
        return False
    # Report success explicitly instead of falling through with None.
    return True
class State(object):
    """
    A node of the pattern-matching automaton.

    Each state has a depth, an optional collection of emitted pattern ids, a
    goto table (``success``), an index into the double array and a failure
    link.
    """

    def __init__(self):
        # Length of the pattern, which is also the depth of this state
        self.depth = int()
        # Patterns recorded for this state (set as soon as the state is reachable)
        self.emits = None
        # goto table, a.k.a. the transition function: maps the next character
        # of the string to the next state
        self.success = TreeMap({})
        # Corresponding index in the double array
        self.index = int()
        # fail function: the state to jump to when nothing matches
        self.failure = None

    def init1(self, depth):
        """
        Configure this node with the given depth.

        :param depth: depth of the node
        :return: self
        """
        self.depth = depth
        return self

    def isAcceptable(self):
        """
        Tell whether this is a terminal state.

        :return: True when the depth is positive and emits have been recorded
        """
        return self.depth > 0 and self.emits is not None

    def getDepth(self):
        """
        Return the depth of this node.

        :return: the depth
        """
        return self.depth

    def getLargestValueId(self):
        """
        Return the first emit, which is the largest once addEmit has sorted them.

        :return: the first element of ``emits``, or None when there are none
        """
        if self.emits is None or len(self.emits) == 0:
            return None
        # next(iter(...)) works on Python 2.6+ and 3; the .next() method does not.
        return next(iter(self.emits))

    def addEmit(self, keyword):
        """
        Record a matched pattern for this state.

        ``emits`` is kept as a list of unique values in descending order.

        :param keyword: the pattern id to record
        """
        merged = set(self.emits) if self.emits is not None else set()
        merged.add(keyword)
        self.emits = sorted(merged, reverse=True)

    def addEmit1(self, emits):
        """
        Record several matched patterns.

        :param emits: iterable of pattern ids
        """
        for emit in emits:
            self.addEmit(emit)

    def nextStateIgnoreRootState(self, character):
        """Follow `character` without the root fallback; may return None."""
        return self.nextState1(character, True)

    def addState(self, character):
        """
        Return the child reached through `character`, creating it if needed.

        The character is UTF-8 encoded before the lookup. A newly created
        child is stored in the goto table, which is then rebuilt through
        ``TreeMap(...).sort()``.

        :param character: the transition character
        :return: the existing or newly created child state
        """
        character = character.encode('utf-8')
        nextState = self.nextStateIgnoreRootState(character)
        if nextState is None:
            nextState = State().init1(self.depth + 1)
            self.success.result[character] = nextState
            self.success = TreeMap(inputDict=self.success.result).sort()
        return nextState

    def getSuccess(self):
        """
        Return the goto table.

        :return: the ``success`` map
        """
        return self.success

    def setIndex(self, index):
        """Set the index of this state in the double array."""
        self.index = index

    def getIndex(self):
        """Return the index of this state in the double array."""
        return self.index

    def getStates(self):
        """Return the child states stored in the goto table."""
        return self.success.values()

    def setFailure(self, failState, fail):
        """
        Set the failure link and mirror it into the `fail` table.

        :param failState: the state to fall back to
        :param fail: table indexed by state index; receives failState.index
        """
        self.failure = failState
        fail[self.index] = failState.index

    def getEmit(self):
        """
        Return the pattern(s) represented by this node.

        :return: an empty set when nothing was recorded, otherwise ``emits``
        """
        if self.emits is None:
            return set()
        return self.emits

    def getTransitions(self):
        """Return the set of characters that have an outgoing transition."""
        return set(self.success.keys())

    def nextState(self, character):
        """
        Follow `character`; a failed transition from the root returns the
        root itself, so the root never yields None.

        :param character: the transition character
        :return: the next state
        """
        return self.nextState1(character, False)

    def nextState1(self, character, ignoreRootState):
        """
        Move to the next state.

        :param character: the character to follow
        :param ignoreRootState: when True the root fallback is disabled and
                                a missing transition yields None
        :return: the transition result
        """
        nextState = self.success.get(character)
        if not ignoreRootState and nextState is None and self.depth == 0:
            nextState = self
        return nextState

    def getFailure(self):
        """
        Return the failure state.

        :return: the ``failure`` link
        """
        return self.failure
def init1(self, label, frequency):
    """
    Register a single label with its frequency and rebuild the label map.

    :param label: the label to store
    :param frequency: the frequency recorded for the label
    :return: self, so the call can be chained
    """
    counts = self.initdict
    counts[label] = frequency
    self.labelMap = TreeMap(counts)
    return self
def __init__(self):
    """Run the Enum initialiser, then start with no labels and an empty label map."""
    Enum.__init__(self)
    # Plain label -> frequency mapping
    self.initdict = dict()
    # TreeMap object
    self.labelMap = TreeMap(dict())
def __init__(self):
    """Start with an empty TreeMap of extra values."""
    emptyMap = TreeMap({})
    self.extraValueMap = emptyMap
return False  # NOTE(review): tail of a definition that begins outside this view; re-indent to match it


# Manual smoke test: sorts a small dictionary through TreeMap, builds a
# DoubleArrayTrie from the sorted result, then loads a trie file from disk.
if __name__ == '__main__':
    inputDict = {
        'aaa': 'aaa',
        'fff': 'fff',
        'bbb': 'bbb',
        '111': '111',
        '11': '11',
        'ccc': 'ddd',
        'ddd': 'ddd',
        'd': 'd'
    }
    tm = TreeMap(inputDict)
    tm.sort()
    # Disabled debugging output:
    # #print type(tm.result)
    # #print type(tm.result.items())
    # for key, value in tm.result.items():
    #     #print key, value
    #print tm.result.items()
    #print 'hdsj'
    trie = DoubleArrayTrie()
    print(trie.size)
    trie.build(tm.result)
    #print trie.size
    # NOTE(review): hard-coded absolute path from a developer machine.
    DoubleArrayTrie().loadBaseAndCheckByFileChannel(
        "D:/liepin_project/py-segmentation/data/dictionary/person/nr.txt.trie.dat"
    )