class CustomDictionary(object):
    """User-defined dictionary: a double-array trie plus a dynamic binary trie.

    Instantiating the class triggers :meth:`load`, which reads the main
    dictionary named by the first entry of ``Config.CustomDictionaryPath``.
    """

    # Binary trie that stores entries inserted dynamically by the user.
    trie = BinTrie()
    dat = DoubleArrayTrie()
    # The first path is the main dictionary, the others are secondary ones.
    path = Config.CustomDictionaryPath
    start = time()

    def __init__(self):
        CustomDictionary.load()

    @staticmethod
    def load():
        """Load the custom dictionary and log how long it took.

        :return: True when the main dictionary was loaded.
        """
        start = time()
        if not CustomDictionary.loadMainDictionary(CustomDictionary.path[0]):
            # NOTE(review): the failure path returns None (falsy), not False.
            Predefine.logger.warning(
                "自定义词典%s加载失败" % (" ".join(CustomDictionary.path)))
        else:
            Predefine.logger.info(
                "自定义词典加载成功:%i个词条,耗时%fms"
                % (CustomDictionary.dat.size, (time() - start) * 1000))
            print("自定义词典%s加载成功:%i个词条,耗时%fms" % (
                " ".join(CustomDictionary.path),
                CustomDictionary.dat.size1(),
                (time() - start) * 1000))
            return True

    @staticmethod
    def loadMainDictionary(mainPath):
        """Load the main dictionary, preferring the cached binary form.

        :param mainPath: path of the main dictionary; used for the cache
            lookup and in log messages.
        :return: True when the cached form could be loaded.
        """
        Predefine.logger.info("自定义词典开始加载:%s" % mainPath)
        print("自定义词典开始加载:%s" % mainPath)
        if CustomDictionary.loadDat(mainPath):
            return True

        # No usable cache: start over and read every text dictionary.
        CustomDictionary.dat = DoubleArrayTrie()
        entries = TreeMap({})
        customNatureCollector = set()
        try:
            for p in CustomDictionary.path:
                defaultNature = Nature.n
                Predefine.logger.info(
                    "以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
                print("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
                success, entries = CustomDictionary.loadtxt(
                    p, defaultNature, entries, customNatureCollector)
                if not success:
                    Predefine.logger.warning("失败:%s" % p)
        except IOError as e:
            Predefine.logger.error("自定义词典%s不存在或读取错误!%s" % (mainPath, e))
        except Exception as e:
            Predefine.logger.error("自定义词典%s缓存失败!%s\n" % (mainPath, e))
        # NOTE(review): nothing is returned after the text dictionaries are
        # read, so this path yields None and load() reports a failure --
        # confirm whether the trie build / cache step is missing here.
def load(self):
    """Load the transliterated-person-name dictionary into the trie.

    The cached DAT file is tried first through ``self.loadDat()``.  When
    that fails, the text dictionary is read (one name per line), the
    double-array trie is built from it and the result is written back
    with ``self.saveDat()``.

    :return: True once the trie has been loaded or rebuilt.
    """
    TranslatedPersonDictionary.trie = DoubleArrayTrie()
    if self.loadDat():
        return True

    initdict = OrderedDict()
    # The context manager closes the file even if reading fails.  Lines are
    # kept as str (the previous ``.encode()`` produced bytes keys on
    # Python 3); UTF-8 matches the default codec that ``.encode()`` used.
    # NOTE(review): confirm the dictionary file is UTF-8 encoded.
    with open(TranslatedPersonDictionary.path, 'r', encoding='utf-8') as br:
        for raw in br:
            line = raw.strip()
            if not line:
                # A blank line is skipped instead of ending the read early.
                continue
            initdict[line] = True

    entries = TreeMap(initdict)
    Predefine.logger.info(
        "音译人名词典%s开始构建双数组..." % TranslatedPersonDictionary.path)
    print("音译人名词典%s开始构建双数组..." % TranslatedPersonDictionary.path)
    TranslatedPersonDictionary.trie.build(entries)
    Predefine.logger.info(
        "音译人名词典%s开始编译DAT文件..." % TranslatedPersonDictionary.path)
    print("音译人名词典%s开始编译DAT文件..." % TranslatedPersonDictionary.path)
    Predefine.logger.info(
        "音译人名词典%s编译结果:%s"
        % (TranslatedPersonDictionary.path, self.saveDat()))
    return True
def loadMainDictionary(mainPath):
    """Load the custom dictionary, rebuilding and caching it when needed.

    The cached form is tried first via ``CustomDictionary.loadDat``.  If
    that fails, every path in ``CustomDictionary.path`` is read with
    ``CustomDictionary.loadtxt``, the double-array trie is rebuilt and the
    result is written to ``mainPath + Predefine.BIN_EXT``.

    :param mainPath: path of the main dictionary; also the prefix of the
        cache file.
    :return: True
    """
    Predefine.logger.info("自定义词典开始加载:%s" % mainPath)
    print("自定义词典开始加载:%s" % mainPath)
    if CustomDictionary.loadDat(mainPath):
        return True

    CustomDictionary.dat = DoubleArrayTrie()
    entries = TreeMap({})
    customNatureCollector = set()
    try:
        for p in CustomDictionary.path:
            defaultNature = Nature.n
            Predefine.logger.info(
                "以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
            print("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
            success, entries = CustomDictionary.loadtxt(
                p, defaultNature, entries, customNatureCollector)
            if not success:
                Predefine.logger.warning("失败:%s" % p)
    except IOError as e:
        Predefine.logger.error("自定义词典%s不存在或读取错误!%s" % (mainPath, e))
    except Exception as e:
        Predefine.logger.error("自定义词典%s缓存失败!%s\n" % (mainPath, e))

    if entries.size() == 0:
        Predefine.logger.warning("没有加载到任何词条")
        # Blank placeholder so the trie is never built from an empty map.
        entries.put(Predefine.TAG_OTHER, None)

    Predefine.logger.info("正在构建DoubleArrayTrie……")
    CustomDictionary.dat.build(entries)

    # Cache as a dat file so the next load is much faster.
    Predefine.logger.info("正在缓存词典为dat文件……")
    attributeList = [value for _key, value in entries.items()]
    # The context manager closes the cache file even when a write fails.
    with open(mainPath + Predefine.BIN_EXT, 'w+') as out:
        # The collected custom natures are not written to the cache.
        out.writelines(Convert.convert(len(attributeList)))
        for attribute in attributeList:
            if attribute is None:
                # The placeholder above has no attribute object; calling
                # .save() on it raised AttributeError.  Write an empty
                # record instead: total frequency 0 followed by 0 natures.
                # NOTE(review): assumes the record layout is those two
                # integers followed by the nature entries -- confirm
                # against Attribute.save.
                out.writelines(Convert.convert(0))
                out.writelines(Convert.convert(0))
            else:
                attribute.save(out)
        CustomDictionary.dat.save1(out)
    return True
def load(self):
    """Load the Japanese-person-name dictionary into the trie.

    The cached DAT file is tried first through ``self.loadDat()``.  When
    that fails, the text dictionary is read; each line holds a key and a
    value separated by a single space.  The trie is then built and saved
    with ``self.saveDat(map)``.

    :return: True once the trie has been loaded or rebuilt.
    :raises IndexError: if a non-blank line contains no space-separated
        value.
    """
    JapanesePersonDictionary.trie = DoubleArrayTrie()
    if self.loadDat():
        return True

    initdict = OrderedDict()
    # Lines stay str: the previous ``.encode()`` produced bytes, and
    # ``bytes.split(" ")`` raises TypeError on Python 3.  UTF-8 matches the
    # default codec that ``.encode()`` used.
    # NOTE(review): confirm the dictionary file is UTF-8 encoded.
    with open(JapanesePersonDictionary.path, 'r', encoding='utf-8') as br:
        for raw in br:
            line = raw.strip()
            if not line:
                # A blank line is skipped instead of ending the read early.
                continue
            param = line.split(" ")
            initdict[param[0]] = param[1]

    entries = TreeMap(initdict)
    Predefine.logger.info(
        "日本人名词典%s开始构建双数组..." % JapanesePersonDictionary.path)
    JapanesePersonDictionary.trie.build(entries)
    Predefine.logger.info(
        "日本人名词典%s开始编译DAT文件..." % JapanesePersonDictionary.path)
    Predefine.logger.info(
        "日本人名词典%s编译结果:%s"
        % (JapanesePersonDictionary.path, str(self.saveDat(entries))))
    return True
def loadMainDictionary(mainPath):
    """Load the main custom dictionary, preferring the cached binary form.

    :param mainPath: path of the main dictionary; used for the cache lookup
        and in log messages.
    :return: True when the cached form could be loaded.
    """
    Predefine.logger.info("自定义词典开始加载:%s" % mainPath)
    print("自定义词典开始加载:%s" % mainPath)
    if CustomDictionary.loadDat(mainPath):
        return True

    # No usable cache: start over and read every text dictionary.
    CustomDictionary.dat = DoubleArrayTrie()
    entries = TreeMap({})
    customNatureCollector = set()
    try:
        for p in CustomDictionary.path:
            defaultNature = Nature.n
            Predefine.logger.info(
                "以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
            print("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
            success, entries = CustomDictionary.loadtxt(
                p, defaultNature, entries, customNatureCollector)
            if not success:
                Predefine.logger.warning("失败:%s" % p)
    except IOError as e:
        Predefine.logger.error("自定义词典%s不存在或读取错误!%s" % (mainPath, e))
    # NOTE(review): nothing is returned after the text dictionaries are
    # read, so this path yields None -- confirm whether the trie build /
    # cache step is missing here.
class CoreDictionary(object):
    """Core dictionary implemented on top of a DoubleArrayTrie."""

    class Attribute(object):
        """Natures (part-of-speech tags) and frequencies of one entry."""

        def __init__(self):
            # Nature list.
            self.nature = [Nature]
            # Frequency of each nature, index-aligned with ``self.nature``.
            self.frequency = [int()]
            self.totalFrequency = int()
            self.logger = Predefine.logger

        def init1(self, size):
            """Allocate room for ``size`` natures and return ``self``."""
            self.nature = [Nature] * int(size)
            self.frequency = [int()] * int(size)
            return self

        def init2(self, nature, frequency):
            """Adopt the given nature and frequency lists; return ``self``."""
            self.nature = nature
            self.frequency = frequency
            return self

        def init3(self, nature, frequency):
            """Set up a single nature with its frequency; return ``self``."""
            self.init1(1)
            self.nature[0] = nature
            self.frequency[0] = frequency
            self.totalFrequency = frequency
            return self

        def init4(self, nature, frequency, totalFrequency):
            """Adopt lists plus an explicit total frequency; return ``self``.

            Returning ``self`` keeps this initialiser chainable like the
            other ``initN`` methods.
            """
            self.nature = nature
            self.frequency = frequency
            self.totalFrequency = totalFrequency
            return self

        def init5(self, nature):
            """Set up a single nature with the default frequency 1000.

            :param nature: the nature to store
            :return: ``self``
            """
            return self.init3(nature, 1000)

        def create(self, natureWithFrequency):
            """Build an attribute from a ``"nature freq nature freq ..."`` string.

            :param natureWithFrequency: space-separated nature/frequency
                pairs
            :return: the new attribute, or None when parsing fails
            """
            try:
                param = natureWithFrequency.strip().split(' ')
                # Floor division: ``/`` yields a float on Python 3, which
                # made range() raise and every call return None.
                natureCount = len(param) // 2
                attribute = CoreDictionary.Attribute().init1(natureCount)
                for i in range(natureCount):
                    attribute.nature[i] = Nature.valueOf(param[2 * i])
                    attribute.frequency[i] = int(param[1 + 2 * i])
                    attribute.totalFrequency += attribute.frequency[i]
                return attribute
            except Exception:
                self.logger.warning(
                    "使用字符串" + natureWithFrequency + "创建词条属性失败!")
                return None

        def bcreate(self, byteArray, natureIndexArray):
            """Read one attribute from a byte stream.

            :param byteArray: stream exposing ``nextInt()``
            :param natureIndexArray: sequence mapping a stored index to a
                nature
            :return: the attribute that was read
            """
            currentTotalFrequency = byteArray.nextInt()
            length = byteArray.nextInt()
            attribute = CoreDictionary.Attribute().init1(length)
            attribute.totalFrequency = currentTotalFrequency
            for i in range(length):
                attribute.nature[i] = natureIndexArray[byteArray.nextInt()]
                attribute.frequency[i] = byteArray.nextInt()
            return attribute

        def getNatureFrequency(self, nature):
            """Return the frequency recorded for ``nature``, or 0 if absent.

            :param nature: the nature to look up
            :return: its frequency
            """
            for i, pos in enumerate(self.nature):
                if nature == pos:
                    return self.frequency[i]
            return 0

        def hasNature(self, nature):
            """Tell whether ``nature`` is present with a positive frequency.

            :param nature: the nature to test
            :return: boolean
            """
            return self.getNatureFrequency(nature) > 0

        def hasNatureStartsWith(self, prefix):
            """Tell whether any nature starts with ``prefix``.

            :param prefix: nature prefix, e.g. ``u`` to match ude, uzhe, ...
            :return: boolean
            """
            # NOTE(review): ``startsWith`` is not a str method; presumably
            # it is provided by Nature -- confirm.
            return any(n.startsWith(prefix) for n in self.nature)

        def toString(self):
            """Render every nature/frequency pair as ``"nature freq "``.

            :return: the concatenated pairs
            """
            return "".join(
                "%s %s " % (str(self.nature[i]), str(self.frequency[i]))
                for i in range(len(self.nature)))

        def save(self, out):
            """Write the total frequency, the nature count and every pair.

            :param out: writable object exposing ``writelines``
            """
            out.writelines(Convert.convert(self.totalFrequency))
            out.writelines(Convert.convert(len(self.nature)))
            for i in range(len(self.nature)):
                out.writelines(Convert.convert(Nature.ordinal(self.nature[i])))
                out.writelines(Convert.convert(self.frequency[i]))

    trie = DoubleArrayTrie()
    attribute = Attribute()
    NR_WORD_ID = None
    NS_WORD_ID = None
    NT_WORD_ID = None
    T_WORD_ID = None
    X_WORD_ID = None
    M_WORD_ID = None
    NX_WORD_ID = None

    def __init__(self):
        """Load the dictionary; exit the process when loading fails."""
        self.path = Config.CoreDictionaryPath
        self.logger = Predefine.logger
        self.totalFrequency = 221894
        # Load the dictionary automatically.
        start = time()
        if not self.load(self.path):
            self.logger.error('核心词典%s加载失败' % self.path)
            sys.exit(1)
        end = time()
        # Report the real entry count; this used to log a literal 2.
        self.logger.info('%s加载成功%i个词条,耗时%fms'
                         % (self.path, self.trie.size1(), (end - start) * 1000))
        # Word IDs of a few special tags.
        CoreDictionary.NR_WORD_ID = CoreDictionary.getWordID(Predefine.TAG_PEOPLE)
        CoreDictionary.NS_WORD_ID = CoreDictionary.getWordID(Predefine.TAG_PLACE)
        CoreDictionary.NT_WORD_ID = CoreDictionary.getWordID(Predefine.TAG_GROUP)
        CoreDictionary.T_WORD_ID = CoreDictionary.getWordID(Predefine.TAG_TIME)
        CoreDictionary.X_WORD_ID = CoreDictionary.getWordID(Predefine.TAG_CLUSTER)
        CoreDictionary.M_WORD_ID = CoreDictionary.getWordID(Predefine.TAG_NUMBER)
        CoreDictionary.NX_WORD_ID = CoreDictionary.getWordID(Predefine.TAG_PROPER)

    @staticmethod
    def get(key):
        """Return the entry that ``trie.get2`` yields for ``key``.

        :param key: the key to look up
        :return: the entry
        """
        return CoreDictionary.trie.get2(key)

    # An instance-level ``get1(self, wordID)`` used to be defined here.  It
    # was overridden by the static ``get1`` below and could never be
    # called, so it has been removed.

    @staticmethod
    def getWordID(a):
        """Return the ID of a word.

        :param a: the word
        :return: the result of ``trie.exactMatchSearch``; per the original
            note, -1 when the word does not exist
        """
        return CoreDictionary.trie.exactMatchSearch(a)

    def load(self, path):
        """Load the dictionary from its cache or rebuild it from the text file.

        The text file holds one entry per line: the word, then tab-separated
        nature/frequency pairs.  After a rebuild the attributes and the trie
        are written to ``path + Predefine.BIN_EXT``.

        :param path: path of the text dictionary
        :return: True on success; False when the text file cannot be read
            or the cache cannot be written
        """
        self.logger.info("核心词典开始加载:%s" % path)
        print("核心词典开始加载:%s" % path)
        if self.loadDat(path):
            return True

        initdict = OrderedDict()
        try:
            MAX_FREQUENCY = 0
            start = time()
            # The context manager closes the file even if parsing fails.
            with open(path, 'r') as f:
                for raw in f:
                    line = raw.strip(' \n\t\r')
                    if not line:
                        # A blank line is skipped instead of ending the
                        # read early.
                        continue
                    param = line.split('\t')
                    natureCount = (len(param) - 1) // 2
                    attribute = CoreDictionary.Attribute().init1(natureCount)
                    for i in range(natureCount):
                        attribute.nature[i] = Nature.valueOf(param[1 + 2 * i])
                        attribute.frequency[i] = int(param[2 + 2 * i])
                        attribute.totalFrequency += attribute.frequency[i]
                    initdict[param[0]] = attribute
                    MAX_FREQUENCY += attribute.totalFrequency

            entries = TreeMap(initdict)
            self.logger.info("核心词典读入词条%i,全部频次%i,耗时%fms" % (
                entries.size(), MAX_FREQUENCY, (time() - start) * 1000))
            print("核心词典读入词条%i,全部频次%i,耗时%fms" % (
                entries.size(), MAX_FREQUENCY, (time() - start) * 1000))
            self.trie.build(entries)
            self.logger.info(
                "核心词典加载成功:%i个词条,下面将写入缓存" % self.trie.size1())
            print("核心词典加载成功:%i个词条,下面将写入缓存" % self.trie.size1())
            try:
                # Write next to ``path`` -- the location loadDat(path) reads
                # from -- rather than next to ``self.path``.
                with open(path + Predefine.BIN_EXT, 'w+') as out:
                    attributeList = entries.values()
                    out.writelines(Convert.convert(len(attributeList)))
                    for attribute in attributeList:
                        out.writelines(Convert.convert(attribute.totalFrequency))
                        out.writelines(Convert.convert(len(attribute.nature)))
                        for i in range(len(attribute.nature)):
                            out.writelines(Convert.convert(
                                Nature.ordinal(attribute.nature[i])))
                            out.writelines(Convert.convert(attribute.frequency[i]))
                    self.trie.save(out)
            except Exception as e:
                self.logger.warning("保存失败%s" % str(e))
                return False
        except IOError as e:
            self.logger.warning("核心词典%s不存在或读取错误!" % str(e))
            return False
        return True

    def loadDat(self, path):
        """Load the double array from disk.

        A pickled byte array at ``path + Predefine.PIC_EXT`` is preferred.
        Otherwise the byte array is created from
        ``path + Predefine.BIN_EXT`` and pickled for the next run.

        :param path: path prefix of the cache files
        :return: True when the trie was loaded, False otherwise
        """
        start = time()
        try:
            byteArray = None
            try:
                # NOTE: pickle.load can execute arbitrary code; the cache
                # file must come from a trusted location.
                with open(path + Predefine.PIC_EXT, 'rb') as cached:
                    byteArray = pickle.load(cached)
            except Exception:
                byteArray = None
            if byteArray is None:
                byteArray = ByteArray().createByteArray(path + Predefine.BIN_EXT)
                if byteArray is not None:
                    # Never pickle None: a cached None would make every
                    # later call fail without consulting the .bin file.
                    with open(path + Predefine.PIC_EXT, 'wb') as out:
                        pickle.dump(byteArray, out)
            if byteArray is None:
                return False

            size = byteArray.nextInt()
            # List that stores the Attribute objects.
            attributes = [None] * size
            natureIndexArray = list(Nature)
            for i in range(size):
                # First the total frequency, then the number of natures.
                currentTotalFrequency = byteArray.nextInt()
                length = byteArray.nextInt()
                attributes[i] = CoreDictionary.Attribute().init1(length)
                attributes[i].totalFrequency = currentTotalFrequency
                for j in range(length):
                    attributes[i].nature[j] = natureIndexArray[byteArray.nextInt()]
                    attributes[i].frequency[j] = byteArray.nextInt()
            if not self.trie.load(byteArray, attributes) or byteArray.hasMore():
                return False
        except Exception as e:
            self.logger.warning("读取失败,问题发生在%s" % (str(e)))
            return False
        print("核心词典加载成功%s,耗时%fms"
              % (path + Predefine.BIN_EXT, (time() - start) * 1000))
        return True

    @staticmethod
    def get1(key):
        """Return the entry that ``trie.get`` yields for ``key``.

        :param key: the key to look up
        :return: the entry
        """
        return CoreDictionary.trie.get(key)

    @staticmethod
    def get2(wordID):
        """Return the entry that ``trie.get`` yields for ``wordID``.

        :param wordID: the value passed to ``trie.get``
        :return: the entry
        """
        return CoreDictionary.trie.get(wordID)

    def getTermFrequency(self, term):
        """Return the total frequency of ``term``, or 0 when it is unknown.

        :param term: the word to look up
        :return: its total frequency
        """
        attribute = self.get1(term)
        if attribute is None:
            return 0
        return attribute.totalFrequency

    def contains(self, key):
        """Tell whether the dictionary contains ``key``.

        :param key: the word to test
        :return: boolean
        """
        return self.trie.get(key) is not None
def __init__(self):
    """Attach ``Predefine.logger`` and create an empty DoubleArrayTrie."""
    self.logger = Predefine.logger
    self.trie = DoubleArrayTrie()
class CommonDictionary(object, metaclass=ABCMeta):
    """Base class for dictionaries stored as a key trie plus a value array.

    Subclasses implement :meth:`onLoadValue` and :meth:`onSaveValue` to read
    and write the value array.
    """

    def __init__(self):
        self.trie = DoubleArrayTrie()
        self.logger = Predefine.logger

    def load(self, path):
        """Load the dictionary at ``path``.

        The values come from :meth:`onLoadValue`.  The keys come from the
        cached ``path + '.trie.dat'`` when it loads.  Otherwise the first
        space-separated field of every line of ``path`` is used, the trie is
        rebuilt and it is saved to ``path + '.trie.dat'``.

        :param path: path of the text dictionary
        :return: False when the values cannot be loaded, True otherwise
        """
        start = time()
        valueArray = self.onLoadValue(path)
        if valueArray is None:
            self.logger.warning("加载值%s.value.dat失败,耗时%fms"
                                % (path, (time() - start) * 1000))
            return False
        self.logger.info("加载值%s.value.dat成功,耗时%fms"
                         % (path, (time() - start) * 1000))
        print("加载值%s.value.dat成功,耗时%fms" % (path, (time() - start) * 1000))

        start = time()
        if self.loadDat(path + '.trie.dat', valueArray):
            self.logger.info("加载键%s.trie.dat成功,耗时%fms"
                             % (path, (time() - start) * 1000))
            print("加载键%s.trie.dat成功,耗时%fms" % (path, (time() - start) * 1000))
            return True

        keyList = []
        try:
            # Decode as UTF-8 on open and keep every line as str.  The
            # previous ``.encode('utf-8').strip(' \n\t\r')`` raised
            # TypeError on Python 3 (bytes stripped with a str argument),
            # which the handler below swallowed, leaving keyList empty.
            with open(path, 'r', encoding='utf-8') as br:
                for raw in br:
                    line = raw.strip(' \n\t\r')
                    if not line:
                        # A blank line is skipped instead of ending the
                        # read early.
                        continue
                    keyList.append(line.split(' ')[0])
        except Exception as e:
            self.logger.warning("读取%s失败%s" % (path, str(e)))

        resultcode = self.trie.kvbuild(keyList, valueArray)
        if resultcode != 0:
            self.logger.warning("trie建立失败%i,正在尝试排序后重载" % resultcode)
            initdict = {}
            for i, value in enumerate(valueArray):
                initdict[keyList[i]] = value
            # NOTE(review): relies on TreeMap.sort() returning the sorted
            # map rather than None -- confirm.
            sortedMap = TreeMap(initdict).sort()
            self.trie.build(sortedMap)
            # Reorder the values so they line up with the sorted keys.
            for i, v in enumerate(sortedMap.values()):
                valueArray[i] = v
        self.trie.save(path + '.trie.dat')
        self.logger.info(path + "加载成功")
        return True

    @abstractmethod
    def onLoadValue(self, path):
        """Implement this method to load the values.

        :param path: path of the dictionary
        :return: the value array, or None on failure (see :meth:`load`)
        """
        pass

    @abstractmethod
    def onSaveValue(self, valueArray, path):
        """Implement this method to save the values.

        :param valueArray: the values to save
        :param path: path of the dictionary
        :return:
        """
        pass

    def loadDat(self, path, valueArray):
        """Load the key trie from ``path`` and bind ``valueArray`` to it.

        :param path: path of the trie file
        :param valueArray: the values passed to ``trie.load1``
        :return: True on success, False otherwise
        """
        if self.trie.load1(path, valueArray):
            return True
        return False

    def get(self, key):
        """Look up a word.

        :param key: the word to look up
        :return: the entry that ``trie.get2`` yields for the word
        """
        return self.trie.get2(key)
class CustomDictionary(object):
    """User-defined dictionary: a double-array trie plus a dynamic binary trie.

    Instantiating the class triggers :meth:`load`.  Lookups go through
    :meth:`get`, which consults the double-array trie first and the binary
    trie second.
    """

    # Binary trie that stores entries inserted dynamically by the user.
    trie = BinTrie()
    dat = DoubleArrayTrie()
    # The first path is the main dictionary, the others are secondary ones.
    path = Config.CustomDictionaryPath
    start = time()

    def __init__(self):
        CustomDictionary.load()

    @staticmethod
    def load():
        """Load the custom dictionary and log how long it took.

        :return: True when the main dictionary was loaded.
        """
        start = time()
        if not CustomDictionary.loadMainDictionary(CustomDictionary.path[0]):
            # NOTE(review): the failure path returns None (falsy), not False.
            Predefine.logger.warning(
                "自定义词典%s加载失败" % (" ".join(CustomDictionary.path)))
        else:
            Predefine.logger.info(
                "自定义词典加载成功:%i个词条,耗时%fms"
                % (CustomDictionary.dat.size, (time() - start) * 1000))
            print("自定义词典%s加载成功:%i个词条,耗时%fms" % (
                " ".join(CustomDictionary.path),
                CustomDictionary.dat.size1(),
                (time() - start) * 1000))
            return True

    @staticmethod
    def loadMainDictionary(mainPath):
        """Load the custom dictionary, rebuilding and caching it when needed.

        The cached form is tried first via :meth:`loadDat`.  If that fails,
        every path in ``CustomDictionary.path`` is read with
        :meth:`loadtxt`, the double-array trie is rebuilt and the result is
        written to ``mainPath + Predefine.BIN_EXT``.

        :param mainPath: path of the main dictionary; also the prefix of
            the cache file
        :return: True
        """
        Predefine.logger.info("自定义词典开始加载:%s" % mainPath)
        print("自定义词典开始加载:%s" % mainPath)
        if CustomDictionary.loadDat(mainPath):
            return True

        CustomDictionary.dat = DoubleArrayTrie()
        entries = TreeMap({})
        customNatureCollector = set()
        try:
            for p in CustomDictionary.path:
                defaultNature = Nature.n
                Predefine.logger.info(
                    "以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
                print("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
                success, entries = CustomDictionary.loadtxt(
                    p, defaultNature, entries, customNatureCollector)
                if not success:
                    Predefine.logger.warning("失败:%s" % p)
        except IOError as e:
            Predefine.logger.error("自定义词典%s不存在或读取错误!%s" % (mainPath, e))
        except Exception as e:
            Predefine.logger.error("自定义词典%s缓存失败!%s\n" % (mainPath, e))

        if entries.size() == 0:
            Predefine.logger.warning("没有加载到任何词条")
            # Blank placeholder so the trie is never built from an empty map.
            entries.put(Predefine.TAG_OTHER, None)

        Predefine.logger.info("正在构建DoubleArrayTrie……")
        CustomDictionary.dat.build(entries)

        # Cache as a dat file so the next load is much faster.
        Predefine.logger.info("正在缓存词典为dat文件……")
        attributeList = [value for _key, value in entries.items()]
        # The context manager closes the cache file even when a write fails.
        with open(mainPath + Predefine.BIN_EXT, 'w+') as out:
            # The collected custom natures are not written to the cache.
            out.writelines(Convert.convert(len(attributeList)))
            for attribute in attributeList:
                if attribute is None:
                    # The placeholder above has no attribute object; calling
                    # .save() on it raised AttributeError.  Write an empty
                    # record -- total frequency 0 and 0 natures -- which is
                    # the layout loadDat reads back.
                    out.writelines(Convert.convert(0))
                    out.writelines(Convert.convert(0))
                else:
                    attribute.save(out)
            CustomDictionary.dat.save1(out)
        return True

    @staticmethod
    def loadtxt(path, defaultNature, map, customNatureCollector):
        """Load a user dictionary and append it to the entries in ``map``.

        Each line holds a word followed by optional space-separated
        nature/frequency pairs.  A word without pairs gets
        ``defaultNature``.

        :param path: dictionary path
        :param defaultNature: default nature
        :param map: entries collected so far; left untouched on failure
        :param customNatureCollector: collects user natures (not used by
            this method)
        :return: ``(True, merged map)`` on success, ``(False, map)`` on
            failure
        """
        try:
            # Seed with the entries collected so far.  Previously the
            # result held only this file's entries, so each dictionary
            # replaced the ones read before it instead of being appended.
            # NOTE(review): assumes TreeMap.items() yields (key, value)
            # pairs, as the cache-writing loop in loadMainDictionary
            # already relies on.
            initdict = OrderedDict(map.items())
            with open(path, 'r', encoding='utf-8') as br:
                for raw in br:
                    line = raw.strip()
                    if not line:
                        # A blank line is skipped instead of ending the
                        # read early.
                        continue
                    param = line.split(" ")
                    natureCount = (len(param) - 1) // 2
                    if natureCount == 0:
                        attribute = CoreDictionary.Attribute().init5(defaultNature)
                    else:
                        attribute = CoreDictionary.Attribute().init1(natureCount)
                        for i in range(natureCount):
                            attribute.nature[i] = Nature.valueOf(param[1 + 2 * i])
                            attribute.frequency[i] = int(param[2 + 2 * i])
                            attribute.totalFrequency += attribute.frequency[i]
                    initdict[param[0]] = attribute
            merged = TreeMap(initdict)
        except Exception as e:
            Predefine.logger.warning("自定义词典%s读取错误%s" % (path, e))
            return False, map
        return True, merged

    @staticmethod
    def loadDat(path):
        """Load the double array from disk.

        A pickled byte array at ``path + Predefine.PIC_EXT`` is preferred.
        Otherwise the byte array is created from
        ``path + Predefine.BIN_EXT`` and pickled for the next run.

        :param path: path prefix of the cache files
        :return: True when the trie was loaded, False otherwise
        """
        byteArray = None
        try:
            # NOTE: pickle.load can execute arbitrary code; the cache file
            # must come from a trusted location.
            with open(path + Predefine.PIC_EXT, 'rb') as cached:
                byteArray = pickle.load(cached)
        except Exception:
            byteArray = None
        if byteArray is None:
            byteArray = ByteArray.createByteArray(path + Predefine.BIN_EXT)
            if byteArray is not None:
                # Never pickle None: a cached None would make every later
                # call fail without consulting the .bin file.
                with open(path + Predefine.PIC_EXT, 'wb') as out:
                    pickle.dump(byteArray, out)
        if byteArray is None:
            return False

        size = byteArray.nextInt()
        # Compatibility note from the original: a negative size means the
        # file header stores -size user natures.  That case is not handled;
        # a negative size simply yields an empty attribute list.
        attributes = [None] * size
        natureIndexArray = list(Nature)
        for i in range(size):
            # First the total frequency, then the number of natures.
            currentTotalFrequency = byteArray.nextInt()
            length = byteArray.nextInt()
            attributes[i] = CoreDictionary.Attribute().init1(length)
            attributes[i].totalFrequency = currentTotalFrequency
            for j in range(length):
                attributes[i].nature[j] = natureIndexArray[byteArray.nextInt()]
                attributes[i].frequency[j] = byteArray.nextInt()
        if not CustomDictionary.dat.load(byteArray, attributes):
            return False
        return True

    @staticmethod
    def get(key):
        """Look up ``key`` in the double-array trie, then in the binary trie.

        :param key: the word to look up
        :return: the attribute found, or None
        """
        attribute = CustomDictionary.dat.get(key)
        if attribute is not None:
            return attribute
        if CustomDictionary.trie is None:
            return None
        return CustomDictionary.trie.get(key)