def load(self, path): self.logger.info("核心词典开始加载:%s" % path) print("核心词典开始加载:%s" % path) if self.loadDat(path): return True initdict = OrderedDict() try: f = open(path, 'r') line = '' MAX_FREQUENCY = 0 start = time() while 1: line = f.readline().strip(' \n\t\r') if not line: break param = line.split('\t') natureCount = int((len(param) - 1) / 2) attribute = CoreDictionary.Attribute().init1(natureCount) for i in range(natureCount): attribute.nature[i] = Nature.valueOf(param[1 + 2 * i]) attribute.frequency[i] = int(param[2 + 2 * i]) attribute.totalFrequency += attribute.frequency[i] initdict[param[0]] = attribute MAX_FREQUENCY += attribute.totalFrequency map = TreeMap(initdict) self.logger.info("核心词典读入词条%i,全部频次%i,耗时%fms" % (map.size(), MAX_FREQUENCY, (time() - start) * 1000)) print ("核心词典读入词条%i,全部频次%i,耗时%fms" % (map.size(), MAX_FREQUENCY, (time() - start) * 1000)) self.trie.build(map) self.logger.info("核心词典加载成功:%i个词条,下面将写入缓存" % self.trie.size1()) print ("核心词典加载成功:%i个词条,下面将写入缓存" % self.trie.size1()) try: out = open(self.path + Predefine.BIN_EXT, 'w+') attributeList = map.values() out.writelines(Convert.convert(len(attributeList))) for attribute in attributeList: out.writelines(Convert.convert(attribute.totalFrequency)) out.writelines(Convert.convert(len(attribute.nature))) for i in range(len(attribute.nature)): out.writelines(Convert.convert(Nature.ordinal(attribute.nature[i]))) out.writelines(Convert.convert(attribute.frequency[i])) self.trie.save(out) out.close() except Exception as e: self.logger.warning("保存失败%s" % str(e)) return False except IOError as e: self.logger.warning("核心词典%s不存在或读取错误!" % str(e)) return False return True
def loadMainDictionary(mainPath): Predefine.logger.info("自定义词典开始加载:%s" % mainPath) print("自定义词典开始加载:%s" % mainPath) if CustomDictionary.loadDat(mainPath): return True CustomDictionary.dat = DoubleArrayTrie() map = TreeMap({}) customNatureCollector = set() try: for p in CustomDictionary.path: defaultNature = Nature.n Predefine.logger.info("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p)) print("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p)) success, map = CustomDictionary.loadtxt( p, defaultNature, map, customNatureCollector) if not success: Predefine.logger.warning("失败:%s" % p) except IOError as e: Predefine.logger.error("自定义词典%s不存在或读取错误!%s" % (mainPath, e)) except Exception as e: Predefine.logger.error("自定义词典%s缓存失败!%s\n" % (mainPath, e)) if map.size() == 0: Predefine.logger.warning("没有加载到任何词条") # 当做空白占位符 map.put(Predefine.TAG_OTHER, None) Predefine.logger.info("正在构建DoubleArrayTrie……") CustomDictionary.dat.build(map) # 缓存成dat文件,下次加载会快很多 Predefine.logger.info("正在缓存词典为dat文件……") # 缓存值文件 attributeList = [] for key, value in map.items(): attributeList.append(value) out = open(mainPath + Predefine.BIN_EXT, 'w+') # 缓存用户词性 #IOUtil.writeCustomNature(out, customNatureCollector) # 缓存正文 out.writelines(Convert.convert(len(attributeList))) for attribute in attributeList: attribute.save(out) CustomDictionary.dat.save1(out) out.close() return True