def generate():
    """Build the character-type range table, write it to disk and reload it.

    Code points 0..65534 are classified with ``TextUtility.charType``.
    Consecutive code points that share a type are merged into one
    ``[first, last, type]`` record. The records are written to
    ``Config.CharTypePath`` and the file is then loaded back through
    ``ByteArray.createByteArray``.

    :return: the value returned by
        ``ByteArray.createByteArray(Config.CharTypePath)``
    """
    preType = 5
    preChar = 0
    typeList = []
    for i in range(65535):
        charType = TextUtility.charType(i)
        if charType != preType:
            # Close the run that ended at i - 1 and start a new one at i.
            typeList.append([preChar, i - 1, preType])
            preChar = i
            # The run type must follow the current character; without this
            # every record would carry the initial type forever.
            preType = charType
    # The last run is closed explicitly and extends to 65535, although
    # 65535 itself is never classified by the loop above.
    typeList.append([preChar, 65535, preType])

    # The context manager closes the file even when a write fails.
    with open(Config.CharTypePath, 'w+') as out:
        for first, last, kind in typeList:
            out.writelines(Convert.convert_char(first))
            out.writelines(Convert.convert_char(last))
            out.writelines(Convert.convert_byte(kind))

    return ByteArray.createByteArray(Config.CharTypePath)
def save(self, path):
    """Write ``self.size`` followed by every base/check pair to ``path``.

    :param path: destination file path; the file is created or truncated
    :return: True when everything was written, False if any error occurred
    """
    try:
        # The context manager closes the file even when a write fails.
        with open(path, 'w+') as out:
            out.writelines(Convert.convert(self.size))
            for i in range(self.size):
                out.writelines(Convert.convert(self.base[i]))
                out.writelines(Convert.convert(self.check[i]))
    except Exception:
        # Deliberate best-effort: callers only see the boolean outcome.
        return False
    return True
def load(self, path):
    """Load the core dictionary from ``path`` and write a binary cache.

    If ``self.loadDat(path)`` succeeds nothing else is done. Otherwise the
    text file is parsed line by line. Each line is tab-separated: a word
    followed by alternating nature / frequency columns. ``self.trie`` is
    then built from the parsed entries, and the attribute values plus the
    trie are written to ``self.path + Predefine.BIN_EXT``.

    :param path: path of the text dictionary
    :return: True on success (including a ``loadDat`` hit); False when the
        text file cannot be read or the cache cannot be written
    """
    self.logger.info("核心词典开始加载:%s" % path)
    print("核心词典开始加载:%s" % path)
    if self.loadDat(path):
        return True

    initdict = OrderedDict()
    try:
        MAX_FREQUENCY = 0
        # The context manager guarantees the input file is closed.
        with open(path, 'r') as f:
            start = time()
            for raw in f:
                line = raw.strip(' \n\t\r')
                if not line:
                    # Parsing stops at the first blank line, as before.
                    break
                param = line.split('\t')
                natureCount = (len(param) - 1) // 2
                attribute = CoreDictionary.Attribute().init1(natureCount)
                for i in range(natureCount):
                    attribute.nature[i] = Nature.valueOf(param[1 + 2 * i])
                    attribute.frequency[i] = int(param[2 + 2 * i])
                    attribute.totalFrequency += attribute.frequency[i]
                initdict[param[0]] = attribute
                MAX_FREQUENCY += attribute.totalFrequency

        entries = TreeMap(initdict)
        self.logger.info("核心词典读入词条%i,全部频次%i,耗时%fms" % (entries.size(), MAX_FREQUENCY, (time() - start) * 1000))
        print ("核心词典读入词条%i,全部频次%i,耗时%fms" % (entries.size(), MAX_FREQUENCY, (time() - start) * 1000))
        self.trie.build(entries)
        self.logger.info("核心词典加载成功:%i个词条,下面将写入缓存" % self.trie.size1())
        print ("核心词典加载成功:%i个词条,下面将写入缓存" % self.trie.size1())

        try:
            # NOTE(review): the cache goes to self.path, not to the ``path``
            # argument - confirm that is intended.
            with open(self.path + Predefine.BIN_EXT, 'w+') as out:
                attributeList = entries.values()
                out.writelines(Convert.convert(len(attributeList)))
                for attribute in attributeList:
                    out.writelines(Convert.convert(attribute.totalFrequency))
                    out.writelines(Convert.convert(len(attribute.nature)))
                    for i in range(len(attribute.nature)):
                        out.writelines(Convert.convert(Nature.ordinal(attribute.nature[i])))
                        out.writelines(Convert.convert(attribute.frequency[i]))
                # NOTE(review): an open file object is passed here; verify
                # that trie.save accepts a stream and not a path.
                self.trie.save(out)
        except Exception as e:
            self.logger.warning("保存失败%s" % str(e))
            return False
    except IOError as e:
        self.logger.warning("核心词典%s不存在或读取错误!" % str(e))
        return False
    return True
def saveDat(self, map):
    """Save the value file and the trie of the Japanese person dictionary.

    The entry count and the converted ``ord()`` of every value in ``map``
    are written to ``JapanesePersonDictionary.path + Predefine.VALUE_EXT``.
    The trie is then saved to
    ``JapanesePersonDictionary.path + Predefine.TRIE_EXT``.

    :param map: mapping providing ``size()`` and ``items()``; each value is
        passed to ``ord()``
    :return: the result of ``JapanesePersonDictionary.trie.save``
    """
    value_path = JapanesePersonDictionary.path + Predefine.VALUE_EXT
    # The context manager closes the file even when a write fails.
    with open(value_path, 'w+') as out:
        out.writelines(Convert.convert(map.size()))
        for _key, value in map.items():
            out.writelines(Convert.convert_char(ord(value)))
    return JapanesePersonDictionary.trie.save(JapanesePersonDictionary.path + Predefine.TRIE_EXT)
def save1(self, out):
    """Write ``self.size`` and every base/check pair to an open stream.

    The stream is neither opened nor closed here; that is left to the caller.

    :param out: writable object exposing ``writelines``
    :return: True when everything was written, False if any error occurred
    """
    try:
        out.writelines(Convert.convert(self.size))
        for i in range(self.size):
            out.writelines(Convert.convert(self.base[i]))
            out.writelines(Convert.convert(self.check[i]))
    except Exception:
        # Deliberate best-effort: callers only see the boolean outcome.
        return False
    return True
def saveDat(self, path, valueArray):
    """Write the NR tag/frequency table to ``path``.

    Layout: the number of items, then for each item its entry count
    followed by one (NR ordinal, integer frequency) pair per entry.

    :param path: destination file path; the file is created or truncated
    :param valueArray: sequence of items; each item is a sequence of
        entries where ``entry[0]`` is an NR name and ``entry[1]`` a count
    :return: True on success, False if anything failed (a warning is logged)
    """
    try:
        # open() replaces the Python-2-only file() builtin; the context
        # manager closes the file even when a write fails.
        with open(path, 'w+') as out:
            out.writelines(Convert.convert(len(valueArray)))
            for item in valueArray:
                out.writelines(Convert.convert(len(item)))
                for entry in item:
                    out.writelines(Convert.convert(NR.ordinal(NR.valueOf(entry[0]))))
                    out.writelines(Convert.convert(int(entry[1])))
    except Exception as e:
        self.logger.warning("保存失败%s" % str(e))
        return False
    return True
def saveDat(self, path, valueArray):
    """Write the NS tag/frequency table to ``path``.

    Layout: the number of items, then for each item its entry count
    followed by one (NS ordinal, integer frequency) pair per entry.

    :param path: destination file path; the file is created or truncated
    :param valueArray: sequence of items; each item is a sequence of
        entries where ``entry[0]`` is an NS name and ``entry[1]`` a count
    :return: True on success, False if anything failed (a warning is logged)
    """
    try:
        # The context manager closes the file even when a write fails.
        with open(path, 'w+', encoding='utf-8') as out:
            out.writelines(Convert.convert(len(valueArray)))
            for item in valueArray:
                out.writelines(Convert.convert(len(item)))
                for entry in item:
                    out.writelines(Convert.convert(NS.ordinal(NS.valueOf(entry[0]))))
                    out.writelines(Convert.convert(int(entry[1])))
    except Exception as e:
        self.logger.warning("保存失败%s" % str(e))
        return False
    return True
def loadMainDictionary(mainPath):
    """Load the custom dictionary and cache it next to ``mainPath``.

    If ``CustomDictionary.loadDat(mainPath)`` succeeds nothing else is
    done. Otherwise every path in ``CustomDictionary.path`` is loaded with
    ``CustomDictionary.loadtxt``. A ``DoubleArrayTrie`` is built from the
    collected entries, and the values plus the trie are written to
    ``mainPath + Predefine.BIN_EXT``.

    :param mainPath: path of the main dictionary; also the cache file prefix
    :return: True
    """
    Predefine.logger.info("自定义词典开始加载:%s" % mainPath)
    print("自定义词典开始加载:%s" % mainPath)
    if CustomDictionary.loadDat(mainPath):
        return True

    CustomDictionary.dat = DoubleArrayTrie()
    map = TreeMap({})
    customNatureCollector = set()
    try:
        for p in CustomDictionary.path:
            defaultNature = Nature.n
            Predefine.logger.info("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
            print("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
            success, map = CustomDictionary.loadtxt(
                p, defaultNature, map, customNatureCollector)
            if not success:
                Predefine.logger.warning("失败:%s" % p)
    # Loading errors are logged but not re-raised: whatever was collected
    # so far is still built and cached below.
    except IOError as e:
        Predefine.logger.error("自定义词典%s不存在或读取错误!%s" % (mainPath, e))
    except Exception as e:
        Predefine.logger.error("自定义词典%s缓存失败!%s\n" % (mainPath, e))

    if map.size() == 0:
        Predefine.logger.warning("没有加载到任何词条")
        # Insert a blank placeholder entry.
        # NOTE(review): the None value reaches attribute.save(out) below,
        # which would fail on None - confirm the intended placeholder.
        map.put(Predefine.TAG_OTHER, None)

    Predefine.logger.info("正在构建DoubleArrayTrie……")
    CustomDictionary.dat.build(map)

    # Cache everything as a dat file so the next load is much faster.
    Predefine.logger.info("正在缓存词典为dat文件……")
    # Values to cache, in the map's iteration order.
    attributeList = [value for _key, value in map.items()]
    # The context manager closes the cache file even when a write fails.
    with open(mainPath + Predefine.BIN_EXT, 'w+') as out:
        # Caching the collected custom natures
        # (IOUtil.writeCustomNature(out, customNatureCollector)) is disabled.
        # Cache the body: entry count, every attribute, then the trie.
        out.writelines(Convert.convert(len(attributeList)))
        for attribute in attributeList:
            attribute.save(out)
        CustomDictionary.dat.save1(out)
    return True
Predefine.logger.warning("没有加载到任何词条") # 当做空白占位符 map.put(Predefine.TAG_OTHER, None) Predefine.logger.info("正在构建DoubleArrayTrie……") CustomDictionary.dat.build(map) # 缓存成dat文件,下次加载会快很多 Predefine.logger.info("正在缓存词典为dat文件……") # 缓存值文件 attributeList = [] for key, value in map.items(): attributeList.append(value) out = file(mainPath + Predefine.BIN_EXT, 'w+') # 缓存用户词性 # IOUtil.writeCustomNature(out, customNatureCollector) # 缓存正文 out.writelines(Convert.convert(len(attributeList))) for attribute in attributeList: attribute.save(out) CustomDictionary.dat.save1(out) out.close() return True @staticmethod def loadtxt(path, defaultNature, map, customNatureCollector): """ 加载用户词典(追加) :param path: 词典路径 :param defaultNature: 默认词性 :param map: :param customNatureCollector: 收集用户词性
def save(self, out):
    """Serialise this attribute to an already opened stream.

    Written in order: the total frequency, the number of natures, and then
    for each nature its ordinal followed by the matching frequency.

    :param out: writable object exposing ``writelines``
    """
    def emit(value):
        # Every field goes through the same converter before being written.
        out.writelines(Convert.convert(value))

    emit(self.totalFrequency)
    emit(len(self.nature))
    index = 0
    while index < len(self.nature):
        emit(Nature.ordinal(self.nature[index]))
        emit(self.frequency[index])
        index += 1