def loadMainDictionary(mainPath):
    """Load the custom dictionary and cache it as a binary file.

    If ``CustomDictionary.loadDat(mainPath)`` succeeds, nothing else is done.
    Otherwise every path in ``CustomDictionary.path`` is parsed with
    ``CustomDictionary.loadtxt`` (default nature ``Nature.n``), a
    ``DoubleArrayTrie`` is built from the collected entries and stored in
    ``CustomDictionary.dat``, and the values plus the trie are written to
    ``mainPath + Predefine.BIN_EXT``.

    Errors raised while reading the text dictionaries are logged and
    swallowed; errors raised while building or caching propagate.

    :param mainPath: path of the main dictionary; also the prefix of the
        cache file name.
    :return: ``True`` (both when the cache was loaded and after a rebuild).
    """
    Predefine.logger.info("自定义词典开始加载:%s" % mainPath)
    print("自定义词典开始加载:%s" % mainPath)
    if CustomDictionary.loadDat(mainPath):
        return True

    CustomDictionary.dat = DoubleArrayTrie()
    word_map = TreeMap({})
    customNatureCollector = set()
    try:
        for p in CustomDictionary.path:
            defaultNature = Nature.n
            Predefine.logger.info(
                "以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
            print("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
            # loadtxt hands back the (possibly replaced) map, so rebind it.
            success, word_map = CustomDictionary.loadtxt(
                p, defaultNature, word_map, customNatureCollector)
            if not success:
                Predefine.logger.warning("失败:%s" % p)
    except IOError as e:
        Predefine.logger.error("自定义词典%s不存在或读取错误!%s" % (mainPath, e))
    except Exception as e:
        Predefine.logger.error("自定义词典%s缓存失败!%s\n" % (mainPath, e))

    if word_map.size() == 0:
        Predefine.logger.warning("没有加载到任何词条")
        # Insert a blank placeholder entry so the map is not empty.
        # NOTE(review): the placeholder value is None, so attribute.save(out)
        # below raises AttributeError on this path -- confirm intended handling.
        word_map.put(Predefine.TAG_OTHER, None)

    Predefine.logger.info("正在构建DoubleArrayTrie……")
    CustomDictionary.dat.build(word_map)

    # Cache as a dat file so the next load is much faster.
    Predefine.logger.info("正在缓存词典为dat文件……")
    # Values are cached in map iteration order.
    attributeList = [value for _, value in word_map.items()]

    # The context manager closes the cache file even if a save call raises.
    with open(mainPath + Predefine.BIN_EXT, 'w+') as out:
        # NOTE: custom natures are collected above but not written here; the
        # IOUtil.writeCustomNature(out, customNatureCollector) call is disabled.
        # Cache body: entry count, then each attribute, then the trie.
        out.writelines(Convert.convert(len(attributeList)))
        for attribute in attributeList:
            attribute.save(out)
        CustomDictionary.dat.save1(out)
    return True
def load(self, path):
    """Load the bigram table from its cache or from the text dictionary.

    The cache at ``CoreBiGramTableDictionary.datPath`` is tried first via
    ``self.loadDat``. Otherwise ``path`` is parsed line by line; each line is
    expected to look like ``A@B freq``. Pairs whose words are not found in
    ``CoreDictionary.trie`` are skipped. The result is stored in
    ``CoreBiGramTableDictionary.start`` (per-word offsets) and
    ``CoreBiGramTableDictionary.pair`` (flattened ``idB, freq`` pairs).

    :param path: path of the text bigram dictionary.
    :return: ``True`` on success, ``False`` if the file cannot be read.
    """
    if self.loadDat(CoreBiGramTableDictionary.datPath):
        return True

    bigram_map = TreeMap({})
    try:
        total = 0
        maxWordId = CoreDictionary.trie.size1()
        # The context manager closes the file even if parsing raises.
        with open(path, 'r') as br:
            for raw_line in br:
                line = raw_line.strip("\n\r\t ")
                if not line:
                    # Skip blank lines instead of treating them as end of file.
                    continue
                params = line.split(' ')
                twoWord = params[0].split("@")
                idA = CoreDictionary.trie.exactMatchSearch(twoWord[0])
                if idA == -1:
                    continue
                idB = CoreDictionary.trie.exactMatchSearch(twoWord[1])
                if idB == -1:
                    continue
                freq = int(params[1])
                biMap = bigram_map.get(idA)
                if biMap is None:
                    biMap = TreeMap({})
                biMap.put(int(idB), freq)
                bigram_map.put(int(idA), biMap)
                # Each accepted line reserves two slots in the pair array.
                total += 2

        for k, v in bigram_map.items():
            bigram_map.put(k, v.sort_long())
        bigram_map.sort_long()

        CoreBiGramTableDictionary.start = [0] * (maxWordId + 1)
        # total is the number of accepted pairs * 2.
        CoreBiGramTableDictionary.pair = [0] * total
        offset = 0
        for i in range(maxWordId):
            bMap = bigram_map.get(i)
            if bMap is not None:
                for k, v in bMap.items():
                    index = offset << 1
                    CoreBiGramTableDictionary.pair[index] = k
                    CoreBiGramTableDictionary.pair[index + 1] = v
                    offset += 1
            # start[i + 1] is the pair count for all word ids <= i.
            CoreBiGramTableDictionary.start[i + 1] = offset
        self.logger.info("二元词典读取完毕:%s" % path)
    except IOError as e:
        self.logger.error("二元词典%s不存在或读取错误!%s" % (path, e))
        return False
    return True