예제 #1
0
    def loadMainDictionary(mainPath):
        """Load the custom dictionary, rebuilding and caching it if needed.

        ``CustomDictionary.loadDat(mainPath)`` is tried first; when it
        succeeds nothing else happens. Otherwise every entry of
        ``CustomDictionary.path`` is read through ``CustomDictionary.loadtxt``
        into one ordered map, a ``DoubleArrayTrie`` is built from that map and
        stored in ``CustomDictionary.dat``, and the result is written to
        ``mainPath + Predefine.BIN_EXT``.

        Args:
            mainPath: Passed to ``loadDat``, used in log messages, and used as
                the base name of the cache file. The text dictionaries that
                are actually read come from ``CustomDictionary.path``.

        Returns:
            True, both when the cache loaded and after a rebuild. Errors
            raised while reading the text dictionaries are logged, not
            re-raised.

        Raises:
            Whatever building the trie or writing the cache file raises; that
            part is not guarded.
        """
        Predefine.logger.info("自定义词典开始加载:%s" % mainPath)
        print("自定义词典开始加载:%s" % mainPath)
        if CustomDictionary.loadDat(mainPath):
            return True
        CustomDictionary.dat = DoubleArrayTrie()

        entries = TreeMap({})
        customNatureCollector = set()
        try:
            for p in CustomDictionary.path:
                defaultNature = Nature.n
                Predefine.logger.info("以默认词性[%s]加载自定义词典%s中……" %
                                      (str(defaultNature), p))
                print("以默认词性[%s]加载自定义词典%s中……" % (str(defaultNature), p))
                success, entries = CustomDictionary.loadtxt(
                    p, defaultNature, entries, customNatureCollector)
                if not success:
                    Predefine.logger.warning("失败:%s" % p)
        except IOError as e:
            Predefine.logger.error("自定义词典%s不存在或读取错误!%s" % (mainPath, e))
        except Exception as e:
            # Best effort: keep whatever was loaded so far and continue.
            Predefine.logger.error("自定义词典%s缓存失败!%s\n" % (mainPath, e))

        if entries.size() == 0:
            Predefine.logger.warning("没有加载到任何词条")
            # Insert a blank placeholder so the trie is never built from an
            # empty map.
            # NOTE(review): the placeholder value is None, so the
            # `attribute.save(out)` call below would fail on it -- confirm how
            # the empty-dictionary case is meant to be cached.
            entries.put(Predefine.TAG_OTHER, None)

        Predefine.logger.info("正在构建DoubleArrayTrie……")
        CustomDictionary.dat.build(entries)

        # Cache as a dat file so the next load is much faster.
        Predefine.logger.info("正在缓存词典为dat文件……")
        # Values are written in the map's iteration order.
        attributeList = [value for _key, value in entries.items()]

        # `with` guarantees the cache file is closed even if a save call raises.
        with open(mainPath + Predefine.BIN_EXT, 'w+') as out:
            # Custom natures are collected above but not written to the cache;
            # the corresponding write call is disabled in this port.
            # Body: the value count, then each value, then the trie itself.
            out.writelines(Convert.convert(len(attributeList)))
            for attribute in attributeList:
                attribute.save(out)
            CustomDictionary.dat.save1(out)

        return True
    def load(self, path):
        """Build the bigram table from the text dictionary at ``path``.

        ``self.loadDat(CoreBiGramTableDictionary.datPath)`` is tried first.
        Otherwise each line of ``path`` is parsed as ``"A@B freq"``; both
        words are looked up in ``CoreDictionary.trie`` and lines containing
        an unknown word are skipped. The result is stored in two flat lists:

        * ``CoreBiGramTableDictionary.pair`` holds consecutive
          ``(idB, freq)`` entries, grouped by ``idA``.
        * ``CoreBiGramTableDictionary.start[i + 1]`` is the number of pairs
          written for word ids ``0..i``.

        Args:
            path: Path of the text bigram dictionary.

        Returns:
            True if the cached table was loaded by ``loadDat``; False if the
            text dictionary could not be read (``IOError``). After a
            successful build the code shown here returns nothing explicitly.
        """
        if self.loadDat(CoreBiGramTableDictionary.datPath):
            return True

        # idA -> (idB -> frequency)
        table = TreeMap({})
        try:
            # Two slots (idB, freq) are reserved per accepted line.
            total = 0

            # `with` closes the file even when a malformed line raises.
            with open(path, 'r') as br:
                for raw in br:
                    line = raw.strip("\n\r\t ")
                    if not line:
                        # A blank line ends reading, exactly like end of file.
                        break

                    params = line.split(' ')
                    twoWord = params[0].split("@")

                    idA = CoreDictionary.trie.exactMatchSearch(twoWord[0])
                    if idA == -1:
                        continue
                    idB = CoreDictionary.trie.exactMatchSearch(twoWord[1])
                    if idB == -1:
                        continue

                    freq = int(params[1])
                    biMap = table.get(idA)
                    if biMap is None:
                        biMap = TreeMap({})

                    biMap.put(int(idB), freq)
                    table.put(int(idA), biMap)

                    total += 2

            # Order the inner maps, then the outer one, by numeric key.
            for k, v in table.items():
                table.put(k, v.sort_long())

            table.sort_long()

            maxWordId = CoreDictionary.trie.size1()
            CoreBiGramTableDictionary.start = [0] * (maxWordId + 1)
            CoreBiGramTableDictionary.pair = [0] * total
            offset = 0
            for i in range(maxWordId):
                bMap = table.get(i)
                if bMap is not None:
                    for k, v in bMap.items():
                        index = offset << 1
                        CoreBiGramTableDictionary.pair[index] = k
                        CoreBiGramTableDictionary.pair[index + 1] = v
                        offset += 1
                CoreBiGramTableDictionary.start[i + 1] = offset

            self.logger.info("二元词典读取完毕:%s" % path)
        except IOError as e:
            self.logger.error("二元词典%s不存在或读取错误!%s" % (path, e))
            return False