Esempio n. 1
0
    def load(self, path):
        """
        Load the core dictionary found at ``path``.

        ``self.loadDat(path)`` is tried first. When it fails, the text file is
        parsed line by line (tab-separated: the word, then nature/frequency
        pairs), the trie is built from the parsed entries and a cache is
        written to ``self.path + Predefine.BIN_EXT``.

        :param path: path of the text dictionary
        :return: True when the dictionary was loaded; False when reading
                 raises IOError or when writing the cache fails
        """
        startMessage = "核心词典开始加载:%s" % path
        self.logger.info(startMessage)
        print(startMessage)
        if self.loadDat(path):
            return True

        entries = OrderedDict()
        totalFrequency = 0
        try:
            # The context manager closes the text file even if parsing raises.
            with open(path, 'r') as f:
                start = time()
                for rawLine in f:
                    line = rawLine.strip(' \n\t\r')
                    if not line:
                        # A blank line stops the read, just like end of file.
                        break
                    param = line.split('\t')
                    # param[0] is the word; the rest are (nature, frequency) pairs.
                    natureCount = (len(param) - 1) // 2
                    attribute = CoreDictionary.Attribute().init1(natureCount)
                    for i in range(natureCount):
                        attribute.nature[i] = Nature.valueOf(param[1 + 2 * i])
                        attribute.frequency[i] = int(param[2 + 2 * i])
                        attribute.totalFrequency += attribute.frequency[i]
                    entries[param[0]] = attribute
                    totalFrequency += attribute.totalFrequency

            treeMap = TreeMap(entries)
            readMessage = "核心词典读入词条%i,全部频次%i,耗时%fms" % (
                treeMap.size(), totalFrequency, (time() - start) * 1000)
            self.logger.info(readMessage)
            print(readMessage)

            self.trie.build(treeMap)
            builtMessage = "核心词典加载成功:%i个词条,下面将写入缓存" % self.trie.size1()
            self.logger.info(builtMessage)
            print(builtMessage)

            try:
                # NOTE: the cache path is derived from self.path, not from the
                # ``path`` argument.
                with open(self.path + Predefine.BIN_EXT, 'w+') as out:
                    attributeList = treeMap.values()
                    out.writelines(Convert.convert(len(attributeList)))
                    for attribute in attributeList:
                        out.writelines(Convert.convert(attribute.totalFrequency))
                        out.writelines(Convert.convert(len(attribute.nature)))
                        for i, nature in enumerate(attribute.nature):
                            out.writelines(Convert.convert(Nature.ordinal(nature)))
                            out.writelines(Convert.convert(attribute.frequency[i]))

                    self.trie.save(out)
            except Exception as e:
                self.logger.warning("保存失败%s" % str(e))
                return False
        except IOError as e:
            self.logger.warning("核心词典%s不存在或读取错误!" % str(e))
            return False
        return True
Esempio n. 2
0
    def load(self, path):
        """
        Load the values and the key trie of the dictionary at ``path``.

        Values come from ``self.onLoadValue(path)``. If
        ``self.loadDat(path + '.trie.dat', valueArray)`` succeeds nothing else
        is done. Otherwise the first space-separated field of each line of
        ``path`` is collected as a key, the trie is built with
        ``self.trie.kvbuild`` (falling back to a sorted rebuild when that
        returns a non-zero code) and saved to ``path + '.trie.dat'``.

        :param path: dictionary path, also used as the cache file prefix
        :return: False when the values cannot be loaded, True otherwise
        """
        start = time()
        valueArray = self.onLoadValue(path)
        if valueArray is None:
            self.logger.warning("加载值%s.value.dat失败,耗时%fms" %
                                (path, (time() - start) * 1000))
            return False
        valueMessage = "加载值%s.value.dat成功,耗时%fms" % (
            path, (time() - start) * 1000)
        self.logger.info(valueMessage)
        print(valueMessage)

        start = time()
        if self.loadDat(path + '.trie.dat', valueArray):
            keyMessage = "加载键%s.trie.dat成功,耗时%fms" % (
                path, (time() - start) * 1000)
            self.logger.info(keyMessage)
            print(keyMessage)
            return True

        keyList = []
        try:
            # The context manager closes the file even if a line fails to parse.
            with open(path, 'r') as br:
                for rawLine in br:
                    # NOTE(review): encode() followed by strip(' \n\t\r') only
                    # works on a Python 2 str; on Python 3 bytes.strip() with a
                    # str argument raises TypeError, which the handler below
                    # would log. Confirm the target Python version.
                    line = rawLine.encode('utf-8').strip(' \n\t\r')
                    if not line:
                        # A blank line stops the read, just like end of file.
                        break
                    keyList.append(line.split(' ')[0])
        except Exception as e:
            # Best effort: the build below still runs with the keys read so far.
            self.logger.warning("读取%s失败%s" % (path, str(e)))
        resultcode = self.trie.kvbuild(keyList, valueArray)

        if resultcode != 0:
            self.logger.warning("trie建立失败%i,正在尝试排序后重载" % resultcode)
            initdict = {}
            for i in range(len(list(valueArray))):
                initdict[keyList[i]] = valueArray[i]
            sortedMap = TreeMap(initdict).sort()
            self.trie.build(sortedMap)
            # Re-order the values in place so they follow the sorted key order.
            for i, v in enumerate(sortedMap.values()):
                valueArray[i] = v
        self.trie.save(path + '.trie.dat')
        self.logger.info(path + "加载成功")
        return True
Esempio n. 3
0
class State(object):
    """
    A trie state with a goto table (``success``), a ``failure`` link and the
    pattern ids (``emits``) recorded for it.
    """

    def __init__(self):
        # Length of the pattern string, which is also the depth of this state.
        self.depth = 0
        # Patterns recorded for this state; None until addEmit is first called,
        # afterwards a list sorted in descending order.
        self.emits = None
        # Goto table (transition function): maps the next character to the
        # next state.
        self.success = TreeMap({})
        # Corresponding index in the double array.
        self.index = 0
        # Fail function: the state to jump to when no transition matches.
        self.failure = None

    def init1(self, depth):
        """
        Set the depth of this node.

        :param depth: depth to assign
        :return: this state, so the call can be chained after the constructor
        """
        self.depth = depth
        return self

    def isAcceptable(self):
        """
        Tell whether this is a terminal state.

        :return: True when depth is positive and emits has been set
        """
        return self.depth > 0 and self.emits is not None

    def getDepth(self):
        """
        Get the depth of this node.

        :return: the depth
        """
        return self.depth

    def getLargestValueId(self):
        """
        Get the largest recorded value.

        :return: the first element of emits (the largest one, because addEmit
                 keeps emits sorted in descending order), or None when emits
                 is None or empty
        """
        if not self.emits:
            return None
        # The builtin next() works on Python 2.6+ and Python 3, unlike the
        # Python 2-only iterator method .next().
        return next(iter(self.emits))

    def addEmit(self, keyword):
        """
        Record one matched pattern for this state.

        emits is kept free of duplicates and sorted in descending order; the
        Java original used
        ``new TreeSet<Integer>(Collections.reverseOrder())``.

        :param keyword: value to record
        :return: None
        """
        merged = set(self.emits or ())
        merged.add(keyword)
        self.emits = sorted(merged, reverse=True)

    def addEmit1(self, emits):
        """
        Record several matched patterns for this state.

        :param emits: iterable of values to record; when it is empty the
                      state is left untouched
        :return: None
        """
        additions = list(emits)
        if not additions:
            return
        # Merge and sort once instead of re-sorting after every element.
        self.emits = sorted(set(self.emits or ()).union(additions), reverse=True)

    def nextStateIgnoreRootState(self, character):
        """
        Follow ``character`` without the root fallback.

        :param character: character to transition on
        :return: the next state, or None when there is no transition
        """
        return self.nextState1(character, True)

    def addState(self, character):
        """
        Get the child state for ``character``, creating it when missing.

        :param character: character labelling the transition
        :return: the existing or newly created child state
        """
        # NOTE(review): the key is UTF-8 encoded here, but nextState1 looks up
        # whatever it is given without encoding; confirm callers pass matching
        # key types.
        character = character.encode('utf-8')
        nextState = self.nextStateIgnoreRootState(character)
        if nextState is None:
            nextState = State().init1(self.depth + 1)
            self.success.result[character] = nextState
            # Rebuild the goto table from the updated mapping and sort it.
            self.success = TreeMap(inputDict=self.success.result).sort()
        return nextState

    def getSuccess(self):
        """
        Get the goto table.

        :return: the success map
        """
        return self.success

    def setIndex(self, index):
        """
        Set the index of this state.

        :param index: index to assign
        :return: None
        """
        self.index = index

    def getIndex(self):
        """
        Get the index of this state.

        :return: the index
        """
        return self.index

    def getStates(self):
        """
        Get the states reachable through the goto table.

        :return: the values of the success map
        """
        return self.success.values()

    def setFailure(self, failState, fail):
        """
        Set the failure state.

        :param failState: state to fall back to
        :param fail: indexable table; the entry at this state's index is set
                     to the index of failState
        :return: None
        """
        self.failure = failState
        fail[self.index] = failState.index

    def getEmit(self):
        """
        Get the pattern(s) recorded for this node.

        :return: emits, or an empty set when nothing has been recorded
        """
        if self.emits is None:
            return set()
        return self.emits

    def getTransitions(self):
        """
        Get the characters that have a transition from this state.

        :return: a set built from the keys of the goto table
        """
        return set(self.success.keys())

    def nextState(self, character):
        """
        Transition on ``character``; a failed transition on a state of depth 0
        returns that state itself instead of None.

        :param character: character to transition on
        :return: the next state
        """
        return self.nextState1(character, False)

    def nextState1(self, character, ignoreRootState):
        """
        Transition to the next state.

        :param character: character to transition on
        :param ignoreRootState: when True, a missing transition yields None
                                even on a state of depth 0; when False, a
                                state of depth 0 returns itself instead
        :return: the transition result
        """
        nextState = self.success.get(character)
        if nextState is None and self.depth == 0 and not ignoreRootState:
            return self
        return nextState

    def getFailure(self):
        """
        Get the failure state.

        :return: the failure state
        """
        return self.failure