Beispiel #1
0
class EnumItem(Enum):
    """Entry that associates labels with integer frequencies.

    The frequencies are held in the plain dict ``initdict`` and mirrored in
    ``labelMap``, a ``TreeMap`` that every ``init*`` method rebuilds from
    ``initdict``.
    """

    def __init__(self):
        Enum.__init__(self)
        self.initdict = {}
        self.labelMap = TreeMap({})  # TreeMap object

    def getFrequency(self, label):
        """Return the frequency stored for ``label``, or 0 if it is absent."""
        frequency = self.labelMap.get(label)
        return 0 if frequency is None else frequency

    def init1(self, label, frequency):
        """
        Record a single ``label`` -> ``frequency`` pair and rebuild the map.

        :param label: label to store
        :param frequency: frequency to associate with the label
        :return: self, so calls can be chained
        """
        self.initdict[label] = frequency
        self.labelMap = TreeMap(self.initdict)
        return self

    def init2(self, *args):
        """
        Create an entry in which every given label has frequency 1.

        :param args: the labels to store
        :return: self, so calls can be chained
        """
        for label in args:
            self.initdict[label] = 1
        self.labelMap = TreeMap(self.initdict)
        return self

    def init3(self, initdict):
        """
        Replace the stored frequencies with ``initdict`` and rebuild the map.

        The dict is stored by reference, not copied.

        :param initdict: mapping of label -> frequency
        :return: self, so calls can be chained
        """
        self.initdict = initdict
        self.labelMap = TreeMap(self.initdict)
        return self

    def init4(self, initdict):
        """
        Same behaviour as :meth:`init3`; kept as a separate entry point.

        :param initdict: mapping of label -> frequency
        :return: self, so calls can be chained
        """
        return self.init3(initdict)

    def containsLabel(self, label):
        """Return True when ``label`` is one of the keys of the label map."""
        return label in self.labelMap.result.keys()

    @staticmethod
    def create(param):
        """
        Parse a space-separated line of the form ``word label freq ...``.

        :param param: the line to parse, or None
        :return: None when ``param`` is None, otherwise the result of
                 :meth:`create1` on the fields split on single spaces
        """
        if param is None:
            return None
        return EnumItem.create1(param.split(' '))

    @staticmethod
    def create1(param):
        """
        Build ``{word: [{label: freq}, ...]}`` from a list of fields.

        ``param[0]`` is the word; the remaining fields are read as
        (label, frequency) pairs, with each frequency converted by ``int``.

        :param param: sequence ``[word, label1, freq1, label2, freq2, ...]``
        :return: None when the number of fields is even (including empty),
                 otherwise a one-item dict mapping the word to a list of
                 single-item ``{label: freq}`` dicts
        """
        if len(param) % 2 == 0:
            return None

        # Floor division keeps the count an int on Python 3 as well; with
        # "/" it becomes a float and cannot size a list or drive range().
        natureCount = (len(param) - 1) // 2
        entries = [
            {param[1 + 2 * i]: int(param[2 + 2 * i])}
            for i in range(natureCount)
        ]
        return {param[0]: entries}
    def load(self, path):
        """
        Load the bigram dictionary, trying the cached dat file first.

        When ``self.loadDat(CoreBiGramTableDictionary.datPath)`` succeeds
        nothing else is done. Otherwise the text file at ``path`` is parsed:
        each non-blank line is split on single spaces, the first field is
        split on ``@`` into two words and the second field is read as an
        integer frequency. Lines whose words are not found by
        ``CoreDictionary.trie.exactMatchSearch`` (result ``-1``) are skipped.

        The result is written to two class-level lists:

        * ``CoreBiGramTableDictionary.pair`` holds, for every stored bigram,
          the second word id followed by the frequency;
        * ``CoreBiGramTableDictionary.start[i + 1]`` holds the number of
          bigrams stored for first-word ids ``0..i``.

        :param path: path of the text bigram dictionary
        :return: True when the table was loaded (from the dat file or from
                 ``path``), False when reading ``path`` raised ``IOError``
        """
        if self.loadDat(CoreBiGramTableDictionary.datPath):
            return True

        # first word id -> TreeMap(second word id -> frequency)
        bigram_map = TreeMap({})
        try:
            total = 0
            maxWordId = CoreDictionary.trie.size1()

            # "with" guarantees the file is closed even if parsing fails.
            with open(path, 'r') as reader:
                for raw_line in reader:
                    line = raw_line.strip("\n\r\t ")
                    if not line:
                        # A blank line is not end-of-file; skip it instead
                        # of silently dropping the rest of the dictionary.
                        continue

                    params = line.split(' ')
                    twoWord = params[0].split("@")

                    idA = CoreDictionary.trie.exactMatchSearch(twoWord[0])
                    if idA == -1:
                        continue
                    idB = CoreDictionary.trie.exactMatchSearch(twoWord[1])
                    if idB == -1:
                        continue

                    freq = int(params[1])
                    biMap = bigram_map.get(idA)
                    if biMap is None:
                        biMap = TreeMap({})

                    biMap.put(int(idB), freq)
                    bigram_map.put(int(idA), biMap)

                    # Two slots (id, frequency) are reserved per accepted line.
                    total += 2

            for k, v in bigram_map.items():
                bigram_map.put(k, v.sort_long())

            bigram_map.sort_long()

            CoreBiGramTableDictionary.start = [0] * (maxWordId + 1)
            # total is twice the number of accepted lines
            CoreBiGramTableDictionary.pair = [0] * total
            offset = 0
            for i in range(maxWordId):
                bMap = bigram_map.get(i)
                if bMap is not None:
                    for k, v in bMap.items():
                        index = offset << 1
                        CoreBiGramTableDictionary.pair[index] = k
                        CoreBiGramTableDictionary.pair[index + 1] = v
                        offset += 1
                CoreBiGramTableDictionary.start[i + 1] = offset

            # The original message had a "%s" placeholder but no argument.
            self.logger.info("二元词典读取完毕:%s" % path)
        except IOError as e:
            # self.logger is used as an object with logging methods (see the
            # info call above); calling it directly would fail.
            self.logger.error("二元词典%s不存在或读取错误!%s" % (path, e))
            return False
        # Report success explicitly so callers testing the result do not
        # mistake a successful text load (formerly None) for a failure.
        return True
Beispiel #3
0
class State(object):
    """A node of a pattern-matching trie with goto and failure links."""

    def __init__(self):
        # Length of the pattern string, which is also the depth of this state
        self.depth = 0
        # Pattern strings recorded for this state (None until one is added)
        self.emits = None
        # goto table, also called the transition function: maps the next
        # character of the string to the next state
        self.success = TreeMap({})
        # Corresponding index in the double array
        self.index = 0
        # Failure function: the state to jump to when nothing matches
        self.failure = None

    def init1(self, depth):
        """
        Set the depth of this node.

        :param depth: depth to assign
        :return: self, so the call can be chained after construction
        """
        self.depth = depth
        return self

    def isAcceptable(self):
        """
        Tell whether this is a terminal state.

        :return: True when the depth is positive and emits have been set
        """
        return self.depth > 0 and self.emits is not None

    def getDepth(self):
        """
        Get the depth of this node.

        :return: the stored depth
        """
        return self.depth

    def getLargestValueId(self):
        """
        Get the largest emitted value.

        ``addEmit`` keeps ``emits`` sorted in descending order, so the first
        element is the largest one.

        :return: the first element of ``emits``, or None when there is none
        """
        if not self.emits:
            return None
        # next(iter(...)) works on Python 2.6+ and 3; the iterator method
        # ``.next()`` only exists on Python 2.
        return next(iter(self.emits))

    def addEmit(self, keyword):
        """
        Add one matched pattern (this state corresponds to that pattern).

        Duplicates are dropped and ``emits`` is stored as a list sorted in
        descending order (standing in for a reverse-ordered tree set).

        :param keyword: the value to record
        :return: None
        """
        merged = set() if self.emits is None else set(self.emits)
        merged.add(keyword)
        self.emits = sorted(merged, reverse=True)

    def addEmit1(self, emits):
        """
        Add several matched patterns.

        :param emits: iterable of values, each passed to :meth:`addEmit`
        :return: None
        """
        for emit in emits:
            self.addEmit(emit)

    def nextStateIgnoreRootState(self, character):
        """
        Transition on ``character`` without the root fallback.

        :param character: key to look up in the goto table
        :return: the next state, or None when there is no transition
        """
        return self.nextState1(character, True)

    def addState(self, character):
        """
        Return the child reached by ``character``, creating it if missing.

        NOTE: ``character`` is UTF-8 encoded here before lookup and storage,
        whereas :meth:`nextState` and :meth:`nextState1` use their argument
        as given.

        :param character: text character labelling the transition
        :return: the existing or newly created child state
        """
        character = character.encode('utf-8')
        nextState = self.nextStateIgnoreRootState(character)
        if nextState is None:
            nextState = State().init1(self.depth + 1)
            self.success.result[character] = nextState
            # Rebuild the goto table from its contents and sort it.
            self.success = TreeMap(inputDict=self.success.result).sort()
        return nextState

    def getSuccess(self):
        """
        Get the goto table.

        :return: the ``success`` map
        """
        return self.success

    def setIndex(self, index):
        """Store ``index`` as this state's index."""
        self.index = index

    def getIndex(self):
        """Return this state's index."""
        return self.index

    def getStates(self):
        """Return the values of the goto table (the child states)."""
        return self.success.values()

    def setFailure(self, failState, fail):
        """
        Set the failure state.

        :param failState: state to fall back to
        :param fail: indexable table; ``fail[self.index]`` is set to the
                     index of ``failState``
        :return: None
        """
        self.failure = failState
        fail[self.index] = failState.index

    def getEmit(self):
        """
        Get the pattern(s) represented by this node.

        :return: an empty set when nothing has been added, otherwise the
                 stored ``emits`` (a descending list once ``addEmit`` ran)
        """
        if self.emits is None:
            return set()
        return self.emits

    def getTransitions(self):
        """Return the keys of the goto table as a set."""
        return set(self.success.keys())

    def nextState(self, character):
        """
        Transition on ``character``; a failed transition from a state of
        depth 0 (the root) returns that state itself instead of None.

        :param character: key to look up in the goto table
        :return: the next state; None only for a failed transition from a
                 state whose depth is not 0
        """
        return self.nextState1(character, False)

    def nextState1(self, character, ignoreRootState):
        """
        Move to the next state.

        :param character: the character to transition on
        :param ignoreRootState: whether to skip the root fallback; when
               False, a failed lookup on a depth-0 state returns that state
        :return: the transition result (may be None)
        """
        target = self.success.get(character)
        if target is None and self.depth == 0 and not ignoreRootState:
            return self
        return target

    def getFailure(self):
        """
        Get the failure state.

        :return: the stored failure state, or None if unset
        """
        return self.failure