Code example #1
 def loadtxt(path, defaultNature, map, customNatureCollector):
     """
     Load a user dictionary (appending entries)
     :param path: path of the dictionary file
     :param defaultNature: default nature (part of speech)
     :param map:
     :param customNatureCollector: collects user-defined natures
     :return:
     """
     try:
         initdict = OrderedDict()
         br = open(path, 'r')
         while 1:
             line = br.readline().strip()
             if not line:
                 break
             param = line.split(" ")
             natureCount = (len(param) - 1) // 2
             attribute = None
             if natureCount == 0:
                 attribute = CoreDictionary.Attribute().init5(defaultNature)
             else:
                 attribute = CoreDictionary.Attribute().init1(natureCount)
                 for i in range(natureCount):
                     attribute.nature[i] = Nature.valueOf(param[1 + 2 * i])
                     attribute.frequency[i] = int(param[2 + 2 * i])
                     attribute.totalFrequency += attribute.frequency[i]
             initdict[param[0]] = attribute
         br.close()
         map = TreeMap(initdict)
         return True, map
     except Exception as e:
         Predefine.logger.warning("自定义词典%s读取错误%s" % (path, e))
         return False, map
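
Each line of the text dictionary parsed by loadtxt is a word, optionally followed by nature/frequency pairs. The following stand-alone sketch mirrors that parsing with hypothetical ASCII entries instead of real dictionary data:

# Stand-alone sketch of the "word nature1 freq1 nature2 freq2 ..." line format
# read by loadtxt above; the sample entries are hypothetical.
sample_lines = [
    "foo",              # no pairs -> defaultNature would be used
    "bar n 10 v 3",     # word followed by (nature, frequency) pairs
]
for line in sample_lines:
    param = line.split(" ")
    natureCount = (len(param) - 1) // 2
    pairs = [(param[1 + 2 * i], int(param[2 + 2 * i])) for i in range(natureCount)]
    print("%s -> %s" % (param[0], pairs))
# foo -> []
# bar -> [('n', 10), ('v', 3)]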
Code example #2
    def loadDat(path):
        """
        Load the double array from disk
        :param path:
        :return:
        """
        try:
            byteArray = pickle.load(open(path + Predefine.PIC_EXT, 'rb'))
        except Exception as e:
            byteArray = ByteArray.createByteArray(path + Predefine.BIN_EXT)
            out = open(path + Predefine.PIC_EXT, 'wb')
            pickle.dump(byteArray, out)
            out.close()

        if byteArray is None:
            return False
        size = byteArray.nextInt()
        # compatibility measure: a negative size means the file header stores -size user-defined natures
        if size < 0:
            pass
        attributes = [None] * size
        natureIndexArray = list(Nature)
        for i in range(size):
            # the first int is the total frequency, the second is the number of natures
            currentTotalFrequency = byteArray.nextInt()
            length = byteArray.nextInt()
            attributes[i] = CoreDictionary.Attribute().init1(length)
            attributes[i].totalFrequency = currentTotalFrequency
            for j in range(length):
                attributes[i].nature[j] = natureIndexArray[byteArray.nextInt()]
                attributes[i].frequency[j] = byteArray.nextInt()
        if not CustomDictionary.dat.load(byteArray, attributes):
            return False

        return True
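
The cached binary read by loadDat is a flat sequence of ints: the number of attributes, then for each attribute its total frequency, its number of natures and the (nature index, frequency) pairs, with the double-array data following. A minimal stand-alone sketch of that framing, using a plain iterator and hypothetical values in place of ByteArray:

# Hypothetical int stream mirroring the layout consumed by loadDat above.
stream = iter([
    2,                  # number of attributes
    13, 1, 5, 13,       # attr 0: totalFrequency=13, one nature (index 5, freq 13)
    7, 2, 0, 4, 1, 3,   # attr 1: totalFrequency=7, two natures
])
next_int = lambda: next(stream)

size = next_int()
attributes = []
for _ in range(size):
    total_frequency = next_int()
    length = next_int()
    natures = [(next_int(), next_int()) for _ in range(length)]
    attributes.append((total_frequency, natures))
print(attributes)   # [(13, [(5, 13)]), (7, [(0, 4), (1, 3)])]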
Code example #3
 def getAttribute(word):
     """
     Get the attribute (natures and frequencies) of a word; the core dictionary is checked first, then the custom dictionary
     :param word:
     :return:
     """
     attribute = CoreDictionary.get(word)
     if attribute is not None:
         return attribute
     return CustomDictionary.get(word)
Code example #4
    def initVertex(self, word, realWord, attribute, wordID):
        if attribute is None:
            attribute = CoreDictionary.Attribute().init3(Nature.n, 1)
        self.wordID = wordID
        self.attribute = attribute
        if word is None:
            word = self.compileRealWord(realWord, attribute)

        assert len(realWord) > 0
        self.word = word
        self.realword = realWord.decode()
        return self
Code example #5
 def insertName(name, activeLine, wordNetOptimum, wordNetAll):
     """
     Insert a Japanese person name
     :param name:
     :param activeLine:
     :param wordNetOptimum:
     :param wordNetAll:
     :return:
     """
     if JapanesePersonRecognition.isBadCase(name):
         return
     wordNetOptimum.insert(
         activeLine,
         Vertex().initVertex(Predefine.TAG_PEOPLE, name,
                             CoreDictionary.Attribute().init5(Nature.nrj),
                             NRConstant.WORD_ID), wordNetAll)
Code example #6
 def Recognition(segResult, wordNetOptimum, wordNetAll):
     """
     Perform recognition
     :param segResult: coarse segmentation result
     :param wordNetOptimum: word graph corresponding to the coarse result
     :param wordNetAll: full word graph
     :return:
     """
     sbName = ""
     appendTimes = 0
     i = 0
     # i += 1
     line = 1
     activeLine = 1
     while i < len(segResult) - 1:
         i += 1
         vertex = segResult[i]
         if appendTimes > 0:
             if vertex.guessNature(
             ) == Nature.nrf or TranslatedPersonDictionary.containsKey(
                     vertex.realword):
                 sbName += vertex.realword
                 appendTimes += 1
             else:
                 # recognition finished
                 if appendTimes > 1:
                     wordNetOptimum.insert(
                         activeLine,
                         Vertex().initVertex(
                             Predefine.TAG_PEOPLE, sbName,
                             CoreDictionary.Attribute().init5(Nature.nrf),
                             NRConstant.WORD_ID), wordNetAll)
                 sbName = ""
                 appendTimes = 0
         else:
             # an nrf or nsf nature triggers recognition
             if vertex.guessNature() == Nature.nrf or vertex.getNature(
             ) == Nature.nsf:
                 sbName += vertex.realword
                 appendTimes += 1
                 activeLine = line
         line += len(vertex.realword)
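
The loop in Recognition is a small two-state machine: an nrf (or nsf) vertex opens a candidate name, further nrf vertices (or hits in TranslatedPersonDictionary) are appended, and the candidate is only inserted when at least two pieces were merged. A stand-alone sketch of the same control flow over a toy (word, nature) list, with the dictionary lookup dropped and insertion replaced by a print:

# Toy trace of the merge logic above; the words and tags are hypothetical.
seg_result = [("Vla", "nrf"), ("di", "nrf"), ("mir", "nrf"),
              ("visited", "v"), ("Beijing", "nsf")]

name, append_times = "", 0
for word, nature in seg_result:
    if append_times > 0:
        if nature == "nrf":
            name += word
            append_times += 1
        else:
            if append_times > 1:
                print("recognized translated name: %s" % name)  # -> Vladimir
            name, append_times = "", 0
    elif nature in ("nrf", "nsf"):
        name, append_times = word, 1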
Code example #7
File: WordNet.py  Project: michaelliu03/py-seg
 def add1(self, line, atomSegment):
     """
     Add vertices produced by atomic segmentation
     :param line:
     :param atomSegment:
     :return:
     """
     # store the atomic parts into m_segGraph
     offset = 0
     # Init the cost array
     for atomNode in atomSegment:
         # init the word
         sWord = atomNode.sWord
         nature = Nature.n
         id = -1
         for case in Switch(atomNode.nPOS):
             if case(Predefine.CT_CHINESE):
                 break
             if case(Predefine.CT_INDEX) or case(Predefine.CT_NUM):
                 nature = Nature.m
                 sWord = '未##数'
                 id = CoreDictionary.M_WORD_ID
                 break
             if case(Predefine.CT_DELIMITER) or case(Predefine.CT_OTHER):
                 nature = Nature.w
                 break
             if case(Predefine.CT_SINGLE):
                 nature = Nature.nx
                 sWord = '未##串'
                 id = CoreDictionary.X_WORD_ID
                 break
             if case():
                 break
         # these generic placeholders all have frequencies on the order of 100,000
         self.add(
             line + offset,
             Vertex().initVertex(
                 sWord, atomNode.sWord,
                 CoreDictionary.Attribute().init3(nature, 10000), id))
         offset += len(atomNode.sWord)
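
The "for case in Switch(...)" construct used above (and in Vertex.compileRealWord further below) is the well-known Python switch/case recipe, not a language feature. A minimal sketch of such a helper follows; this is an assumption, and the project's own Switch class may differ in details:

class Switch(object):
    """Minimal switch/case helper, sketched here as an assumption."""

    def __init__(self, value):
        self.value = value
        self.fall = False          # set after a match so later cases fall through

    def __iter__(self):
        # yield the match method exactly once, then end the loop
        yield self.match

    def match(self, *args):
        if self.fall or not args:  # an empty case() acts as the default branch
            return True
        if self.value in args:
            self.fall = True
            return True
        return False

With such a helper, "if case(Predefine.CT_INDEX) or case(Predefine.CT_NUM):" matches either constant, and the trailing break leaves the one-iteration for loop, mimicking a C-style switch.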
Code example #8
File: NRConstant.py  Project: lishidaup/AdaSegment
class NRConstant(object):
    # ID of the word this dictionary focuses on
    WORD_ID = CoreDictionary.getWordID(Predefine.TAG_PEOPLE)
    # Attribute of the word this dictionary focuses on
    ATTRIBUTE = CoreDictionary.get2(WORD_ID)
Code example #9
class PlaceDictionary(object):
    # place-name dictionary
    dictionary = NSDictionary()
    # transition-matrix dictionary
    transformMatrixDictionary = TransformMatrixDictionary()
    # trie used by the Aho-Corasick algorithm
    trie = AhoCorasickDoubleArrayTrie()
    # ID of the word this dictionary focuses on
    WORD_ID = CoreDictionary.getWordID(Predefine.TAG_PLACE)
    # Attribute of the word this dictionary focuses on
    ATTRIBUTE = CoreDictionary.get2(WORD_ID)

    def __init__(self):
        self.load()

    def load(self):
        start = time()
        PlaceDictionary.dictionary.load(Config.PlaceDictionaryPath)
        Predefine.logger.info("%s加载成功,耗时%fms" % (Config.PlaceDictionaryPath,
                                                 (time() - start) * 1000))
        print "%s加载成功,耗时%fms" % (Config.PlaceDictionaryPath,
                                 (time() - start) * 1000)
        PlaceDictionary.transformMatrixDictionary = PlaceDictionary.transformMatrixDictionary.init1(
            NS)
        PlaceDictionary.transformMatrixDictionary.load(
            Config.PlaceDictionaryTrPath)
        init_dict = {}
        init_dict["CDEH"] = "CDEH"
        init_dict["CDH"] = "CDH"
        init_dict["CH"] = "CH"
        init_dict["GH"] = "GH"
        PlaceDictionary.trie.build(TreeMap(init_dict))

    @staticmethod
    def parsePattern(nsList, vertexList, wordNetOptimum, wordNetAll, pld_obj):
        """
        Pattern matching
        :param nsList: the confirmed tag sequence
        :param vertexList: the original sequence without role tags
        :param wordNetOptimum: the graph to be optimized
        :param wordNetAll: the full word graph
        :return:
        """
        sbPattern = ""
        for ns in nsList:
            sbPattern += str(ns)
        pattern = str(sbPattern)
        wordList = []
        for i in range(len(vertexList)):
            wordList.append(vertexList[i].realword)
        wordArray = np.array(wordList)
        PlaceDictionary.trie.parseText1(pattern, wordArray, pld_obj,
                                        wordNetOptimum, wordNetAll)

    @staticmethod
    def isBadCase(name):
        """
        No algorithm can solve 100% of the cases, so there are always some bad cases; these are added to the dictionary in the form "盖公章 A 1".
        This method returns whether the given name is such a bad case.
        :param name:
        :return:
        """
        nrEnumItem = None
        place_list = PlaceDictionary.dictionary.get(name)
        if place_list is not None:
            initdict = dict(place_list)
            nrEnumItem = EnumItem().init3(initdict)
        if nrEnumItem is None:
            return False
        return nrEnumItem.containsLabel(NS.Z)
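
PlaceDictionary.parsePattern concatenates the per-vertex NS tags into one string and lets the Aho-Corasick trie locate the place-name patterns built in load() ("CDEH", "CDH", "CH", "GH"). The stand-alone sketch below illustrates the idea with plain substring search and a hypothetical tag sequence in place of the trie:

# Hypothetical NS tag sequence for six vertices; "A" just means "not part of a place".
ns_tags = ["A", "C", "D", "E", "H", "A"]
pattern = "".join(ns_tags)                      # -> "ACDEHA"

place_patterns = ["CDEH", "CDH", "CH", "GH"]    # keys built in load() above
for p in place_patterns:
    start = pattern.find(p)
    if start >= 0:
        # vertices start .. start + len(p) - 1 form a place-name candidate
        print("pattern %s covers vertices %d..%d" % (p, start, start + len(p) - 1))
# pattern CDEH covers vertices 1..4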
Code example #10
class PersonDictionary(object):
    # person-name dictionary
    dictionary = NRDictionary()
    # transition-matrix dictionary
    transformMatrixDictionary = TransformMatrixDictionary()
    # trie used by the Aho-Corasick algorithm
    trie = AhoCorasickDoubleArrayTrie()

    ATTRIBUTE = CoreDictionary.Attribute().init3(Nature.nr, 100)

    def __init__(self):
        self.logger = Predefine.logger
        self.wordArray = None
        self.offsetArray = None
        self.wordNetOptimum = None
        self.wordNetAll = None

        self.init()

    def init(self):
        start = time()
        if not PersonDictionary.dictionary.load(Config.PersonDictionaryPath):
            self.logger.error("人名词典加载失败:%s" % Config.PersonDictionaryPath)
            sys.exit(0)

        PersonDictionary.transformMatrixDictionary.init1(NR)
        PersonDictionary.transformMatrixDictionary.load(
            Config.PersonDictionaryTrPath)

        initdict = {}
        for pattern in NRPattern:
            initdict[str(pattern)] = pattern
        map = TreeMap(initdict).sort()
        PersonDictionary.trie.build(map)
        self.logger.info("%s加载成功,耗时%fms" % (Config.PersonDictionaryPath,
                                            (time() - start) * 1000))

    @staticmethod
    def parsePattern(nrList, vertexList, wordNetOptimum, wordNetAll, pd_obj):
        """
        Pattern matching
        :param nrList: the confirmed tag sequence
        :param vertexList: the original sequence without role tags
        :param wordNetOptimum: the graph to be optimized
        :param wordNetAll: the full word graph
        """
        # split the U and V tags
        # index for iterating vertexList
        i = -1
        sbPattern = ""
        preNR = NR.A
        backUp = False
        index = 0
        for nr in nrList:
            index += 1
            i += 1
            current = vertexList[i]
            if nr == NR.U:
                if not backUp:
                    i = index - 1
                    backUp = True
                sbPattern += str(NR.K)
                sbPattern += str(NR.B)
                preNR = NR.B

                nowK = current.realword[0:len(current.realword.decode()) - 1]
                nowB = current.realword[len(current.realword.decode()) - 1:]
                vertexList[i] = Vertex().init1(nowK)

                i += 1
                vertexList.insert(i, Vertex().init1(nowB))
                continue
            elif nr == NR.V:
                if not backUp:
                    i = index - 1
                    backUp = True
                if preNR == NR.B:
                    # BE
                    sbPattern += str(NR.E)
                else:
                    # CD
                    sbPattern += str(NR.D)
                sbPattern += str(NR.L)
                # adjust the word strings as well
                # i -= 1
                nowED = current.realword[len(current.realword) - 1:]
                nowL = current.realword[0:len(current.realword) - 1]
                vertexList[i] = Vertex().init1(nowED)
                vertexList.insert(i, Vertex().init1(nowL))
                i += 1
                # i += 1
                continue
            else:

                sbPattern += str(nr)

            # i += 1
            preNR = nr

        pattern = str(sbPattern)
        wordList = []
        for i in range(len(vertexList)):
            wordList.append(vertexList[i].realword)
        wordArray = np.array(wordList)

        offsetArray = [int()] * len(wordArray)
        offsetArray[0] = 0

        for i in range(1, len(wordArray)):
            offsetArray[i] = offsetArray[i - 1] + len(wordArray[i - 1])

        PersonDictionary.trie.parseText(pattern, wordArray, offsetArray,
                                        pd_obj, wordNetOptimum, wordNetAll)

    def isBadCase(self, name):
        """
        No algorithm can solve 100% of the cases, so there are always some bad cases; these are added to the dictionary in the form "盖公章 A 1".
        This method returns whether the given person name is such a bad case.
        :param name:
        :return:
        """
        nrEnumItem = None
        name_list = PersonDictionary.dictionary.get(name)
        if name_list is not None:
            initdict = dict(name_list)
            nrEnumItem = EnumItem().init3(initdict)

        if nrEnumItem is None:
            return False
        return nrEnumItem.containsLabel(NR.A)
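
The offsetArray built at the end of parsePattern records the starting character offset of each word in the flattened sentence, which the trie needs in order to map a pattern match back to positions in the word graph. A minimal sketch of that cumulative sum with hypothetical words:

# Hypothetical wordArray contents; offsets[i] is where words[i] starts in the sentence.
words = ["Putin", "visited", "Beijing"]
offsets = [0] * len(words)
for i in range(1, len(words)):
    offsets[i] = offsets[i - 1] + len(words[i - 1])
print(offsets)   # [0, 5, 12]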
Code example #11
            out = file(path + Predefine.PIC_EXT, 'wb')
            cPickle.dump(byteArray, out)

        if byteArray is None:
            return False
        size = byteArray.nextInt()
        # compatibility measure: a negative size means the file header stores -size user-defined natures
        if size < 0:
            pass
        attributes = [None] * size
        natureIndexArray = list(Nature)
        for i in range(size):
            # the first int is the total frequency, the second is the number of natures
            currentTotalFrequency = byteArray.nextInt()
            length = byteArray.nextInt()
            attributes[i] = CoreDictionary.Attribute().init1(length)
            attributes[i].totalFrequency = currentTotalFrequency
            for j in range(length):
                attributes[i].nature[j] = natureIndexArray[byteArray.nextInt()]
                attributes[i].frequency[j] = byteArray.nextInt()
        if not CustomDictionary.dat.load(byteArray, attributes):
            return False

        return True

    @staticmethod
    def get(key):
        attribute = CustomDictionary.dat.get(key)
        if attribute is not None:
            return attribute
        if CustomDictionary.trie is None:
Code example #12
 def newE():
     return Vertex().initVertex(
         Predefine.TAG_END, ' ',
         CoreDictionary.Attribute().init3(Nature.end,
                                          Predefine.MAX_FREQUENCY / 10),
         CoreDictionary.getWordID(Predefine.TAG_END))
Code example #13
 def newB():
     return Vertex().initVertex(
         Predefine.TAG_BIGIN, ' ',
         CoreDictionary.Attribute().init3(Nature.begin,
                                          Predefine.MAX_FREQUENCY / 10),
         CoreDictionary.getWordID(Predefine.TAG_BIGIN))
Code example #14
class Vertex(object):
    """
    Vertex
    """
    cd = CoreDictionary()
    cbtd = CoreBiGramTableDictionary()

    def __init__(self):
        # the word of this node, or its equivalent word (e.g. 未##数)
        self.word = ''
        # the real word of this node; it never contains ##
        self.realword = ''

        # attribute of the word; modify its internal data with care, since it is shared with the dictionary
        # if a change is needed, create a new Attribute instead
        self.attribute = None
        # equivalent-word ID, which is also the index of the Attribute
        self.wordID = int()
        # index in the one-dimensional vertex array; can be regarded as the id of this vertex
        self.index = int()

        # predecessor node on the shortest path to this node
        self.fromnode = None
        # weight of that shortest path
        self.weight = float()

    @staticmethod
    def newB():
        return Vertex().initVertex(
            Predefine.TAG_BIGIN, ' ',
            CoreDictionary.Attribute().init3(Nature.begin,
                                             Predefine.MAX_FREQUENCY / 10),
            CoreDictionary.getWordID(Predefine.TAG_BIGIN))

    @staticmethod
    def newE():
        return Vertex().initVertex(
            Predefine.TAG_END, ' ',
            CoreDictionary.Attribute().init3(Nature.end,
                                             Predefine.MAX_FREQUENCY / 10),
            CoreDictionary.getWordID(Predefine.TAG_END))

    def updateFrom(self, fromnode):
        weight = fromnode.weight + MathTools.calculateWeight(fromnode, self)
        if self.fromnode is None or self.weight > weight:
            self.fromnode = fromnode
            self.weight = weight

    def initVertex(self, word, realWord, attribute, wordID):
        if attribute is None:
            attribute = CoreDictionary.Attribute().init3(Nature.n, 1)
        self.wordID = wordID
        self.attribute = attribute
        if word is None:
            word = self.compileRealWord(realWord, attribute)

        assert len(realWord) > 0
        self.word = word
        self.realword = realWord.decode()
        return self

    def init1(self, realWord):
        """
        Automatically construct a reasonable vertex
        :param realWord:
        :return:
        """
        return self.initVertex(None, realWord, Vertex.cd.get(realWord), -1)

    def init2(self, realWord, attribute, wordID):
        return self.initVertex(None, realWord, attribute, wordID)

    def init3(self, word, realWord, attribute):
        """
        The most complete constructor
        :param word: compiled (equivalent) word
        :param realWord: real word
        :param attribute: attribute
        :return:
        """
        return self.initVertex(word, realWord, attribute, -1)

    def init4(self, realWord, attribute):
        """
        Constructor for the case where the real word and the compiled word are the same
        :param realWord:
        :param attribute:
        :return:
        """
        return self.init3(None, realWord, attribute)

    def compileRealWord(self, realword, attribute):
        if len(attribute.nature) == 1:
            for case in Switch(attribute.nature[0]):
                if case(Nature.nr) or case(Nature.nr1) or case(
                        Nature.nr2) or case(Nature.nrf) or case(Nature.nrj):
                    self.wordID = Vertex.cd.NR_WORD_ID
                    return Predefine.TAG_PEOPLE
                if case(Nature.ns) or case(Nature.nsf):
                    self.wordID = Vertex.cd.NS_WORD_ID
                    return Predefine.TAG_PLACE
                if case(Nature.nx):
                    self.wordID = Vertex.cd.NX_WORD_ID
                    self.attribute = Vertex.cd.get1(Vertex.cd.NX_WORD_ID)
                    return Predefine.TAG_PROPER
                if case(Nature.nt) or case(Nature.ntc) or case(
                        Nature.ntcf) or case(Nature.ntcb) or case(
                            Nature.ntch) or case(Nature.nto) or case(
                                Nature.ntu) or case(Nature.nts) or case(
                                    Nature.nth) or case(Nature.nit):
                    self.wordID = Vertex.cd.NT_WORD_ID
                    # self.attribute = Vertex.cd.get1(Vertex.cd.NT_WORD_ID)
                    return Predefine.TAG_GROUP
                if case(Nature.m) or case(Nature.mq):
                    self.wordID = Vertex.cd.M_WORD_ID
                    self.attribute = Vertex.cd.get1(Vertex.cd.M_WORD_ID)
                    return Predefine.TAG_NUMBER
                if case(Nature.x):
                    self.wordID = Vertex.cd.X_WORD_ID
                    self.attribute = Vertex.cd.get1(Vertex.cd.X_WORD_ID)
                    return Predefine.TAG_CLUSTER
                if case(Nature.t):
                    self.wordID = Vertex.cd.T_WORD_ID
                    self.attribute = Vertex.cd.get1(Vertex.cd.T_WORD_ID)
                    return Predefine.TAG_TIME
        return realword

    def getNature(self):
        """
        Get the nature (part of speech) of this node; returns None if it is not yet determined
        :return:
        """
        if len(self.attribute.nature) == 1:
            return self.attribute.nature[0]
        return None

    def guessNature(self):
        """
        Guess the most likely nature, i.e. the nature with the highest frequency among this node's natures
        :return:
        """
        return self.attribute.nature[0]

    def getAttribute(self):
        """
        Get the attribute of the word
        :return:
        """
        return self.attribute
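
Vertex.updateFrom is the relaxation step of the shortest-path (Viterbi-style) search over the word graph: a candidate predecessor is kept only if it yields a lower accumulated weight. The stand-alone sketch below mirrors that logic, passing a fixed edge weight in place of MathTools.calculateWeight and using hypothetical numbers:

class Node(object):
    """Toy stand-in for Vertex, keeping only the shortest-path fields."""

    def __init__(self, name, weight=0.0):
        self.name, self.weight, self.fromnode = name, weight, None

    def update_from(self, fromnode, edge_weight):
        # same shape as Vertex.updateFrom, with the edge weight supplied directly
        weight = fromnode.weight + edge_weight
        if self.fromnode is None or self.weight > weight:
            self.fromnode, self.weight = fromnode, weight

a, b, target = Node("a", 1.0), Node("b", 4.0), Node("t")
target.update_from(a, 3.0)   # path via a costs 4.0 and is accepted
target.update_from(b, 0.5)   # path via b costs 4.5 and is rejected
print("%s %.1f" % (target.fromnode.name, target.weight))   # a 4.0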
Code example #15
                        traceFailureState = traceFailureState.getFailure()

                    newFailureState = traceFailureState.nextState(transition)
                    targetState.setFailure(newFailureState, self.ac.fail)
                    targetState.addEmit1(newFailureState.getEmit())
                    self.constructOutput(targetState)

        def loseWeight(self):
            """
            Release unused memory
            :return:
            """
            nbase = [int()] * (self.ac.size + 65535)
            nbase[:self.ac.size] = self.ac.base[:self.ac.size]
            self.ac.base = nbase

            ncheck = [int()] * (self.ac.size + 65535)
            ncheck[:self.ac.size] = self.ac.check[:self.ac.size]
            self.ac.check = ncheck


# exception class used to break out of the outer loop
class Getoutofloop(Exception):
    pass


if __name__ == "__main__":
    a = CoreDictionary()
    ac = AhoCorasickDoubleArrayTrie()
    ac.Builder(ac).resize(9)