Example #1
0
 def __init__(self):
     """Create the automaton, load the dictionary into it, then build it.

     Every line of ``dict/SogouLabDic.dic`` is stripped and split on tabs:

     * two fields   -- the first field is inserted with the tag
       ``ProperNoun``;
     * three fields -- the first field is inserted once per non-empty,
       comma-separated tag found in the third field, or with ``ProperNoun``
       when the third field holds no non-empty tag.

     Lines with any other number of fields are ignored.
     """
     automaton = AhoCorasick()
     self.__acto = automaton
     with open("dict/SogouLabDic.dic", "r") as dict_file:
         for raw_line in dict_file:
             fields = raw_line.strip().split("\t")
             field_count = len(fields)
             if field_count == 2:
                 entry = fields[0].decode("utf8", "ignore")
                 automaton.insert(entry, u"ProperNoun")
             elif field_count == 3:
                 entry = fields[0].decode("utf8", "ignore")
                 tagged = False
                 for raw_tag in fields[2].split(","):
                     if raw_tag:
                         tagged = True
                         tag = raw_tag.decode("utf8", "ignore")
                         automaton.insert(entry, tag)
                 # No usable tag in the third field: use the default tag.
                 if not tagged:
                     automaton.insert(entry, u"ProperNoun")
     automaton.build_fail()
Example #2
0
 def __init__(self):
     """Create the automaton, load the dictionary into it, then build it.

     Every line of ``dict/SogouLabDic.dic`` is stripped and split on tabs:

     * two fields   -- the first field is inserted with the tag
       ``ProperNoun``;
     * three fields -- the first field is inserted once per non-empty,
       comma-separated tag found in the third field, or with ``ProperNoun``
       when the third field holds no non-empty tag.

     Lines with any other number of fields are ignored.
     """
     automaton = AhoCorasick()
     self.__acto = automaton
     with open("dict/SogouLabDic.dic", "r") as dict_file:
         for raw_line in dict_file:
             fields = raw_line.strip().split("\t")
             field_count = len(fields)
             if field_count == 2:
                 entry = fields[0].decode("utf8", "ignore")
                 automaton.insert(entry, u"ProperNoun")
             elif field_count == 3:
                 entry = fields[0].decode("utf8", "ignore")
                 tagged = False
                 for raw_tag in fields[2].split(","):
                     if raw_tag:
                         tagged = True
                         tag = raw_tag.decode("utf8", "ignore")
                         automaton.insert(entry, tag)
                 # No usable tag in the third field: use the default tag.
                 if not tagged:
                     automaton.insert(entry, u"ProperNoun")
     automaton.build_fail()
Example #3
0
class RawSeg(object):
    """Basic segmenter returning multiple candidate segmentation paths."""

    def __init__(self):
        """Initialize the automaton: load the dictionary, then build it.

        Each line of ``dict/SogouLabDic.dic`` is stripped and split on tabs:

        * two fields   -- the first field is inserted with the default tag
          ``ProperNoun``;
        * three fields -- the first field is inserted once for every
          non-empty, comma-separated tag in the third field, or with
          ``ProperNoun`` when the third field has no non-empty tag.

        Lines with any other number of fields are skipped.
        """
        self.__acto = AhoCorasick()
        with open("dict/SogouLabDic.dic", "r") as fin:
            for line in fin:
                tokens = line.strip().split("\t")
                if len(tokens) not in (2, 3):
                    continue
                word = tokens[0].decode("utf8", "ignore")
                if len(tokens) == 2:
                    self.__acto.insert(word, u"ProperNoun")
                    continue
                # Three fields: the tags come from the third field.
                has_tag = False
                for pos_tag in tokens[2].split(","):
                    if not pos_tag:
                        continue
                    has_tag = True
                    self.__acto.insert(word, pos_tag.decode("utf8", "ignore"))
                if not has_tag:
                    self.__acto.insert(word, u"ProperNoun")
        self.__acto.build_fail()

    def backtrace(self, ulist, N, cidx, wlist, M, widx, path_stk):
        """Enumerate every segmentation path by backtracking.

        Args:
            ulist: the text being segmented; indexed and sliced by position.
            N: length of ``ulist``.
            cidx: position in ``ulist`` where the next word must start.
            wlist: matched words, ordered by start position.  Each entry is
                read as ``entry[0]`` (start), ``entry[1]`` (end, used as the
                exclusive slice bound) and ``entry[2]`` (passed to ``Word``
                as its second argument -- presumably the tag list).
            M: length of ``wlist``.
            widx: index of the first entry of ``wlist`` still to consider.
            path_stk: words chosen so far; pushed and popped in place, so it
                is back to its initial content when the call returns.

        Returns:
            A list of ``Path`` objects, each holding a deep copy of one
            complete word sequence in its ``word_list`` attribute.
        """
        if cidx >= N:
            # The whole text is covered: snapshot the current stack.
            path = Path()
            path.word_list = copy.deepcopy(path_stk)
            return [path]
        if widx >= M:
            # All candidate words have been exhausted, so the remainder can
            # only be split into single characters.  A fresh Path must be
            # created on this branch too before it is filled in.
            path = Path()
            path.word_list = copy.deepcopy(path_stk)
            while cidx < N:
                path.word_list.append(Word(ulist[cidx], [u"Single"]))
                cidx += 1
            return [path]
        path_list = []
        # Option 1: take the character at cidx as a single-character word.
        # (cidx < N is guaranteed by the first guard above.)
        path_stk.append(Word(ulist[cidx], [u"Single"]))
        path_list += self.backtrace(ulist, N, cidx + 1, wlist, M, widx,
                                    path_stk)
        path_stk.pop()
        # Option 2: take each matched word that starts exactly at cidx.
        while widx < M:
            if wlist[widx][0] < cidx:
                # Starts before the current position: no longer usable.
                widx += 1
                continue
            if wlist[widx][0] > cidx:
                # wlist is ordered by start, so nothing later starts at cidx.
                break
            word = Word(ulist[wlist[widx][0]:wlist[widx][1]], wlist[widx][2])
            path_stk.append(word)
            path_list += self.backtrace(ulist, N, wlist[widx][1], wlist, M,
                                        widx + 1, path_stk)
            path_stk.pop()
            widx += 1
        return path_list

    def seg(self, query):
        """Segment ``query`` and return the list of segmentation paths.

        Args:
            query: the text to segment; must be a ``unicode`` object.

        Returns:
            The list of paths produced by ``backtrace`` for the whole query.
        """
        assert isinstance(query, unicode)
        word_idx_list = self.__acto.get_match_word_idx(query)
        # backtrace relies on the matches being ordered by start position.
        word_idx_list = sorted(word_idx_list, key=lambda x: x[0])
        return self.backtrace(query, len(query), 0, word_idx_list,
                              len(word_idx_list), 0, [])
Example #4
0
class RawSeg(object):
    """Basic segmenter returning multiple candidate segmentation paths."""
    def __init__(self):
        """Initialize the automaton: load the dictionary, then build it.

        Each line of ``dict/SogouLabDic.dic`` is stripped and split on tabs:

        * two fields   -- the first field is inserted with the default tag
          ``ProperNoun``;
        * three fields -- the first field is inserted once for every
          non-empty, comma-separated tag in the third field, or with
          ``ProperNoun`` when the third field has no non-empty tag.

        Lines with any other number of fields are skipped.
        """
        self.__acto = AhoCorasick()
        with open("dict/SogouLabDic.dic", "r") as fin:
            for line in fin:
                tokens = line.strip().split("\t")
                if len(tokens) not in (2, 3):
                    continue
                word = tokens[0].decode("utf8", "ignore")
                if len(tokens) == 2:
                    self.__acto.insert(word, u"ProperNoun")
                    continue
                # Three fields: the tags come from the third field.
                has_tag = False
                for pos_tag in tokens[2].split(","):
                    if not pos_tag:
                        continue
                    has_tag = True
                    self.__acto.insert(word, pos_tag.decode("utf8", "ignore"))
                if not has_tag:
                    self.__acto.insert(word, u"ProperNoun")
        self.__acto.build_fail()

    def backtrace(self, ulist, N, cidx, wlist, M, widx, path_stk):
        """Enumerate every segmentation path by backtracking.

        Args:
            ulist: the text being segmented; indexed and sliced by position.
            N: length of ``ulist``.
            cidx: position in ``ulist`` where the next word must start.
            wlist: matched words, ordered by start position.  Each entry is
                read as ``entry[0]`` (start), ``entry[1]`` (end, used as the
                exclusive slice bound) and ``entry[2]`` (passed to ``Word``
                as its second argument -- presumably the tag list).
            M: length of ``wlist``.
            widx: index of the first entry of ``wlist`` still to consider.
            path_stk: words chosen so far; pushed and popped in place, so it
                is back to its initial content when the call returns.

        Returns:
            A list of ``Path`` objects, each holding a deep copy of one
            complete word sequence in its ``word_list`` attribute.
        """
        if cidx >= N:
            # The whole text is covered: snapshot the current stack.
            path = Path()
            path.word_list = copy.deepcopy(path_stk)
            return [path]
        if widx >= M:
            # All candidate words have been exhausted, so the remainder can
            # only be split into single characters.  A fresh Path must be
            # created on this branch too before it is filled in.
            path = Path()
            path.word_list = copy.deepcopy(path_stk)
            while cidx < N:
                path.word_list.append(Word(ulist[cidx], [u"Single"]))
                cidx += 1
            return [path]
        path_list = []
        # Option 1: take the character at cidx as a single-character word.
        # (cidx < N is guaranteed by the first guard above.)
        path_stk.append(Word(ulist[cidx], [u"Single"]))
        path_list += self.backtrace(ulist, N, cidx + 1, wlist, M, widx,
                                    path_stk)
        path_stk.pop()
        # Option 2: take each matched word that starts exactly at cidx.
        while widx < M:
            if wlist[widx][0] < cidx:
                # Starts before the current position: no longer usable.
                widx += 1
                continue
            if wlist[widx][0] > cidx:
                # wlist is ordered by start, so nothing later starts at cidx.
                break
            word = Word(ulist[wlist[widx][0]:wlist[widx][1]], wlist[widx][2])
            path_stk.append(word)
            path_list += self.backtrace(ulist, N, wlist[widx][1], wlist, M,
                                        widx + 1, path_stk)
            path_stk.pop()
            widx += 1
        return path_list

    def seg(self, query):
        """Segment ``query`` and return the list of segmentation paths.

        Args:
            query: the text to segment; must be a ``unicode`` object.

        Returns:
            The list of paths produced by ``backtrace`` for the whole query.
        """
        assert isinstance(query, unicode)
        word_idx_list = self.__acto.get_match_word_idx(query)
        # backtrace relies on the matches being ordered by start position.
        word_idx_list = sorted(word_idx_list, key=lambda x: x[0])
        return self.backtrace(query, len(query), 0, word_idx_list,
                              len(word_idx_list), 0, [])