Example #1
import os

from ab.util.abwords import ABWORDS

# Tokenizer and ChunkTokenizer are assumed to be defined or imported
# elsewhere in the enclosing module; they are not shown in this example.


class ABTokenizer(Tokenizer):
    # Name of the environment variable pointing at the ABWDE install root.
    ABWDE_HOME = "ABWDE_HOME"

    def __init__(self, params=None):
        super(ABTokenizer, self).__init__(params)
        # User dictionaries, resolved against the ABWDE_HOME directory.
        dictList = []
        # dictList.append(os.environ[self.ABWDE_HOME] + '/dict/Aibang_basicDict.txt')
        # dictList.append(os.environ[self.ABWDE_HOME] + '/dict/Aibang_groupDict.txt')
        dictList.append(os.environ[self.ABWDE_HOME] + '/dict/sougou.dict')
        self._wordparser = ABWORDS(dictList)
        self._imp_tokenizer = ChunkTokenizer()

    def tokenize(self, stream):
        # Split the stream into chunks, then segment each chunk into words.
        # ABWORDS.seg_words() operates on GBK-encoded bytes, so each chunk is
        # encoded before segmentation and each word decoded back to unicode.
        for chunk in self._imp_tokenizer.tokenize(stream):
            words = self._wordparser.seg_words(chunk.encode('gbk', 'ignore'))
            for word in words:
                yield word.decode('gbk', 'ignore')
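
A minimal usage sketch follows. Everything in it is an assumption about the surrounding ab package rather than documented behavior: it assumes the ABWDE_HOME environment variable points at an install tree containing dict/sougou.dict, and that the Tokenizer base class accepts the default params=None. The directory path and sample text are hypothetical.

import os

# Hypothetical install root; the directory must contain dict/sougou.dict.
os.environ.setdefault('ABWDE_HOME', '/opt/abwde')

tokenizer = ABTokenizer()

# tokenize() is a generator that yields unicode words segmented out of a
# GBK-representable input string.
for word in tokenizer.tokenize(u'北京餐厅推荐'):
    print(word)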