def __init__(self, params=None):
    """Initialize the tokenizer.

    Loads the sougou user dictionary from the directory named by the
    ABWDE_HOME environment variable and builds the ABWORDS segmenter
    plus the chunk pre-tokenizer.

    :param params: optional configuration passed through to the
        base Tokenizer class (default None).
    :raises KeyError: if the ABWDE_HOME environment variable is unset.
    """
    from ab.util.abwords import ABWORDS
    import os

    super(ABTokenizer, self).__init__(params)
    # User dictionaries to feed the word segmenter. The Aibang_basicDict
    # and Aibang_groupDict dictionaries were previously disabled; re-add
    # their paths here if they are needed again.
    dictList = []
    # os.path.join is more portable than '+'-concatenating path segments.
    dictList.append(os.path.join(os.environ[self.ABWDE_HOME],
                                 'dict', 'sougou.dict'))
    self._wordparser = ABWORDS(dictList)
    self._imp_tokenizer = ChunkTokenizer()
class ABTokenizer(Tokenizer):
    """Tokenizer that segments text with the ABWORDS word segmenter.

    Input is first split into chunks by ChunkTokenizer; each chunk is
    encoded to GBK, segmented by ABWORDS, and the resulting words are
    decoded back from GBK and yielded one by one.
    """

    # Name of the environment variable holding the ABWDE install root,
    # under which the dict/ directory with user dictionaries lives.
    ABWDE_HOME = "ABWDE_HOME"

    def __init__(self, params=None):
        """Initialize the tokenizer.

        Loads the sougou user dictionary from $ABWDE_HOME/dict and
        builds the ABWORDS segmenter plus the chunk pre-tokenizer.

        :param params: optional configuration passed through to the
            base Tokenizer class (default None).
        :raises KeyError: if the ABWDE_HOME environment variable is unset.
        """
        from ab.util.abwords import ABWORDS
        import os

        super(ABTokenizer, self).__init__(params)
        # User dictionaries to feed the word segmenter. The
        # Aibang_basicDict and Aibang_groupDict dictionaries were
        # previously disabled; re-add their paths here if needed.
        dictList = []
        # os.path.join is more portable than '+'-concatenating segments.
        dictList.append(os.path.join(os.environ[self.ABWDE_HOME],
                                     'dict', 'sougou.dict'))
        self._wordparser = ABWORDS(dictList)
        self._imp_tokenizer = ChunkTokenizer()

    def tokenize(self, stream):
        """Yield segmented words from *stream*.

        :param stream: text to tokenize (presumably a unicode string,
            since it is GBK-encoded before segmentation — confirm with
            callers).
        :returns: generator of unicode words.
        """
        # Fix: the original computed ``gbk_stream = stream.encode('gbk',
        # 'ignore')`` here and never used it — dead work removed.
        for chunk in self._imp_tokenizer.tokenize(stream):
            # ABWORDS operates on GBK bytes; undecodable characters are
            # dropped ('ignore') rather than raising.
            words = self._wordparser.seg_words(chunk.encode('gbk', 'ignore'))
            for word in words:
                yield word.decode('gbk', 'ignore')