def moses_tokenize(text):
    """Tokenize Russian ``text`` with Moses and map the tokens back onto it.

    The Moses tokenizer is created on first use and cached in the
    module-level ``MOSES_TOK`` global, so later calls reuse it.
    ``MOSES_TOK`` must already be defined (with a falsy value) at module
    level before the first call.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``find_substrings`` returns for the Moses chunks and the
        original ``text``.
    """
    from mosestokenizer import MosesTokenizer

    global MOSES_TOK
    if not MOSES_TOK:
        # Configure a local object first and publish it only when setup has
        # fully succeeded; otherwise a failure below would leave a tokenizer
        # with the default flags cached for every later call.
        tokenizer = MosesTokenizer('ru')
        # Disable two default behaviours, presumably so that every chunk can
        # still be located verbatim in ``text``:
        # '-no-escape' keeps characters such as `"` literal instead of
        # HTML-escaping them (e.g. to `&quot;`).
        tokenizer.argv.append('-no-escape')
        # '-a' (aggressive hyphen splitting) rewrites `-` as `@-@`.
        tokenizer.argv.remove('-a')
        # Relaunch the tool so the edited argument list is applied.
        tokenizer.restart()
        MOSES_TOK = tokenizer

    chunks = MOSES_TOK(text)
    return find_substrings(chunks, text)
class MosesTokenizer:
    """Callable wrapper around the Moses tokenizer configured for Russian."""

    label = 'mosestokenizer'

    def __init__(self):
        """Start the Moses tool and switch off escaping and hyphen splitting."""
        # Imported lazily and under an alias so the third-party class is not
        # confused with this wrapper, which carries the same name.
        from mosestokenizer import MosesTokenizer as _MosesBackend

        self.tokenizer = _MosesBackend('ru')
        argv = self.tokenizer.argv
        # Keep characters such as `"` literal instead of HTML-escaping them
        # (e.g. to `&quot;`).
        argv.append('-no-escape')
        # Drop aggressive hyphen splitting, which rewrites `-` as `@-@`.
        argv.remove('-a')
        # Relaunch the tool so the edited argument list is applied.
        self.tokenizer.restart()

    def __call__(self, text):
        """Tokenize ``text`` and pass the chunks to ``find_substrings``.

        Returns whatever ``find_substrings`` yields for the Moses chunks and
        the original ``text``.
        """
        return find_substrings(self.tokenizer(text), text)