Ejemplo n.º 1
0
def moses_tokenize(text):
    """Tokenize ``text`` with a shared, lazily created Moses tokenizer.

    On the first call (that is, while the module-level ``MOSES_TOK`` is
    falsy) a ``mosestokenizer.MosesTokenizer`` for the language code
    ``'ru'`` is created, stored in ``MOSES_TOK`` and reconfigured; later
    calls reuse that instance.

    NOTE(review): ``MOSES_TOK`` is expected to be initialised to a falsy
    value elsewhere in this module -- confirm.

    Args:
        text: The string handed to the tokenizer.

    Returns:
        The result of ``find_substrings(tokens, text)``, where ``tokens``
        is the tokenizer output for ``text``.
    """
    from mosestokenizer import MosesTokenizer as _MosesBackend

    global MOSES_TOK
    if MOSES_TOK:
        tokenizer = MOSES_TOK
    else:
        # Publish the instance first, then adjust its command line.
        tokenizer = MOSES_TOK = _MosesBackend('ru')
        command = tokenizer.argv
        # Switch off character escaping (e.g. of double quotes).
        command.append('-no-escape')
        # Drop the flag that rewrites hyphens as "@-@".
        command.remove('-a')
        # Restart so the edited argument list is the one in use.
        tokenizer.restart()

    return find_substrings(tokenizer(text), text)
Ejemplo n.º 2
0
class MosesTokenizer:
    """Callable wrapper around the ``mosestokenizer`` package.

    An instance owns one backend tokenizer created for the language code
    ``'ru'``, with character escaping switched off and the hyphen
    rewriting flag removed. Calling the instance tokenizes a string and
    passes the tokens, together with the original string, to
    ``find_substrings``.
    """

    # Identifier for this tokenizer implementation.
    label = 'mosestokenizer'

    def __init__(self):
        """Create and configure the backend tokenizer."""
        # Imported here rather than at module level; aliased because the
        # backend class has the same name as this wrapper.
        from mosestokenizer import MosesTokenizer as _MosesBackend

        self.tokenizer = backend = _MosesBackend('ru')
        command = backend.argv
        # Switch off character escaping (e.g. of double quotes).
        command.append('-no-escape')
        # Drop the flag that rewrites hyphens as "@-@".
        command.remove('-a')
        # Restart so the edited argument list is the one in use.
        backend.restart()

    def __call__(self, text):
        """Tokenize ``text`` and return ``find_substrings(tokens, text)``."""
        return find_substrings(self.tokenizer(text), text)