def test_detokenize_with_aggressive_split(self):
    mt = MosesTokenizer()
    md = MosesDetokenizer()

    text = "foo-bar"
    assert md.detokenize(mt.tokenize(text, aggressive_dash_splits=True)) == text
def test_mixed_cjk_tokenization(self):
    tokenizer = MosesTokenizer()
    detokenizer = MosesDetokenizer()

    text = u"Japan is 日本 in Japanese."
    assert tokenizer.tokenize(text) == [
        u"Japan",
        u"is",
        u"日",
        u"本",
        u"in",
        u"Japanese",
        u".",
    ]
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
def test_moses_detokenize(self):
    mt = MosesTokenizer()
    md = MosesDetokenizer()

    text = (
        u"This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf"
    )
    expected_tokens = mt.tokenize(text)
    expected_detokens = (
        u"This, is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf"
    )
    assert md.detokenize(expected_tokens) == expected_detokens

    text = (
        "This ain't funny. It's actually hillarious, yet double Ls. "
        "| [] < > [ ] & You're gonna shake it off? Don't?"
    )
    expected_tokens = [
        u"This", u"ain", u"'t", u"funny", u".",
        u"It", u"'s", u"actually", u"hillarious", u",",
        u"yet", u"double", u"Ls", u".",
        u"|", u"[", u"]", u"<", u">", u"[", u"]", u"&",
        u"You", u"'re", u"gonna", u"shake", u"it", u"off", u"?",
        u"Don", u"'t", u"?",
    ]
    expected_detokens = (
        "This ain't funny. It's actually hillarious, yet double Ls. "
        "| [] < > [] & You're gonna shake it off? Don't?"
    )
    assert mt.tokenize(text) == expected_tokens
    assert md.detokenize(expected_tokens) == expected_detokens
def detokenize_file(
    iterator,
    language,
    processes,
    quiet,
    xml_unescape,
):
    moses = MosesDetokenizer(lang=language)
    moses_detokenize = partial(moses.detokenize, return_str=True, unescape=xml_unescape)
    return parallel_or_not(
        list(map(str.split, iterator)), moses_detokenize, processes, quiet
    )
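
# A minimal, non-parallel sketch of what detokenize_file does per input line.
# It assumes only the MosesDetokenizer API used above (detokenize with
# return_str and unescape); the function name and sample values here are
# illustrative and not part of the library or its CLI.
def example_detokenize_lines(lines, language="en", xml_unescape=True):
    from sacremoses import MosesDetokenizer

    md = MosesDetokenizer(lang=language)
    # Each input line is whitespace-separated tokens; rejoin it into natural
    # text, optionally unescaping Moses XML entities (e.g. &amp; -> &).
    return [
        md.detokenize(line.split(), return_str=True, unescape=xml_unescape)
        for line in lines
    ]

# e.g. example_detokenize_lines(["Hello , world !"]) should give ["Hello, world!"]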
def test_korean_tokenization(self):
    tokenizer = MosesTokenizer(lang="ko")
    detokenizer = MosesDetokenizer(lang="ko")

    text = u"세계 에서 가장 강력한."
    assert tokenizer.tokenize(text) == [u"세계", u"에서", u"가장", u"강력한", u"."]
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
def test_french_apostrophes(self):
    tokenizer = MosesTokenizer(lang="fr")
    detokenizer = MosesDetokenizer(lang="fr")

    text = u"L'amitié nous a fait forts d'esprit"
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
def test_opening_brackets(self):
    tokenizer = MosesTokenizer()
    detokenizer = MosesDetokenizer()

    text = (
        "By the mid 1990s a version of the game became a Latvian television series "
        "(with a parliamentary setting, and played by Latvian celebrities)."
    )
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text