Example #1
    def test_final_comma_split_after_number(self):
        moses = MosesTokenizer()
        text = u"Sie sollten vor dem Upgrade eine Sicherung dieser Daten erstellen (wie unter Abschnitt 4.1.1, „Sichern aller Daten und Konfigurationsinformationen“ beschrieben). "
        expected_tokens = [
            "Sie",
            "sollten",
            "vor",
            "dem",
            "Upgrade",
            "eine",
            "Sicherung",
            "dieser",
            "Daten",
            "erstellen",
            "(",
            "wie",
            "unter",
            "Abschnitt",
            "4.1.1",
            ",",
            u"„",
            "Sichern",
            "aller",
            "Daten",
            "und",
            "Konfigurationsinformationen",
            u"“",
            "beschrieben",
            ")",
            ".",
        ]
        self.assertEqual(moses.tokenize(text), expected_tokens)
Example #2
    def test_detokenize_with_aggressive_split(self):
        mt = MosesTokenizer()
        md = MosesDetokenizer()

        text = "foo-bar"
        assert md.detokenize(mt.tokenize(text,
                                         aggressive_dash_splits=True)) == text
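
The round trip above can be reproduced outside the test suite. This is a minimal sketch using only the calls shown in these examples; the `@-@` placeholder is how the tokenizer marks an aggressively split hyphen (see the dedicated test further below), and the detokenizer rejoins it.

from sacremoses import MosesTokenizer, MosesDetokenizer

mt = MosesTokenizer(lang="en")
md = MosesDetokenizer(lang="en")

# Split the hyphen into a separate token marked with @-@ ...
tokens = mt.tokenize("foo-bar", aggressive_dash_splits=True)
print(tokens)                 # ['foo', '@-@', 'bar']
# ... and restore the original surface form.
print(md.detokenize(tokens))  # 'foo-bar'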
Example #3
    def test_protect_patterns(self):
        moses = MosesTokenizer()
        text = "this is a webpage https://stackoverflow.com/questions/6181381/how-to-print-variables-in-perl that kicks ass"
        expected_tokens = [
            "this",
            "is",
            "a",
            "webpage",
            "https://stackoverflow.com/questions/6181381/how-to-print-variables-in-perl",
            "that",
            "kicks",
            "ass",
        ]
        assert (moses.tokenize(
            text, protected_patterns=moses.BASIC_PROTECTED_PATTERNS) ==
                expected_tokens)

        # Testing against pattern from https://github.com/alvations/sacremoses/issues/35
        noe_patterns = [
            r"(?:http|ftp)s?://"  # http:// or https://
            r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?))"
            r"(?::\d+)?"  # optional port
            r"(?:/\w+)*"
            r"(?:(?:\.[a-z]+)|/?)"
        ]
        assert moses.tokenize(
            text, protected_patterns=noe_patterns) == expected_tokens
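
The `protected_patterns` argument accepts any list of regular expressions, not just the built-in `BASIC_PROTECTED_PATTERNS`. As a rough sketch (the e-mail regex and the sample sentence below are illustrative, not part of the library or its tests), a custom pattern keeps matching spans intact as single tokens:

from sacremoses import MosesTokenizer

mt = MosesTokenizer(lang="en")
# Illustrative regex for e-mail addresses; adjust to your data.
email_pattern = [r"[\w.+-]+@[\w-]+\.[\w.-]+"]
print(mt.tokenize("Contact us at support@example.com today.",
                  protected_patterns=email_pattern))
# roughly: ['Contact', 'us', 'at', 'support@example.com', 'today', '.']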
Example #4
    def test_aggressive_split(self):
        moses = MosesTokenizer()
        expected_tokens_wo_aggressive_dash_split = ["foo-bar"]
        expected_tokens_with_aggressive_dash_split = ["foo", "@-@", "bar"]

        assert moses.tokenize(
            "foo-bar") == expected_tokens_wo_aggressive_dash_split
        assert (moses.tokenize("foo-bar", aggressive_dash_splits=True) ==
                expected_tokens_with_aggressive_dash_split)
Example #5
    def test_opening_brackets(self):
        moses = MosesTokenizer()

        text = "By the mid 1990s a version of the game became a Latvian television series (with a parliamentary setting, and played by Latvian celebrities)."

        # echo By the mid 1990s a version of the game became a Latvian television series (with a parliamentary setting, and played by Latvian celebrities). | perl mosesdecoder\scripts\tokenizer\tokenizer.perl en
        expected_tokens = "By the mid 1990s a version of the game became a Latvian television series ( with a parliamentary setting , and played by Latvian celebrities ) .".split(
        )

        assert moses.tokenize(text) == expected_tokens
Example #6
    def test_moses_detokenize(self):
        mt = MosesTokenizer()
        md = MosesDetokenizer()

        text = (
            u"This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf"
        )
        expected_tokens = mt.tokenize(text)
        expected_detokens = u"This, is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf"

        assert md.detokenize(expected_tokens) == expected_detokens

        text = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
        expected_tokens = [
            u"This",
            u"ain",
            u"&apos;t",
            u"funny",
            u".",
            u"It",
            u"&apos;s",
            u"actually",
            u"hillarious",
            u",",
            u"yet",
            u"double",
            u"Ls",
            u".",
            u"&#124;",
            u"&#91;",
            u"&#93;",
            u"&lt;",
            u"&gt;",
            u"&#91;",
            u"&#93;",
            u"&amp;",
            u"You",
            u"&apos;re",
            u"gonna",
            u"shake",
            u"it",
            u"off",
            u"?",
            u"Don",
            u"&apos;t",
            u"?",
        ]
        expected_detokens = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [] & You're gonna shake it off? Don't?"
        assert mt.tokenize(text) == expected_tokens
        assert md.detokenize(expected_tokens) == expected_detokens
Example #7
    def test_hyphen_boundaries(self):
        tokenizer = MosesTokenizer()

        text = "-I'm tired."
        tokens = tokenizer.tokenize(text)
        assert tokens == ["-", "I&apos;m", "tired", "."]

        text = "non-breaking word"
        tokens = tokenizer.tokenize(text)
        assert tokens == ["non-breaking", "word"]

        text = "also-3 3-ventilators word"
        tokens = tokenizer.tokenize(text)
        assert tokens == ["also-3", "3-ventilator", "word"]
Example #8
    def test_mixed_cjk_tokenization(self):
        tokenizer = MosesTokenizer()
        detokenizer = MosesDetokenizer()
        text = u"Japan is 日本 in Japanese."
        assert tokenizer.tokenize(text) == [
            u"Japan",
            u"is",
            u"日",
            u"本",
            u"in",
            u"Japanese",
            u".",
        ]
        assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
Example #9
def tokenize_file(
    iterator,
    language,
    processes,
    quiet,
    xml_escape,
    aggressive_dash_splits,
    protected_patterns,
    custom_nb_prefixes,
):
    moses = MosesTokenizer(lang=language,
                           custom_nonbreaking_prefixes_file=custom_nb_prefixes)

    if protected_patterns:
        with open(protected_patterns, encoding="utf8") as fin:
            protected_patterns = [
                pattern.strip() for pattern in fin.readlines()
            ]

    moses_tokenize = partial(
        moses.tokenize,
        return_str=True,
        aggressive_dash_splits=aggressive_dash_splits,
        escape=xml_escape,
        protected_patterns=protected_patterns,
    )
    return parallel_or_not(iterator, moses_tokenize, processes, quiet)
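
This function is an excerpt from a command-line driver, so it relies on names defined elsewhere in that module (for example `partial` from `functools`, and a `parallel_or_not` helper which, judging by its name and arguments, applies `moses_tokenize` to the input lines either serially or across `processes` workers). A minimal standalone sketch of the same idea, using only the tokenizer API shown in these examples and hypothetical file names, would be:

from sacremoses import MosesTokenizer

mt = MosesTokenizer(lang="en")
# "input.txt" and "output.tok.txt" are placeholder paths.
with open("input.txt", encoding="utf8") as fin, \
        open("output.tok.txt", "w", encoding="utf8") as fout:
    for line in fin:
        # return_str=True gives back a space-joined string instead of a token list.
        fout.write(mt.tokenize(line, return_str=True,
                               aggressive_dash_splits=True,
                               escape=False) + "\n")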
Example #10
    def test_final_dot_unconditionally(self):
        # Make sure that it works for the examples on
        # https://github.com/moses-smt/mosesdecoder/pull/204
        moses = MosesTokenizer(lang="en")
        text = "'So am I."
        expected_tokens = "&apos;So am I .".split()
        self.assertEqual(moses.tokenize(text), expected_tokens)

        moses = MosesTokenizer(lang="fr")
        text = "Des gens admirent une œuvre d'art."
        expected_tokens = "Des gens admirent une œuvre d' art .".split()
        self.assertEqual(moses.tokenize(text), expected_tokens)

        moses = MosesTokenizer(lang="de")
        text = "...schwer wie ein iPhone 5."
        expected_tokens = "... schwer wie ein iPhone 5 .".split()
        self.assertEqual(moses.tokenize(text), expected_tokens)

        moses = MosesTokenizer(lang="cz")
        text = "Dvě děti, které běží bez bot."
        expected_tokens = "Dvě děti , které běží bez bot .".split()
        self.assertEqual(moses.tokenize(text), expected_tokens)
Example #11
    def test_japanese_tokenization(self):
        tokenizer = MosesTokenizer(lang="ja")
        text = u"電話でんわの邪魔じゃまをしないでください"
        assert tokenizer.tokenize(text) == [text]
Example #12
    def test_korean_tokenization(self):
        tokenizer = MosesTokenizer(lang="ko")
        detokenizer = MosesDetokenizer(lang="ko")
        text = u"세계 에서 가장 강력한."
        assert tokenizer.tokenize(text) == [u'세계', u'에서', u'가장', u'강력한', u'.']
        assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
Example #13
    def test_chinese_tokenization(self):
        tokenizer = MosesTokenizer(lang="zh")
        text = u"记者 应谦 美国"
        assert tokenizer.tokenize(text) == [u'记者', u'应谦', u'美国']
Example #14
    def test_french_apostrophes(self):
        tokenizer = MosesTokenizer(lang="fr")
        detokenizer = MosesDetokenizer(lang="fr")

        text = u"L'amitié nous a fait forts d'esprit"
        assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
Example #15
    def test_opening_brackets(self):
        tokenizer = MosesTokenizer()
        detokenizer = MosesDetokenizer()

        text = "By the mid 1990s a version of the game became a Latvian television series (with a parliamentary setting, and played by Latvian celebrities)."
        assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
Example #16
    def test_moses_tokenize(self):
        moses = MosesTokenizer()

        # Tokenize a sentence.
        text = (
            u"This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf"
        )
        expected_tokens = u"This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf"
        tokenized_text = moses.tokenize(text, return_str=True)
        assert tokenized_text == expected_tokens

        # The nonbreaking prefix handling should still split off the final full stop.
        assert moses.tokenize("abc def.") == [u"abc", u"def", u"."]

        # The nonbreaking prefix handling should also cope with a numeric-only prefix
        # being the last token. In the example below, "pp" is the last element and no
        # digit follows it.
        assert moses.tokenize("2016, pp.") == [u"2016", u",", u"pp", u"."]

        # Test escape_xml
        text = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
        expected_tokens_with_xmlescape = [
            "This",
            "ain",
            "&apos;t",
            "funny",
            ".",
            "It",
            "&apos;s",
            "actually",
            "hillarious",
            ",",
            "yet",
            "double",
            "Ls",
            ".",
            "&#124;",
            "&#91;",
            "&#93;",
            "&lt;",
            "&gt;",
            "&#91;",
            "&#93;",
            "&amp;",
            "You",
            "&apos;re",
            "gonna",
            "shake",
            "it",
            "off",
            "?",
            "Don",
            "&apos;t",
            "?",
        ]
        expected_tokens_wo_xmlescape = [
            "This",
            "ain",
            "'t",
            "funny",
            ".",
            "It",
            "'s",
            "actually",
            "hillarious",
            ",",
            "yet",
            "double",
            "Ls",
            ".",
            "|",
            "[",
            "]",
            "<",
            ">",
            "[",
            "]",
            "&",
            "You",
            "'re",
            "gonna",
            "shake",
            "it",
            "off",
            "?",
            "Don",
            "'t",
            "?",
        ]
        assert moses.tokenize(text,
                              escape=True) == expected_tokens_with_xmlescape
        assert moses.tokenize(text,
                              escape=False) == expected_tokens_wo_xmlescape

        # Test to check https://github.com/alvations/sacremoses/issues/19
        text = "this 'is' the thing"
        expected_tokens = ["this", "&apos;", "is", "&apos;", "the", "thing"]
        assert moses.tokenize(text, escape=True) == expected_tokens
Example #17
    def test_dot_splitting(self):
        moses = MosesTokenizer()
        text = "The meeting will take place at 11:00 a.m. Tuesday."
        expected_tokens = (
            "The meeting will take place at 11 : 00 a.m. Tuesday .".split())
        self.assertEqual(moses.tokenize(text), expected_tokens)
Example #18
    def test_trailing_dot_apostrophe(self):
        moses = MosesTokenizer()
        text = "'Hello.'"
        expected_tokens = "&apos;Hello . &apos;".split()
        self.assertEqual(moses.tokenize(text), expected_tokens)