Example 1
 def test_final_comma_split_after_number(self):
     moses = MosesTokenizer()
     text = u"Sie sollten vor dem Upgrade eine Sicherung dieser Daten erstellen (wie unter Abschnitt 4.1.1, „Sichern aller Daten und Konfigurationsinformationen“ beschrieben). "
     expected_tokens = [
         "Sie",
         "sollten",
         "vor",
         "dem",
         "Upgrade",
         "eine",
         "Sicherung",
         "dieser",
         "Daten",
         "erstellen",
         "(",
         "wie",
         "unter",
         "Abschnitt",
         "4.1.1",
         ",",
         u"„",
         "Sichern",
         "aller",
         "Daten",
         "und",
         "Konfigurationsinformationen",
         u"“",
         "beschrieben",
         ")",
         ".",
     ]
     self.assertEqual(moses.tokenize(text), expected_tokens)
Example 2
def tokenize_file(iterator, language, processes, quiet):
    moses = MosesTokenizer(lang=language)
    moses_tokenize = partial(
        moses.tokenize,
        return_str=True,
    )
    return parallel_or_not(iterator, moses_tokenize, processes, quiet)
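Note: the parallel_or_not helper used here and in several later examples is not shown on this page. As a rough sketch only (an assumption, not the actual sacremoses CLI implementation), such a dispatcher could look like this, reusing the parallelize_preprocess utility that appears in the later examples:

from tqdm import tqdm
from sacremoses.util import parallelize_preprocess  # assumed import path

def parallel_or_not(iterator, func, processes, quiet):
    # Hypothetical sketch: with a single process, joblib parallelization is
    # slower, so just loop normally; otherwise fan out across workers.
    if processes == 1:
        for line in tqdm(iterator, disable=quiet):
            yield func(line)
    else:
        for outline in parallelize_preprocess(
            func, list(iterator), processes, progress_bar=(not quiet)
        ):
            yield outline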
Example 3
def tokenize_file(
    iterator,
    language,
    processes,
    quiet,
    xml_escape,
    aggressive_dash_splits,
    protected_patterns,
    custom_nb_prefixes,
):
    moses = MosesTokenizer(
        lang=language, custom_nonbreaking_prefixes_file=custom_nb_prefixes
    )

    if protected_patterns:
        if protected_patterns == ":basic:":
            protected_patterns = moses.BASIC_PROTECTED_PATTERNS
        elif protected_patterns == ":web:":
            protected_patterns = moses.WEB_PROTECTED_PATTERNS
        else:
            with open(protected_patterns, encoding="utf8") as fin:
                protected_patterns = [pattern.strip() for pattern in fin.readlines()]

    moses_tokenize = partial(
        moses.tokenize,
        return_str=True,
        aggressive_dash_splits=aggressive_dash_splits,
        escape=xml_escape,
        protected_patterns=protected_patterns,
    )
    return parallel_or_not(iterator, moses_tokenize, processes, quiet)
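For reference, a hedged sketch of how the three accepted values of the protected-patterns option above map onto direct MosesTokenizer.tokenize calls; the sample sentence and the patterns file path are invented for illustration, and the exact tokenization is not asserted here:

from sacremoses import MosesTokenizer

moses = MosesTokenizer(lang="en")
text = "Contact us at https://example.com/help for details."  # illustrative input

# ":basic:" and ":web:" select pattern lists bundled with the tokenizer.
print(moses.tokenize(text, return_str=True,
                     protected_patterns=moses.BASIC_PROTECTED_PATTERNS))
print(moses.tokenize(text, return_str=True,
                     protected_patterns=moses.WEB_PROTECTED_PATTERNS))

# Any other value is treated as a file with one regex per line
# ("my_patterns.txt" is a hypothetical path).
with open("my_patterns.txt", encoding="utf8") as fin:
    custom_patterns = [pattern.strip() for pattern in fin]
print(moses.tokenize(text, return_str=True, protected_patterns=custom_patterns))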
Example 4
    def test_aggressive_split(self):
        moses = MosesTokenizer()
        expected_tokens_wo_aggressive_dash_split = ['foo-bar']
        expected_tokens_with_aggressive_dash_split = ['foo', '@-@', 'bar']

        assert moses.tokenize('foo-bar') == expected_tokens_wo_aggressive_dash_split
        assert moses.tokenize('foo-bar', aggressive_dash_splits=True) == expected_tokens_with_aggressive_dash_split
Example 5
    def test_moses_tokenize(self):
        moses = MosesTokenizer()

        # Tokenize a sentence.
        text = u'This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
        expected_tokens = u'This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
        tokenized_text = moses.tokenize(text, return_str=True)
        assert tokenized_text == expected_tokens

        # The nonbreaking prefixes should tokenize the final fullstop.
        assert moses.tokenize('abc def.') == [u'abc', u'def', u'.']

        # The nonbreaking prefixes should handle the case where a numeric-only prefix is the last token.
        # In the example below, "pp" is the last element, and there is no digit after it.
        assert moses.tokenize('2016, pp.') == [u'2016', u',', u'pp', u'.']

        # Test escape_xml
        text = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
        expected_tokens_with_xmlescape = [
            'This', 'ain', '&apos;t', 'funny', '.', 'It', '&apos;s',
            'actually', 'hillarious', ',', 'yet', 'double', 'Ls', '.',
            '&#124;', '&#91;', '&#93;', '&lt;', '&gt;', '&#91;', '&#93;',
            '&amp;', 'You', '&apos;re', 'gonna', 'shake', 'it', 'off', '?',
            'Don', '&apos;t', '?'
        ]
        expected_tokens_wo_xmlescape = [
            'This', 'ain', "'t", 'funny', '.', 'It', "'s", 'actually',
            'hillarious', ',', 'yet', 'double', 'Ls', '.', '|', '[', ']', '<',
            '>', '[', ']', '&', 'You', "'re", 'gonna', 'shake', 'it', 'off',
            '?', 'Don', "'t", '?'
        ]
        assert moses.tokenize(text,
                              escape=True) == expected_tokens_with_xmlescape
        assert moses.tokenize(text,
                              escape=False) == expected_tokens_wo_xmlescape
Example 6
def tokenize_file(language, processes, xml_escape, aggressive_dash_splits,
                  protected_patterns, custom_nb_prefixes, encoding, quiet):
    moses = MosesTokenizer(lang=language,
                           custom_nonbreaking_prefixes_file=custom_nb_prefixes)

    if protected_patterns:
        with open(protected_patterns, encoding="utf8") as fin:
            protected_patterns = [
                pattern.strip() for pattern in fin.readlines()
            ]

    moses_tokenize = partial(
        moses.tokenize,
        return_str=True,
        aggressive_dash_splits=aggressive_dash_splits,
        escape=xml_escape,
        protected_patterns=protected_patterns,
    )

    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # If it's a single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                for line in tqdm(fin.readlines()):
                    print(moses_tokenize(line), end="\n", file=fout)
            else:
                for outline in parallelize_preprocess(
                        moses_tokenize,
                        fin.readlines(),
                        processes,
                        progress_bar=(not quiet)):
                    print(outline, end="\n", file=fout)
Example 7
    def test_protect_patterns(self):
        moses = MosesTokenizer()
        text = "this is a webpage https://stackoverflow.com/questions/6181381/how-to-print-variables-in-perl that kicks ass"
        expected_tokens = [
            "this",
            "is",
            "a",
            "webpage",
            "https://stackoverflow.com/questions/6181381/how-to-print-variables-in-perl",
            "that",
            "kicks",
            "ass",
        ]
        assert (
            moses.tokenize(text, protected_patterns=moses.BASIC_PROTECTED_PATTERNS)
            == expected_tokens
        )

        # Testing against pattern from https://github.com/alvations/sacremoses/issues/35
        noe_patterns = [
            r"(?:http|ftp)s?://"  # http:// or https://
            r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?))"
            r"(?::\d+)?"  # optional port
            r"(?:/\w+)*"
            r"(?:(?:\.[a-z]+)|/?)"
        ]
        assert moses.tokenize(text, protected_patterns=noe_patterns) == expected_tokens
Example 8
    def test_opening_brackets(self):
        tokenizer = MosesTokenizer()
        detokenizer = MosesDetokenizer()

        text = "By the mid 1990s a version of the game became a Latvian television series (with a parliamentary setting, and played by Latvian celebrities)."

        assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
Example 9
    def test_detokenize_with_aggressive_split(self):
        mt = MosesTokenizer()
        md = MosesDetokenizer()

        text = 'foo-bar'
        assert md.detokenize(mt.tokenize(text,
                                         aggressive_dash_splits=True)) == text
Example 10
 def test_dot_splitting(self):
     moses = MosesTokenizer()
     text = "The meeting will take place at 11:00 a.m. Tuesday."
     expected_tokens = (
         "The meeting will take place at 11 : 00 a.m. Tuesday .".split()
     )
     self.assertEqual(moses.tokenize(text), expected_tokens)
Example 11
def tokenize_file(iterator, language, processes, quiet, aggressive_dash_splits):
    moses = MosesTokenizer(lang=language)
    moses_tokenize = partial(
        moses.tokenize,
        return_str=True,
        aggressive_dash_splits=aggressive_dash_splits,
    )
    return parallel_or_not(iterator, moses_tokenize, processes, quiet)
Example 12
    def test_opening_brackets(self):
        moses = MosesTokenizer()

        text = "By the mid 1990s a version of the game became a Latvian television series (with a parliamentary setting, and played by Latvian celebrities)."

        # echo By the mid 1990s a version of the game became a Latvian television series (with a parliamentary setting, and played by Latvian celebrities). | perl mosesdecoder\scripts\tokenizer\tokenizer.perl en
        expected_tokens = "By the mid 1990s a version of the game became a Latvian television series ( with a parliamentary setting , and played by Latvian celebrities ) .".split()

        assert moses.tokenize(text) == expected_tokens
Example 13
    def test_aggressive_split(self):
        moses = MosesTokenizer()
        expected_tokens_wo_aggressive_dash_split = ["foo-bar"]
        expected_tokens_with_aggressive_dash_split = ["foo", "@-@", "bar"]

        assert moses.tokenize(
            "foo-bar") == expected_tokens_wo_aggressive_dash_split
        assert (moses.tokenize("foo-bar", aggressive_dash_splits=True) ==
                expected_tokens_with_aggressive_dash_split)
Example 14
 def test_stress_has_numeric_only_prefixes(self):
     """Stress testing to prevent redos."""
     moses = MosesTokenizer()
     for i in range(1, 10):
         start_time = time.perf_counter()
         payload = " " + " " * (i * 500) + ""
         moses.has_numeric_only(payload)
         stop_time = time.perf_counter() - start_time
         assert stop_time < 20
Example 15
 def test_final_comma_split_after_number(self):
     moses = MosesTokenizer()
     text = u"Sie sollten vor dem Upgrade eine Sicherung dieser Daten erstellen (wie unter Abschnitt 4.1.1, „Sichern aller Daten und Konfigurationsinformationen“ beschrieben). "
     expected_tokens = [
         'Sie', 'sollten', 'vor', 'dem', 'Upgrade', 'eine', 'Sicherung',
         'dieser', 'Daten', 'erstellen', '(', 'wie', 'unter', 'Abschnitt',
         '4.1.1', ',', u'„', 'Sichern', 'aller', 'Daten', 'und',
         'Konfigurationsinformationen', u'“', 'beschrieben', ')', '.'
     ]
     self.assertEqual(moses.tokenize(text), expected_tokens)
Example 16
    def test_final_dot_unconditionally(self):
        # Make sure that it works for examples on
        # https://github.com/moses-smt/mosesdecoder/pull/204
        text = "'So am I."
        expected_tokens = "&apos;So am I .".split()
        self.assertEqual(moses.tokenize(text), expected_tokens)

        moses = MosesTokenizer(lang="fr")
        text = "Des gens admirent une œuvre d'art."
        expected_tokens = "Des gens admirent une œuvre d' art .".split()
        self.assertEqual(moses.tokenize(text), expected_tokens)

        moses = MosesTokenizer(lang="de")
        text = "...schwer wie ein iPhone 5."
        expected_tokens = "... schwer wie ein iPhone 5 .".split()
        self.assertEqual(moses.tokenize(text), expected_tokens)

        moses = MosesTokenizer(lang="cz")
        text = "Dvě děti, které běží bez bot."
        expected_tokens = "Dvě děti , které běží bez bot .".split()
        self.assertEqual(moses.tokenize(text), expected_tokens)
Example 17
 def test_mixed_cjk_tokenization(self):
     tokenizer = MosesTokenizer()
     detokenizer = MosesDetokenizer()
     text = u"Japan is 日本 in Japanese."
     assert tokenizer.tokenize(text) == [
         u"Japan",
         u"is",
         u"日",
         u"本",
         u"in",
         u"Japanese",
         u".",
     ]
     assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
Example 18
    def test_moses_detokenize(self):
        mt = MosesTokenizer()
        md = MosesDetokenizer()

        text = u'This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
        expected_tokens = mt.tokenize(text)
        expected_detokens = u'This, is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
        assert md.detokenize(expected_tokens) == expected_detokens

        text = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
        expected_tokens = [u'This', u'ain', u'&apos;t', u'funny', u'.', u'It', u'&apos;s', u'actually', u'hillarious', u',', u'yet', u'double', u'Ls', u'.', u'&#124;', u'&#91;', u'&#93;', u'&lt;', u'&gt;', u'&#91;', u'&#93;', u'&amp;', u'You', u'&apos;re', u'gonna', u'shake', u'it', u'off', u'?', u'Don', u'&apos;t', u'?']
        expected_detokens = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [] & You're gonna shake it off? Don't?"
        assert mt.tokenize(text) == expected_tokens
        assert md.detokenize(expected_tokens) == expected_detokens
Example 19
    def test_moses_detokenize(self):
        mt = MosesTokenizer()
        md = MosesDetokenizer()

        text = (
            u"This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf"
        )
        expected_tokens = mt.tokenize(text)
        expected_detokens = u"This, is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf"

        assert md.detokenize(expected_tokens) == expected_detokens

        text = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
        expected_tokens = [
            u"This",
            u"ain",
            u"&apos;t",
            u"funny",
            u".",
            u"It",
            u"&apos;s",
            u"actually",
            u"hillarious",
            u",",
            u"yet",
            u"double",
            u"Ls",
            u".",
            u"&#124;",
            u"&#91;",
            u"&#93;",
            u"&lt;",
            u"&gt;",
            u"&#91;",
            u"&#93;",
            u"&amp;",
            u"You",
            u"&apos;re",
            u"gonna",
            u"shake",
            u"it",
            u"off",
            u"?",
            u"Don",
            u"&apos;t",
            u"?",
        ]
        expected_detokens = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [] & You're gonna shake it off? Don't?"
        assert mt.tokenize(text) == expected_tokens
        assert md.detokenize(expected_tokens) == expected_detokens
Example 20
def tokenize_file(processes, xml_escape, aggressive_dash_splits):
    moses = MosesTokenizer()
    moses_tokenize = partial(moses.tokenize,
                             return_str=True,
                             aggressive_dash_splits=aggressive_dash_splits,
                             escape=xml_escape)

    with click.get_text_stream('stdin') as fin, click.get_text_stream(
            'stdout') as fout:
        # If it's a single process, joblib parallelization is slower,
        # so just process line by line normally.
        if processes == 1:
            for line in tqdm(fin.readlines()):
                print(moses_tokenize(line), end='\n', file=fout)
        else:
            for outline in parallelize_preprocess(moses_tokenize,
                                                  fin.readlines(),
                                                  processes,
                                                  progress_bar=True):
                print(outline, end='\n', file=fout)
Example 21
def tokenize_file(
    language,
    processes,
    quiet):

    moses = MosesTokenizer(lang=language)

    moses_tokenize = partial(
        moses.tokenize,
        return_str=True,
    )

    def processor(iterator):
        if processes == 1:
            for line in list(iterator):
                yield moses_tokenize(line)
        else:
            for outline in parallelize_preprocess(
                moses_tokenize, list(iterator), processes, progress_bar=(not quiet)
            ):
                yield outline
    return processor
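Because this variant returns a processor generator instead of writing output itself, a minimal driver for it could look like the following (an assumed usage, with an arbitrary language code):

import sys

# Hypothetical driver for the tokenize_file variant above.
process = tokenize_file(language="en", processes=1, quiet=True)
for tokenized_line in process(sys.stdin):
    print(tokenized_line)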
Example 22
    def test_protect_patterns(self):
        moses = MosesTokenizer()
        text = "this is a webpage https://stackoverflow.com/questions/6181381/how-to-print-variables-in-perl that kicks ass"
        expected_tokens = [
            'this', 'is', 'a', 'webpage',
            'https://stackoverflow.com/questions/6181381/how-to-print-variables-in-perl',
            'that', 'kicks', 'ass'
        ]
        assert moses.tokenize(text,
                              protected_patterns=moses.BASIC_PROTECTED_PATTERNS
                              ) == expected_tokens

        # Testing against pattern from https://github.com/alvations/sacremoses/issues/35
        noe_patterns = [
            r'(?:http|ftp)s?://'  # http:// or https://
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?))'
            r'(?::\d+)?'  # optional port
            r'(?:/\w+)*'
            r'(?:(?:\.[a-z]+)|/?)'
        ]
        assert moses.tokenize(
            text, protected_patterns=noe_patterns) == expected_tokens
Example 23
    "--output-stdout",
    action='store_true',
    help="output as tab-separated similar to the input file instead of 3 files"
)

args = parser.parse_args()

# opening output files
if args.output_stdout == False:
    args.output_files = [args.output_prefix + '.' + ext for ext in args.exts]
    fos = [open(fo, 'w') for fo in args.output_files]

tokenizer = None
if args.tokenize:
    assert args.source_language, "--source-language must be set if --tokenize flag is used"
    tokenizer = MosesTokenizer(args.source_language)


def cleanup(s, args=None):
    s = re.sub('<[^>]*>', ' ', s)
    s = s.strip()
    if args:
        if args.remove_begin_hyphens == True and s.startswith('-'):
            s = s[1:]
    s = re.sub(' +', ' ', s.strip())
    return s


for line in tqdm.tqdm(args.input_file):

    line = line.rstrip()
Example 24
 def test_japanese_tokenization(self):
     tokenizer = MosesTokenizer(lang="ja")
     text = u"電話でんわの邪魔じゃまをしないでください"
     assert tokenizer.tokenize(text) == [text]
Example 25
 def test_korean_tokenization(self):
     tokenizer = MosesTokenizer(lang="ko")
     detokenizer = MosesDetokenizer(lang="ko")
     text = u"세계 에서 가장 강력한."
     assert tokenizer.tokenize(text) == [u"세계", u"에서", u"가장", u"강력한", u"."]
     assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
Example 26
 def test_chinese_tokenization(self):
     tokenizer = MosesTokenizer(lang="zh")
     text = u"记者 应谦 美国"
     assert tokenizer.tokenize(text) == [u"记者", u"应谦", u"美国"]
Example 27
    def test_french_apostrophes(self):
        tokenizer = MosesTokenizer(lang="fr")
        detokenizer = MosesDetokenizer(lang="fr")

        text = u"L'amitié nous a fait forts d'esprit"
        assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
Example 28
def preprocess(src_file, mt_file, output_dir, tokenize_lang=None):
    """
        pre-process input file before post-editing
        split at <br> and remove <i> tags and music symbols.
        store everything in a codes file in output_dir

        Args:
            src_file: src_file of the translation to be preprocessed
            mt_file: output of the mt system file to be preprocessed
            output_dir: output directory to output the preprocessed files and codes file

    """

    punct_normalizer = MosesPunctNormalizer()

    # set tokenizer
    tokenizer = None
    if tokenize_lang:
        tokenizer = MosesTokenizer(lang=tokenize_lang)

    code_file = output_dir+'/codes.'+os.path.basename(mt_file)
    src_out_file = output_dir+'/'+os.path.basename(src_file)+'.pre'
    mt_out_file = output_dir+'/'+os.path.basename(mt_file)+'.pre'
    with open(src_out_file,'w') as fosrc, open(mt_out_file,'w') as fomt, open(code_file,'w') as fcodes, open(src_file) as fsrc, open(mt_file) as fmt:
        idx=0
        for src,mt in zip(fsrc,fmt):
            src, mt = src.strip(), mt.strip()
            

            idx+=1
            
            # standardize br tags
            src = re.sub(r'<\s*br\s*/*>', '<br>', src, flags=re.IGNORECASE)
            mt = re.sub(r'<\s*br\s*/*>', '<br>', mt, flags=re.IGNORECASE)


            # if number of <br> is same, split and save it as multiple lines
            src_split = re.split(r'\s*<br>\s*',src)
            mt_split = re.split(r'\s*<br>\s*',mt)

            # if the src, mt, do not have the same number of <br>, then do not split it
            if not (len(src_split) == len(mt_split)):
                src_split = [src]
                mt_split = [mt]
                


            for src_part, mt_part in zip(src_split, mt_split):
                code = "{}\t".format(idx)

                # check if they start with the hyphen
                has_hyphen = False
                if src_part.startswith('-'):
                    has_hyphen = True
                    src_part = src_part[1:].lstrip()

                if mt_part.startswith('-'):
                    has_hyphen = True
                    mt_part = mt_part[1:].lstrip()

                # check if they start with the music symbol
                music_syms = ('♫','♬','♪')
                has_music = False
                if re.search(r'\s*[{}]\s*'.format(''.join(music_syms)), src_part):
                    has_music = True
                    src_part = re.sub(r'\s*[{}]\s*'.format(''.join(music_syms)), '', src_part)

                #if mt_part.startswith(music_syms) or mt_part.endswith(music_syms):
                if re.search(r'\s*[{}]\s*'.format(''.join(music_syms)), mt_part):                
                    has_music = True
                    mt_part = re.sub(r'\s*[{}]\s*'.format(''.join(music_syms)), '', mt_part)

                # check if it has enclosing italics tags. otherwise leave it as it is
                itag = '<i>'
                eitag = '</i>'
                has_itag = False
                if src_part.startswith(itag) or src_part.endswith(eitag):
                    has_itag = True

                if mt_part.startswith(itag) or mt_part.endswith(eitag):
                    has_itag = True


                #if re.match(r'^<i>[^<]*</i>$', src_part):
                if has_hyphen == True:
                    code += 'HYPHENBEGIN\t'
                if has_music == True:
                    code += 'MUSIC\t'
                if has_itag == True:
                    code += 'ITALICTAGS\t'

                src_part = punct_normalizer.normalize(cleanup(src_part))
                mt_part = punct_normalizer.normalize(cleanup(mt_part))

                if tokenizer:
                    src_part = " ".join(tokenizer.tokenize(src_part, escape=False))
                    mt_part = " ".join(tokenizer.tokenize(mt_part, escape=False))

                fosrc.write(src_part.strip()+'\n')
                fomt.write(mt_part.strip()+'\n')
                fcodes.write("{}\n".format(code))
Example 29
    def test_moses_tokenize(self):
        moses = MosesTokenizer()

        # Tokenize a sentence.
        text = (
            u"This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf"
        )
        expected_tokens = u"This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf"
        tokenized_text = moses.tokenize(text, return_str=True)
        assert tokenized_text == expected_tokens

        # The nonbreaking prefixes should tokenize the final fullstop.
        assert moses.tokenize("abc def.") == [u"abc", u"def", u"."]

        # The nonbreaking prefixes should handle the case where a numeric-only prefix is the last token.
        # In the example below, "pp" is the last element, and there is no digit after it.
        assert moses.tokenize("2016, pp.") == [u"2016", u",", u"pp", u"."]

        # Test escape_xml
        text = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
        expected_tokens_with_xmlescape = [
            "This",
            "ain",
            "&apos;t",
            "funny",
            ".",
            "It",
            "&apos;s",
            "actually",
            "hillarious",
            ",",
            "yet",
            "double",
            "Ls",
            ".",
            "&#124;",
            "&#91;",
            "&#93;",
            "&lt;",
            "&gt;",
            "&#91;",
            "&#93;",
            "&amp;",
            "You",
            "&apos;re",
            "gonna",
            "shake",
            "it",
            "off",
            "?",
            "Don",
            "&apos;t",
            "?",
        ]
        expected_tokens_wo_xmlescape = [
            "This",
            "ain",
            "'t",
            "funny",
            ".",
            "It",
            "'s",
            "actually",
            "hillarious",
            ",",
            "yet",
            "double",
            "Ls",
            ".",
            "|",
            "[",
            "]",
            "<",
            ">",
            "[",
            "]",
            "&",
            "You",
            "'re",
            "gonna",
            "shake",
            "it",
            "off",
            "?",
            "Don",
            "'t",
            "?",
        ]
        assert moses.tokenize(text, escape=True) == expected_tokens_with_xmlescape
        assert moses.tokenize(text, escape=False) == expected_tokens_wo_xmlescape

        # Test to check https://github.com/alvations/sacremoses/issues/19
        text = "this 'is' the thing"
        expected_tokens = ["this", "&apos;", "is", "&apos;", "the", "thing"]
        assert moses.tokenize(text, escape=True) == expected_tokens
Example 30
 def test_trailing_dot_apostrophe(self):
     moses = MosesTokenizer()
     text = "'Hello.'"
     expected_tokens = "&apos;Hello . &apos;".split()
     self.assertEqual(moses.tokenize(text), expected_tokens)