Beispiel #1
0
 def test_moses_normalize_documents(self):
     """Check default normalization on sample lines taken from big.txt.

     Each case pairs a raw line with the text ``normalize`` is expected
     to return for it: runs of spaces collapsed to one, and a space
     inserted before an opening parenthesis that follows ``--`` and
     between ``)`` and ``]``.
     """
     normalizer = MosesPunctNormalizer()
     # Examples from normalizing big.txt, as (raw, wanted) pairs.
     cases = [
         (
             "The United States in 1805 (color map)                 _Facing_     193",
             "The United States in 1805 (color map) _Facing_ 193",
         ),
         (
             "=Formation of the Constitution.=--(1) The plans before the convention,",
             "=Formation of the Constitution.=-- (1) The plans before the convention,",
         ),
         (
             "directions--(1) The infective element must be eliminated. When the ulcer",
             "directions-- (1) The infective element must be eliminated. When the ulcer",
         ),
         (
             "College of Surgeons, Edinburgh.)]",
             "College of Surgeons, Edinburgh.) ]",
         ),
     ]
     for raw, wanted in cases:
         assert normalizer.normalize(raw) == wanted
Beispiel #2
0
    def test_moses_normalize_numbers(self):
        """Check the effect of the ``norm_numbers`` switch.

        With ``norm_numbers=True`` a no-break space between two digit
        groups is expected to come back as ``.``; with
        ``norm_numbers=False`` the second sample is expected to be
        returned unchanged.
        """
        # See https://stackoverflow.com/a/55233871/610569
        with_number_norm = MosesPunctNormalizer("en", norm_numbers=True)
        without_number_norm = MosesPunctNormalizer("en", norm_numbers=False)

        nbsp_separated = u"12{}123".format(u"\u00A0")
        dotted = u"12.123"
        assert with_number_norm.normalize(nbsp_separated) == dotted

        # Input and expected output are the same string in this case.
        untouched = u"12 123"
        assert without_number_norm.normalize(untouched) == untouched
Beispiel #3
0
    def test_moses_normalize_quote_comma(self):
        """Check the effect of the ``norm_quote_commas`` switch.

        With the switch on, the trailing period is expected to move
        inside the closing double quote; with it off, the sentence is
        expected to come back unchanged.
        """
        sentence = 'THIS EBOOK IS OTHERWISE PROVIDED TO YOU "AS-IS".'
        quotes_on = MosesPunctNormalizer("en", norm_quote_commas=True)
        quotes_off = MosesPunctNormalizer("en", norm_quote_commas=False)

        period_inside = 'THIS EBOOK IS OTHERWISE PROVIDED TO YOU "AS-IS."'
        assert quotes_on.normalize(sentence) == period_inside

        period_outside = 'THIS EBOOK IS OTHERWISE PROVIDED TO YOU "AS-IS".'
        assert quotes_off.normalize(sentence) == period_outside
Beispiel #4
0
def normalize_file(
    iterator,
    language,
    processes,
    quiet,
    normalize_quote_commas,
    normalize_numbers,
    replace_unicode_puncts,
    remove_control_chars,
):
    """Run Moses punctuation normalization over ``iterator``.

    A ``MosesPunctNormalizer`` is configured for ``language`` from the
    four boolean-style options, and its ``normalize`` method is handed to
    ``parallel_or_not`` together with ``iterator``, ``processes`` and
    ``quiet``.

    Args:
        iterator: Items passed through to ``parallel_or_not``
            (presumably lines of text; confirm against the caller).
        language: Language argument given to ``MosesPunctNormalizer``.
        processes: Forwarded unchanged to ``parallel_or_not``.
        quiet: Forwarded unchanged to ``parallel_or_not``.
        normalize_quote_commas: Passed as ``norm_quote_commas``.
        normalize_numbers: Passed as ``norm_numbers``.
        replace_unicode_puncts: Passed as ``pre_replace_unicode_punct``.
        remove_control_chars: Passed as ``post_remove_control_chars``.

    Returns:
        Whatever ``parallel_or_not`` returns.
    """
    # Map this function's option names onto the normalizer's keyword names.
    normalizer_options = {
        "norm_quote_commas": normalize_quote_commas,
        "norm_numbers": normalize_numbers,
        "pre_replace_unicode_punct": replace_unicode_puncts,
        "post_remove_control_chars": remove_control_chars,
    }
    normalizer = MosesPunctNormalizer(language, **normalizer_options)
    # The callable stays wrapped in ``partial`` so that ``parallel_or_not``
    # receives the same kind of object as before.
    return parallel_or_not(
        iterator, partial(normalizer.normalize), processes, quiet
    )
Beispiel #5
0
 def test_normalization_pipeline(self):
     """Check ``normalize`` with both pre- and post-processing enabled.

     The normalizer is built with ``pre_replace_unicode_punct`` and
     ``post_remove_control_chars`` switched on. The sample mixes CJK
     brackets, repeated spaces and a doubled apostrophe.
     """
     normalizer = MosesPunctNormalizer(
         pre_replace_unicode_punct=True,
         post_remove_control_chars=True,
     )
     raw = u"0《123》      456%  '' 【789】"
     wanted = u'0"123" 456% " [789]'
     assert normalizer.normalize(raw) == wanted
Beispiel #6
0
 def test_replace_unicode_punct(self):
     """Check ``replace_unicode_punct`` on CJK angle and lenticular brackets.

     ``《…》`` is expected to become double quotes and ``【…】`` square
     brackets; the rest of the sample is expected to stay as it is.
     """
     normalizer = MosesPunctNormalizer()
     raw = u"0《123》 456% 【789】"
     wanted = u'0"123" 456% [789]'
     converted = normalizer.replace_unicode_punct(raw)
     assert converted == wanted
Beispiel #7
0
 def test_moses_noralize_single_apostrophe(self):
     """Check that a right single quotation mark becomes an ASCII apostrophe.

     Uses a normalizer constructed for ``"en"`` with otherwise default
     options.
     """
     normalizer = MosesPunctNormalizer("en")
     curly = u"yesterday ’s reception"
     straight = u"yesterday 's reception"
     assert normalizer.normalize(curly) == straight