Beispiel #1
0
 def test_moses_detruecase_str(self):
     """Detruecasing a plain sentence capitalizes only its first token.

     Covers both output forms: the default token list and the joined
     string obtained with ``return_str=True``.
     """
     sentence = 'the adventures of Sherlock Holmes'
     detruecaser = MosesDetruecaser()

     # Default call: the result is compared against a list of tokens.
     tokens = detruecaser.detruecase(sentence)
     assert tokens == ['The', 'adventures', 'of', 'Sherlock', 'Holmes']

     # return_str=True: the result is compared against a single string.
     joined = detruecaser.detruecase(sentence, return_str=True)
     assert joined == 'The adventures of Sherlock Holmes'
Beispiel #2
0
 def test_moses_detruecase_allcaps(self):
     """A sentence starting with an all-caps token comes back unchanged.

     The expected tokens and string are identical to the input text, in
     both the list form and the ``return_str=True`` form.
     """
     sentence = "MLB Baseball standings"
     detruecaser = MosesDetruecaser()

     # Token-list form.
     tokens = detruecaser.detruecase(sentence)
     assert tokens == ["MLB", "Baseball", "standings"]

     # Joined-string form.
     joined = detruecaser.detruecase(sentence, return_str=True)
     assert joined == "MLB Baseball standings"
Beispiel #3
0
 def test_moses_detruecase_headline(self):
     """With ``is_headline=True`` the sentence is cased like a title.

     In the expected output "adventures" becomes "Adventures" while
     "of" stays lowercase; both the list and string forms are checked.
     """
     sentence = "the adventures of Sherlock Holmes"
     detruecaser = MosesDetruecaser()

     # Token-list form.
     tokens = detruecaser.detruecase(sentence, is_headline=True)
     assert tokens == ["The", "Adventures", "of", "Sherlock", "Holmes"]

     # Joined-string form.
     joined = detruecaser.detruecase(
         sentence, is_headline=True, return_str=True
     )
     assert joined == "The Adventures of Sherlock Holmes"
    def test_moses_detruecase_file(self):
        """Check ``detruecase_file`` against a small fixture written to disk.

        The fixture mixes ordinary sentences with an ``<hl> ... </hl>``
        span. In the expected output the lines inside the span are
        title-cased ("ABC" becomes "Abc", "of" stays lowercase), while the
        lines outside it only get their first word capitalized.

        The fixture file is created in the current working directory and is
        removed again when the test finishes, whether it passes or fails.
        """
        import os  # local import: only needed for fixture cleanup

        moses = MosesDetruecaser()
        text = text_type('the adventures of Sherlock Holmes\n'
                         '<hl> something ABC has gone wrong Xyz , \n'
                         'second line of HEADERS that are very Importante .\n'
                         '</hl>\n'
                         'then the next sentence with Caps here and There .\n')

        expected = [
            'The adventures of Sherlock Holmes',
            '<hl> Something Abc Has Gone Wrong Xyz ,',
            'Second Line of Headers That Are Very Importante .', '</hl>',
            'Then the next sentence with Caps here and There .'
        ]

        fixture_path = 'detruecase-test.txt'
        try:
            # The text is already in memory, so write it directly.
            with io.open(fixture_path, 'w', encoding='utf8') as fout:
                fout.write(text)

            # list() drains the result so every line is compared.
            assert list(moses.detruecase_file(fixture_path)) == expected
        finally:
            # Do not leave the fixture behind in the working directory.
            if os.path.exists(fixture_path):
                os.remove(fixture_path)
Beispiel #5
0
def detruecase_file(processes, is_headline, encoding):
    # Detruecase every line read from stdin and print the results to stdout,
    # one result per line.
    #
    #   processes   -- 1 selects the plain sequential loop; any other value
    #                  is handed to parallelize_preprocess.
    #   is_headline -- forwarded unchanged to MosesDetruecaser.detruecase.
    #   encoding    -- text encoding used for both the stdin and stdout
    #                  streams.
    #
    # NOTE(review): documented with comments rather than a docstring in case
    # this function is registered as a click command, where a docstring
    # would become user-visible help text -- confirm against the decorators.
    detruecase_line = partial(
        MosesDetruecaser().detruecase,
        is_headline=is_headline,
        return_str=True,
    )
    with click.get_text_stream("stdin", encoding=encoding) as fin, \
            click.get_text_stream("stdout", encoding=encoding) as fout:
        lines = fin.readlines()
        if processes == 1:
            # If it's single process, joblib parallelization is slower,
            # so just process line by line normally. The generator keeps
            # the work lazy: each line is detruecased right before it is
            # printed.
            results = (detruecase_line(line) for line in tqdm(lines))
        else:
            results = parallelize_preprocess(
                detruecase_line, lines, processes, progress_bar=True
            )
        for result in results:
            print(result, end="\n", file=fout)
Beispiel #6
0
def detruecase_file(iterator, language, processes, quiet, is_headline):
    # Apply Moses detruecasing to the items of `iterator` via parallel_or_not
    # and return whatever parallel_or_not returns.
    #
    #   iterator    -- passed through to parallel_or_not as its input.
    #   language    -- accepted but not used anywhere in this function.
    #   processes   -- passed through to parallel_or_not.
    #   quiet       -- passed through to parallel_or_not.
    #   is_headline -- forwarded unchanged to MosesDetruecaser.detruecase.
    #
    # Each call to the bound callable returns a string (return_str=True).
    detruecase_line = partial(
        MosesDetruecaser().detruecase,
        is_headline=is_headline,
        return_str=True,
    )
    return parallel_or_not(iterator, detruecase_line, processes, quiet)