def test_moses_detruecase_str(self):
    """Detruecase a plain sentence, as a token list and as a string.

    The expected output capitalizes the first word and leaves the
    remaining words exactly as they were given.
    """
    detruecaser = MosesDetruecaser()
    sentence = "the adventures of Sherlock Holmes"

    want_tokens = ["The", "adventures", "of", "Sherlock", "Holmes"]
    want_sentence = "The adventures of Sherlock Holmes"

    # Default call returns the tokens; return_str=True returns one string.
    got_tokens = detruecaser.detruecase(sentence)
    assert got_tokens == want_tokens
    got_sentence = detruecaser.detruecase(sentence, return_str=True)
    assert got_sentence == want_sentence
def test_moses_detruecase_allcaps(self):
    """Detruecase a sentence that starts with an all-caps token.

    The expected output is identical to the input: ``MLB`` stays fully
    upper-cased and the other words keep their original casing.
    """
    detruecaser = MosesDetruecaser()
    sentence = "MLB Baseball standings"

    want_tokens = ["MLB", "Baseball", "standings"]
    want_sentence = "MLB Baseball standings"

    # Default call returns the tokens; return_str=True returns one string.
    got_tokens = detruecaser.detruecase(sentence)
    assert got_tokens == want_tokens
    got_sentence = detruecaser.detruecase(sentence, return_str=True)
    assert got_sentence == want_sentence
def test_moses_detruecase_headline(self):
    """Detruecase a sentence with ``is_headline=True``.

    The expected output capitalizes ``the`` and ``adventures`` while
    ``of`` stays lower-cased, for both the token-list and string forms.
    """
    detruecaser = MosesDetruecaser()
    sentence = "the adventures of Sherlock Holmes"

    want_tokens = ["The", "Adventures", "of", "Sherlock", "Holmes"]
    want_sentence = "The Adventures of Sherlock Holmes"

    # Same headline call twice: once as tokens, once joined into a string.
    got_tokens = detruecaser.detruecase(sentence, is_headline=True)
    assert got_tokens == want_tokens
    got_sentence = detruecaser.detruecase(
        sentence, is_headline=True, return_str=True
    )
    assert got_sentence == want_sentence
def test_moses_detruecase_file(self):
    """Check that ``detruecase_file`` detruecases a file line by line.

    The fixture mixes ordinary sentences with a ``<hl>`` ... ``</hl>`` span.
    The expected output uses headline-style capitalization for the lines
    from ``<hl>`` up to ``</hl>`` (e.g. ``ABC`` -> ``Abc``, ``of`` stays
    lower-cased) and only capitalizes the first word of the other lines.

    The fixture is written to a uniquely named temporary file which is
    removed afterwards, so the test does not leave a file behind in the
    working directory and cannot collide with a concurrent run.
    """
    # Local imports keep this test self-contained.
    import os
    import tempfile

    moses = MosesDetruecaser()
    text = text_type(
        'the adventures of Sherlock Holmes\n'
        '<hl> something ABC has gone wrong Xyz , \n'
        'second line of HEADERS that are very Importante .\n'
        '</hl>\n'
        'then the next sentence with Caps here and There .\n'
    )
    expected = [
        'The adventures of Sherlock Holmes',
        '<hl> Something Abc Has Gone Wrong Xyz ,',
        'Second Line of Headers That Are Very Importante .',
        '</hl>',
        'Then the next sentence with Caps here and There .',
    ]

    # mkstemp returns an open descriptor; close it and reopen by name so
    # the file is written through io.open with an explicit encoding.
    fd, path = tempfile.mkstemp(prefix='detruecase-test-', suffix='.txt')
    os.close(fd)
    try:
        with io.open(path, 'w', encoding='utf8') as fout:
            fout.write(text)
        # Consume the result fully before the file is deleted.
        result = list(moses.detruecase_file(path))
    finally:
        os.remove(path)

    assert result == expected
def detruecase_file(processes, is_headline, encoding):
    # Detruecase every line read from stdin and print the results to stdout,
    # one output line per input line. Both streams use the given encoding.
    detruecaser = MosesDetruecaser()
    moses_detruecase = partial(
        detruecaser.detruecase, is_headline=is_headline, return_str=True
    )

    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            lines = fin.readlines()
            if processes == 1:
                # With a single process, joblib parallelization is slower,
                # so the lines are simply handled one by one.
                outputs = (moses_detruecase(line) for line in tqdm(lines))
            else:
                outputs = parallelize_preprocess(
                    moses_detruecase, lines, processes, progress_bar=True
                )
            for outline in outputs:
                print(outline, end="\n", file=fout)
def detruecase_file(iterator, language, processes, quiet, is_headline):
    # Bind the string-returning detruecaser (with the requested headline
    # mode) and hand it to parallel_or_not together with the input iterator.
    # `language` is accepted but not used in this body.
    detruecaser = MosesDetruecaser()
    detruecase_line = partial(
        detruecaser.detruecase,
        is_headline=is_headline,
        return_str=True,
    )
    return parallel_or_not(iterator, detruecase_line, processes, quiet)