def test_preprocess_corpus_no_lower_no_punc_with_special(self):
    """Check preprocessing with punctuation removal on, lowercasing and special-char removal off.

    The expected string keeps the original casing and the leading 'á', has no
    punctuation, and has the key phrases joined by underscores
    ('test_string', 'join_entity_names', 'the_test_string').
    """
    key_phrases = ['test', 'test string', 'test case', 'the test string', 'join entity names']
    phrase_trie = trie_util.Trie(key_phrases)
    raw_text = 'á THIS, is a test string, where we test ! Whether join entity names is working properly on the test string?!'
    expected = 'á THIS is a test_string where we test Whether join_entity_names is working properly on the_test_string'

    actual = read_corpus_util.preprocess_corpus(
        raw_text,
        phrase_trie,
        to_lower=False,
        no_punctuations=True,
        no_special_char=False)

    self.assertEqual(actual, expected)
def test_preprocess_corpus_no_lower_with_punc(self):
    """Check preprocessing with special-char removal on, lowercasing and punctuation removal off.

    The expected string drops the leading 'á', keeps casing and punctuation,
    and only joins 'join entity names' with underscores.
    """
    key_phrases = ['test', 'test string', 'test case', 'the test string', 'join entity names']
    phrase_trie = trie_util.Trie(key_phrases)
    raw_text = 'á THIS, is a test string, where we test ! Whether join entity names is working properly on the test string?!'
    # Notice that when there is punctuation, we cannot connect 'test string,' to 'test_string,'.
    expected = 'THIS, is a test string, where we test ! Whether join_entity_names is working properly on the test string?!'

    actual = read_corpus_util.preprocess_corpus(
        raw_text,
        phrase_trie,
        to_lower=False,
        no_punctuations=False,
        no_special_char=True)

    self.assertEqual(actual, expected)

def test_preprocess_corpus_no_lower_with_punc_with_special(self):
    """Check preprocessing with lowercasing, punctuation removal and special-char removal all off.

    The expected string is the input with only 'join entity names' joined by
    underscores; the leading 'á', casing and punctuation are all kept.
    """
    key_phrases = ['test', 'test string', 'test case', 'the test string', 'join entity names']
    phrase_trie = trie_util.Trie(key_phrases)
    raw_text = 'á THIS, is a test string, where we test ! Whether join entity names is working properly on the test string?!'
    expected = 'á THIS, is a test string, where we test ! Whether join_entity_names is working properly on the test string?!'

    actual = read_corpus_util.preprocess_corpus(
        raw_text,
        phrase_trie,
        to_lower=False,
        no_punctuations=False,
        no_special_char=False)

    self.assertEqual(actual, expected)
def clean_text_file(save_dir, text_dir, to_lower=False, no_punctuations=False, no_special_char=True):
    """Preprocess a text file line by line and write the result to another file.

    Key phrases are loaded with `read_corpus_util.read_key_phrases_dict` from
    `read_corpus_util.kCorpusDirectory`, spaces inside each phrase are replaced
    with underscores, and the phrases are put into a `trie_util.Trie`. Every
    line of the input file is then passed through
    `read_corpus_util.preprocess_corpus` and written out followed by a newline.

    Args:
        save_dir: Path of the output file (opened for writing, so an existing
            file is overwritten). Despite the name it is used as a file path.
        text_dir: Path of the input text file. Despite the name it is used as
            a file path.
        to_lower: Forwarded to `read_key_phrases_dict` and `preprocess_corpus`.
        no_punctuations: Forwarded to `preprocess_corpus`.
        no_special_char: Forwarded to `preprocess_corpus`.
    """
    # Only the phrase set is needed here; the dict half of the result is unused.
    _key_phrases_dict, key_phrases_set = read_corpus_util.read_key_phrases_dict(
        read_corpus_util.kCorpusDirectory, to_lower=to_lower)

    # Build a real list rather than `map(...)`: on Python 3 `map` returns a
    # one-shot iterator, which breaks if the Trie iterates its input more than
    # once or asks for its length.
    # NOTE(review): the unit tests build the Trie from space-separated phrases
    # ('test string'), whereas here the phrases are underscore-joined before
    # insertion -- confirm this is what preprocess_corpus expects.
    key_phrases_list = [phrase.replace(' ', '_') for phrase in key_phrases_set]
    trie = trie_util.Trie(key_phrases_list)

    with open(text_dir, 'r') as fin, open(save_dir, 'w') as fout:
        # Can't use csv_reader because line is too long.
        for line in fin:
            # NOTE(review): `line` still carries its trailing newline; whether
            # preprocess_corpus strips it is not visible here -- verify that
            # the extra '\n' below does not produce blank lines.
            fout.write(
                read_corpus_util.preprocess_corpus(line, trie, to_lower, no_punctuations, no_special_char))
            fout.write('\n')