def test_preprocess_corpus_no_lower_no_punc_with_special(self):
    """Case is kept, punctuation is stripped, special characters are kept.

    With punctuation removed, every key phrase found in the text is
    expected to come back joined by underscores.
    """
    key_phrases = ['test', 'test string', 'test case', 'the test string', 'join entity names']
    raw_text = 'á THIS, is a test string, where we test ! Whether join entity names is working properly on the test string?!'
    actual = read_corpus_util.preprocess_corpus(
        raw_text,
        trie_util.Trie(key_phrases),
        to_lower=False,
        no_punctuations=True,
        no_special_char=False)
    self.assertEqual(
        actual,
        'á THIS is a test_string where we test Whether join_entity_names is working properly on the_test_string')
    def test_preprocess_corpus_no_lower_with_punc(self):
        """Case and punctuation are kept, special characters are removed."""
        key_phrases = ['test', 'test string', 'test case', 'the test string', 'join entity names']
        raw_text = 'á THIS, is a test string, where we test ! Whether join entity names is working properly on the test string?!'
        actual = read_corpus_util.preprocess_corpus(
            raw_text,
            trie_util.Trie(key_phrases),
            to_lower=False,
            no_punctuations=False,
            no_special_char=True)
        # Notice that when there is punctuation, we cannot connect
        # 'test string,' to 'test_string,': only 'join entity names', which
        # has no punctuation attached, is joined in the expected text.
        self.assertEqual(
            actual,
            'THIS, is a test string, where we test ! Whether join_entity_names is working properly on the test string?!')

        def test_preprocess_corpus_no_lower_with_punc_with_special(self):
            """Case, punctuation and special characters are all kept."""
            # NOTE(review): this def sits one level deeper than the test
            # before it; at this indentation it is a nested function and
            # would not be collected as a test -- confirm the indentation.
            key_phrases = ['test', 'test string', 'test case', 'the test string', 'join entity names']
            raw_text = 'á THIS, is a test string, where we test ! Whether join entity names is working properly on the test string?!'
            actual = read_corpus_util.preprocess_corpus(
                raw_text,
                trie_util.Trie(key_phrases),
                to_lower=False,
                no_punctuations=False,
                no_special_char=False)
            self.assertEqual(
                actual,
                'á THIS, is a test string, where we test ! Whether join_entity_names is working properly on the test string?!')
Example #3
0
def clean_text_file(save_dir,
                    text_dir,
                    to_lower=False,
                    no_punctuations=False,
                    no_special_char=True):
    """Preprocess a text file line by line and write the result to a file.

    Key phrases are loaded from ``read_corpus_util.kCorpusDirectory``, the
    spaces inside each phrase are replaced with underscores, and the
    resulting phrases are used to build the trie that is handed to
    ``read_corpus_util.preprocess_corpus`` for every input line.

    Args:
        save_dir: Path of the output file. Despite the name it is opened as
            a file in write mode, so existing content is overwritten.
        text_dir: Path of the input text file; it is read one line at a time.
        to_lower: Forwarded to both ``read_key_phrases_dict`` and
            ``preprocess_corpus``.
        no_punctuations: Forwarded to ``preprocess_corpus``.
        no_special_char: Forwarded to ``preprocess_corpus``.
    """
    # Only the phrase set is needed here; the accompanying dict is ignored.
    _, key_phrases_set = read_corpus_util.read_key_phrases_dict(
        read_corpus_util.kCorpusDirectory, to_lower=to_lower)
    # Build a real list: on Python 3 ``map`` returns a one-shot iterator,
    # which would be exhausted after the first pass over the phrases.
    # NOTE(review): phrases are underscore-joined before being inserted into
    # the trie -- confirm this is the form preprocess_corpus expects.
    key_phrases_list = [phrase.replace(' ', '_') for phrase in key_phrases_set]
    trie = trie_util.Trie(key_phrases_list)
    with open(text_dir, 'r') as fin, open(save_dir, 'w') as fout:
        # Can't use csv_reader because line is too long.
        for line in fin:
            cleaned = read_corpus_util.preprocess_corpus(
                line,
                trie,
                to_lower=to_lower,
                no_punctuations=no_punctuations,
                no_special_char=no_special_char)
            fout.write(cleaned)
            # A newline is always appended after each processed line.
            fout.write('\n')