def test_clean_tokenize_corpus_all_not_str(self):
    """All entries in the corpus are non-string values.

    Non-string inputs (int, list) cannot be tokenized, so the
    cleaned corpus must be empty.
    """
    # NOTE: original docstring said "Both of texts is none" — that was a
    # copy-paste error; these entries are non-strings, not None.
    texts = [
        123,
        [1, 2, 3]
    ]
    res_texts = clean_tokenize_corpus(texts)
    exp_res = []
    self.assertEqual(res_texts, exp_res)
def test_clean_tokenize_corpus_all_none(self):
    """A corpus made up entirely of None entries yields an empty result."""
    corpus = [None, None]
    # None entries are skipped, so nothing survives cleaning/tokenization.
    self.assertEqual(clean_tokenize_corpus(corpus), [])
def test_clean_tokenize_corpus_one_not_str(self):
    """One entry is a valid string, the other is not a string.

    The string entry is cleaned and tokenized; the non-string entry
    is dropped from the result.
    """
    # NOTE: original docstring said "Both of texts is none" — that was a
    # copy-paste error; only one entry here is invalid, and it is a list,
    # not None.
    texts = [
        'This is an example of test text. It contains two sentences.',
        [1, 2, 3]
    ]
    res_texts = clean_tokenize_corpus(texts)
    exp_res = [
        ['this', 'is', 'an', 'example', 'of', 'test', 'text',
         'it', 'contains', 'two', 'sentences'],
    ]
    self.assertEqual(res_texts, exp_res)
def test_clean_tokenize_corpus_one_none(self):
    """One of texts is none"""
    # The None entry is dropped; only the German text is tokenized,
    # with the <br /> markup stripped out along the way.
    corpus = [
        None,
        'Das ist ein Testtext.<br /><br />Es ist auf deutsch geschrieben.',
    ]
    expected = [
        ['das', 'ist', 'ein', 'testtext',
         'es', 'ist', 'auf', 'deutsch', 'geschrieben'],
    ]
    self.assertEqual(clean_tokenize_corpus(corpus), expected)
def test_clean_tokenize_corpus_line_breaks(self):
    """Text with linebreaks"""
    # <br /> markup must not leak into the token stream for either text.
    corpus = [
        'This is an example of test text!<br /><br />It contains two sentences.',
        'Das ist ein Testtext.<br /><br />Es ist auf deutsch geschrieben.',
    ]
    expected = [
        ['this', 'is', 'an', 'example', 'of', 'test', 'text',
         'it', 'contains', 'two', 'sentences'],
        ['das', 'ist', 'ein', 'testtext',
         'es', 'ist', 'auf', 'deutsch', 'geschrieben'],
    ]
    self.assertEqual(clean_tokenize_corpus(corpus), expected)
def test_clean_tokenize_corpus_dirty(self):
    """Text is dirty"""
    # Stray symbols embedded inside and between words should be removed,
    # leaving clean lowercase tokens.
    corpus = [
        'This* is an@ example>< of test - text! It cont&*ains two: sen#!tences.',
        'Das is()t ein Test^text. Es ist auf deu-tsch geschr=ieben.',
    ]
    expected = [
        ['this', 'is', 'an', 'example', 'of', 'test', 'text',
         'it', 'contains', 'two', 'sentences'],
        ['das', 'ist', 'ein', 'testtext',
         'es', 'ist', 'auf', 'deutsch', 'geschrieben'],
    ]
    self.assertEqual(clean_tokenize_corpus(corpus), expected)
def test_clean_tokenize_corpus_punctuation(self):
    """Text contains punctuation marks"""
    # Commas, periods, colons and dashes are stripped; words stay intact.
    corpus = [
        'This, is an example of test - text. It contains two: sentences.',
        'Das ist ein Testtext. Es ist auf deutsch geschrieben.',
    ]
    expected = [
        ['this', 'is', 'an', 'example', 'of', 'test', 'text',
         'it', 'contains', 'two', 'sentences'],
        ['das', 'ist', 'ein', 'testtext',
         'es', 'ist', 'auf', 'deutsch', 'geschrieben'],
    ]
    self.assertEqual(clean_tokenize_corpus(corpus), expected)