def test_get_interesting_words(self):
    """Check the interesting words reported for the extractor fixture.

    Tokens are loaded from ``tests_files/test_extractor`` and the result of
    ``get_interesting_words(number_following=2)`` is compared, order
    included, against the expected word list.
    """
    fixture_dir = 'tests_files/test_extractor'
    extractor = DocumentTextExtractor(fixture_dir, 5, 10)
    extractor._extract_sentence_and_work_tokens(fixture_dir)

    interesting = extractor.get_interesting_words(number_following=2)

    expected = ['take', 'nothing', 'time', 'whenever', 'get']
    self.assertEqual(interesting, expected)
def test_splits_sentences_by_multiple_docs(self):
    """Check sentence tokens produced from a directory with several files.

    After extracting from ``tests_files/test_directory`` the entries at
    index 0 and index 6 of ``_sentence_tokens`` must be the expected
    ``(file name, sentence)`` tuples.
    """
    extractor = DocumentTextExtractor('test_directory', 5, 10)
    extractor._extract_sentence_and_work_tokens('tests_files/test_directory')
    sentences = extractor._sentence_tokens

    # The first entry is expected to come from the second test document.
    first_expected = ('test_text_2.txt', 'quite simply the second document.')
    self.assertEqual(sentences[0], first_expected)

    # The entry at index 6 is expected to come from the first test document.
    seventh_expected = ('test_text_1.txt', 'way-too-nice.')
    self.assertEqual(sentences[6], seventh_expected)
def test_single_document_get_text(self):
    """Check the raw string read from a single fixture document.

    ``_get_string_from_document`` is called on the class itself (no
    instance) and its return value must equal the expected text exactly.
    """
    document_path = 'tests_files/test_directory/test_text_2.txt'
    expected_text = "quite simply the second document. and that, for now, is all you're getting - YES!"

    actual_text = DocumentTextExtractor._get_string_from_document(
        document_path)

    self.assertEqual(actual_text, expected_text)
def test_create_word_type_following_dict(self):
    """Check the word -> following-word-type mapping for a simple input.

    Each input item is a pair of ``(word, tag)`` tuples; the expected
    result maps the first word of every pair to a set containing the tag
    of the second.
    """
    tagged_pairs = [
        (('just', 'ADV'), ('three', 'NUM')),
        (('three', 'NUM'), ('words', 'NOUN')),
    ]
    expected = {'just': {'NUM'}, 'three': {'NOUN'}}

    extractor = DocumentTextExtractor('test_directory', 5, 10)
    actual = extractor._create_word_type_following_dict(tagged_pairs)

    self.assertEqual(actual, expected)
def test_find_words_with_two_different_following_word_types(self):
    """Check that only words with two following word types are returned.

    Of the three words in the mapping, only ``'just'`` has two distinct
    following types, so it is the single expected result when ``2`` is
    passed to ``_find_number_follow_types``.
    """
    follow_types_by_word = {
        'just': {'NUM', 'VERB'},
        'three': {'NOUN'},
        'words': {'ADV'},
    }

    extractor = DocumentTextExtractor('test_directory', 5, 10)
    matches = extractor._find_number_follow_types(follow_types_by_word, 2)

    self.assertEqual(matches, ['just'])
def test_convert_to_csv_format_multi_word(self):
    """Check CSV row conversion when the context dict holds several words.

    The expected rows list each word next to its first context only;
    further contexts of the same word carry an empty string in the word
    column.
    """
    contexts_by_word = {
        'us': ['text_1: let us go then', 'text_1: let us go'],
        'hotter': ['text_1: i get hotter.', 'text_1: Did you say hotter?'],
    }

    rows = DocumentTextExtractor._convert_to_csv_form(contexts_by_word)

    expected_rows = [
        ['us', 'text_1: let us go then'],
        ['', 'text_1: let us go'],
        ['hotter', 'text_1: i get hotter.'],
        ['', 'text_1: Did you say hotter?'],
    ]
    self.assertEqual(rows, expected_rows)
def test_create_word_type_following_dict_multiple_following(self):
    """Check the mapping when one word is followed by several word types.

    ``'just'`` appears twice as the leading word, followed once by a
    ``NUM`` and once by a ``VERB``; both tags must end up in its set.
    """
    tagged_pairs = [
        (('just', 'ADV'), ('three', 'NUM')),
        (('three', 'NUM'), ('words', 'NOUN')),
        (('words', 'NOUN'), ('just', 'ADV')),
        (('just', 'ADV'), ('say', 'VERB')),
    ]
    expected = {
        'just': {'NUM', 'VERB'},
        'three': {'NOUN'},
        'words': {'ADV'},
    }

    extractor = DocumentTextExtractor('test_directory', 5, 10)
    actual = extractor._create_word_type_following_dict(tagged_pairs)

    self.assertEqual(actual, expected)
def test_convert_to_csv_format(self):
    """Check CSV row conversion for a single word with two contexts.

    The word is expected in the first row only; the second row carries an
    empty string in the word column.
    """
    contexts_by_word = {
        'us': ['text_1: let us go then', 'text_1: let us go'],
    }

    rows = DocumentTextExtractor._convert_to_csv_form(contexts_by_word)

    expected_rows = [
        ['us', 'text_1: let us go then'],
        ['', 'text_1: let us go'],
    ]
    self.assertEqual(rows, expected_rows)
def test_gets_num_of_sentences_multiple_docs(self):
    """Check the total number of sentence tokens across several documents.

    Extracting from ``tests_files/test_directory`` is expected to leave
    seven entries in ``_sentence_tokens``.
    """
    extractor = DocumentTextExtractor('test_directory', 5, 10)
    extractor._extract_sentence_and_work_tokens('tests_files/test_directory')

    sentence_count = len(extractor._sentence_tokens)

    self.assertEqual(sentence_count, 7)
def test_splits_sentences_by_document(self):
    """Check sentence tokens produced from a directory with one file.

    Extracting from ``tests_files/test_directory_single_file`` is expected
    to yield exactly one ``(file name, sentence)`` tuple.
    """
    extractor = DocumentTextExtractor('test_directory', 5, 10)
    extractor._extract_sentence_and_work_tokens(
        'tests_files/test_directory_single_file')

    expected = [('single_file.txt', 'one, two, three and four')]
    self.assertEqual(extractor._sentence_tokens, expected)
def test_document_get_word_tokens(self):
    """Check the number of word tokens extracted from the one-file fixture.

    Extracting from ``tests_files/test_directory_single_file`` is expected
    to leave seven entries in ``_word_tokens``.
    """
    extractor = DocumentTextExtractor('test_directory', 5, 10)
    extractor._extract_sentence_and_work_tokens(
        'tests_files/test_directory_single_file')

    word_token_count = len(extractor._word_tokens)

    self.assertEqual(word_token_count, 7)