 def test_get_interesting_words(self):
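     """
     Test that get_interesting_words returns the expected words from the
     test_extractor fixture with number_following=2.
     """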
     text_extractor = DocumentTextExtractor('tests_files/test_extractor', 5,
                                            10)
     text_extractor._extract_sentence_and_work_tokens(
         'tests_files/test_extractor')
     self.assertEqual(
         text_extractor.get_interesting_words(number_following=2),
         ['take', 'nothing', 'time', 'whenever', 'get'])
 def test_splits_sentences_by_multiple_docs(self):
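     """
     Test that sentence tokens from a directory of several documents are
     tagged with the file they were extracted from.
     """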
     text_extractor = DocumentTextExtractor('test_directory', 5, 10)
     text_extractor._extract_sentence_and_work_tokens(
         'tests_files/test_directory')
     self.assertEqual(
         text_extractor._sentence_tokens[0],
         ('test_text_2.txt', 'quite simply the second document.'))
     self.assertEqual(text_extractor._sentence_tokens[6],
                      ('test_text_1.txt', 'way-too-nice.'))
 def test_single_document_get_text(self):
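     """
     Test that _get_string_from_document returns the full text of a single
     document as one string.
     """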
     document_string = DocumentTextExtractor._get_string_from_document(
         'tests_files/test_directory/test_text_2.txt')
     self.assertEqual(
         document_string,
         "quite simply the second document.  and that, for now, is all you're getting - YES!"
     )
 def test_create_word_type_following_dict(self):
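     """
     Test that each word is mapped to the set of word types that directly
     follow it in the token pairs.
     """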
     tokens = [(('just', 'ADV'), ('three', 'NUM')),
               (('three', 'NUM'), ('words', 'NOUN'))]
     following_dict = {'just': {'NUM'}, 'three': {'NOUN'}}
     self.assertEqual(
         DocumentTextExtractor('test_directory', 5,
                               10)._create_word_type_following_dict(tokens),
         following_dict)
 def test_find_words_with_two_different_following_word_types(self):
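     """
     Test that only 'just', which is followed by two different word types,
     is returned when two follow types are requested.
     """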
     following_dict = {
         'just': {'NUM', 'VERB'},
         'three': {'NOUN'},
         'words': {'ADV'}
     }
     self.assertEqual(
         DocumentTextExtractor('test_directory', 5,
                               10)._find_number_follow_types(
                                   following_dict, 2), ['just'])
 def test_convert_to_csv_format_multi_word(self):
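     """
     Test that the first context row for each word carries the word itself
     and any further rows leave the first column empty.
     """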
     context_dict = {
         'us': ['text_1: let us go then', 'text_1: let us go'],
         'hotter': ['text_1: i get hotter.', 'text_1: Did you say hotter?'],
     }
     csv_format = DocumentTextExtractor._convert_to_csv_form(context_dict)
     self.assertEqual(
         csv_format,
         [['us', 'text_1: let us go then'], ['', 'text_1: let us go'],
          ['hotter', 'text_1: i get hotter.'],
          ['', 'text_1: Did you say hotter?']])
 def test_create_word_type_following_dict_multiple_following(self):
     """
     Test the case where a word is followed by more than one type of word.
     """
     tokens = [(('just', 'ADV'), ('three', 'NUM')),
               (('three', 'NUM'), ('words', 'NOUN')),
               (('words', 'NOUN'), ('just', 'ADV')),
               (('just', 'ADV'), ('say', 'VERB'))]
     following_dict = {
         'just': {'NUM', 'VERB'},
         'three': {'NOUN'},
         'words': {'ADV'}
     }
     self.assertEqual(
         DocumentTextExtractor('test_directory', 5,
                               10)._create_word_type_following_dict(tokens),
         following_dict)
 def test_convert_to_csv_format(self):
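     """
     Test that a word with several contexts becomes one row per context,
     with the word shown only on the first row.
     """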
     context_dict = {'us': ['text_1: let us go then', 'text_1: let us go']}
     csv_format = DocumentTextExtractor._convert_to_csv_form(context_dict)
     self.assertEqual(
         csv_format,
         [['us', 'text_1: let us go then'], ['', 'text_1: let us go']])
 def test_gets_num_of_sentences_multiple_docs(self):
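     """
     Test that sentences from every document in the directory are counted;
     the test fixture contains seven in total.
     """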
     text_extractor = DocumentTextExtractor('test_directory', 5, 10)
     text_extractor._extract_sentence_and_work_tokens(
         'tests_files/test_directory')
     self.assertEqual(len(text_extractor._sentence_tokens), 7)
 def test_splits_sentences_by_document(self):
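     """
     Test that a directory containing a single file yields a single
     (filename, sentence) token.
     """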
     text_extractor = DocumentTextExtractor('test_directory', 5, 10)
     text_extractor._extract_sentence_and_work_tokens(
         'tests_files/test_directory_single_file')
     self.assertEqual(text_extractor._sentence_tokens,
                      [('single_file.txt', 'one, two, three and four')])
 def test_document_get_word_tokens(self):
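     """
     Test that word tokens are extracted alongside sentence tokens; the
     single-file fixture yields seven word tokens.
     """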
     text_extractor = DocumentTextExtractor('test_directory', 5, 10)
     text_extractor._extract_sentence_and_work_tokens(
         'tests_files/test_directory_single_file')
     self.assertEqual(len(text_extractor._word_tokens), 7)