def test_shouldReadFromFile(self):
    """Check that FileReader.read returns the expected text for self.path.

    Both the expected text and the text read from ``self.path`` are wrapped
    in a ``Document`` and the two documents are compared for equality.
    """
    expected_file_content = """This is content of the file.\nWith line separated. blabla"""
    expected_document = Document(text=expected_file_content)

    actual_document = Document(text=FileReader.read(self.path))

    # assertEqual is the supported spelling; the assertEquals alias is
    # deprecated and no longer exists in recent unittest releases.
    self.assertEqual(actual_document, expected_document)
def multi_document(self):
    """Build a MultiDocument from the entries returned by self._list_files().

    Entries are handled in sorted order. Each entry is joined onto the
    absolute form of ``self.path``, read with ``FileReader.read`` and wrapped
    in a ``Document`` whose ``id`` is the entry itself.

    Returns:
        A ``MultiDocument`` holding one ``Document`` per listed entry.
    """
    base_dir = os.path.abspath(self.path)
    names = sorted(self._list_files())
    return MultiDocument(
        documents=[
            Document(id=name, text=FileReader.read(os.path.join(base_dir, name)))
            for name in names
        ]
    )
def test_shouldCheckForConsistencyOfLSIModel(self):
    """Print the LSI transformation of one sample document five times.

    The sample file is split into sentences, each sentence is tokenised with
    ``stopped_tokenize`` and keyed by its position, and a fresh
    ``LSITransformation`` is built from that mapping on every iteration.

    NOTE(review): the method contains no assertions; it only prints, so the
    runs are presumably meant to be compared by eye.
    """
    import os

    filepath = os.path.join(os.path.dirname(__file__), "test_data/reuters_rupee_decline/doc1")
    text = FileReader.read(filepath)
    processor = TextProcessor()
    sentences = processor.nltk_sentences(text)
    tokenised_sentence_map = {
        index: processor.stopped_tokenize(sentence)
        for index, sentence in enumerate(sentences)
    }
    for i in range(5):
        # A single pre-formatted string keeps the output identical on
        # Python 2 and Python 3 (the doubled spaces match what the original
        # comma-separated print statement emitted).
        print("\n\n************* ITERATION  %d  *************" % i)
        lsi_transformation = LSITransformation(tokenised_sentence_map)
        lsi_transformation.print_transformation()