Esempio n. 1
0
    def test_shouldReadFromFile(self):
        """A Document built from the text read at ``self.path`` equals the expected one."""
        expected_file_content = """This is content of the file.\nWith line separated. blabla"""
        expected_document = Document(text=expected_file_content)

        actual_document = Document(text=FileReader.read(self.path))

        # assertEquals is a deprecated alias (removed in Python 3.12);
        # assertEqual is the supported spelling and behaves identically.
        self.assertEqual(actual_document, expected_document)
Esempio n. 2
0
    def multi_document(self):
        """Build a MultiDocument from the files reported by ``self._list_files()``.

        File names are visited in sorted order. Each name is joined onto the
        absolute form of ``self.path``, read through ``FileReader.read``, and
        wrapped in a ``Document`` whose ``id`` is the bare file name.
        """
        base_dir = os.path.abspath(self.path)
        collected = [
            Document(id=name, text=FileReader.read(os.path.join(base_dir, name)))
            for name in sorted(self._list_files())
        ]
        return MultiDocument(documents=collected)
Esempio n. 3
0
    def test_shouldCheckForConsistencyOfLSIModel(self):
        """Build the LSI transformation five times on the same input and print each.

        This test makes no assertions: it prints the transformation on every
        iteration so the outputs can be compared by eye.
        """
        import os
        filepath = os.path.join(os.path.dirname(__file__), "test_data/reuters_rupee_decline/doc1")
        text = FileReader.read(filepath)
        processor = TextProcessor()
        sentences = processor.nltk_sentences(text)
        # Map each sentence index to that sentence's stop-word-filtered tokens.
        tokenised_sentence_map = {
            index: processor.stopped_tokenize(sentence)
            for index, sentence in enumerate(sentences)
        }

        for i in range(5):
            # Single-argument print is valid on both Python 2 and Python 3 and
            # emits the same text as the former `print a, i, b` statement
            # (which inserted one extra space between each item).
            print("\n\n************* ITERATION  %d  *************" % i)
            lsi_transformation = LSITransformation(tokenised_sentence_map)
            lsi_transformation.print_transformation()