def test_category_to_corpus_separated(self):
     """
     Exercise CategoryToCorpus in "separated" mode.

     The fixture documents are run through a one-module pipeline, after
     which ``post_process()`` is expected to return a mapping with two
     category keys: "stopwords" holding two entries and "parsing" holding
     one.
     """
     separated = CategoryToCorpus(None, None, "categories", None, "separated")
     processed = GeneratorPipeline([separated]).process(self.docs)
     # TODO: add a sink module or something similar to thread all docs/words
     # through a pipeline
     # Until then, drain the output by hand; the yielded items are not used.
     for _ in processed:
         pass
     # Collect what the module accumulated while the documents passed through
     result = separated.post_process()
     self.assertEqual(len(result), 2)
     self.assertEqual(len(result["stopwords"]), 2)
     self.assertEqual(len(result["parsing"]), 1)
    def test_category_to_corpus_combined(self):
        """
        Exercise CategoryToCorpus with its default arguments (combined mode).

        After the fixture documents are run through a one-module pipeline,
        ``post_process()`` is expected to return a corpus with two
        categories, "stopwords" and "parsing", whose ``words()`` yield the
        exact token sequences listed below.
        """
        combined = CategoryToCorpus()
        processed = GeneratorPipeline([combined]).process(self.docs)
        # TODO: add a sink module or something similar to thread all docs/words
        # through a pipeline
        # Until then, drain the output by hand; the yielded items are not used.
        for _ in processed:
            pass
        # Collect what the module accumulated while the documents passed through
        result = combined.post_process()
        self.assertEqual(len(result.categories()), 2)
        stopwords_entry = result["stopwords"]
        parsing_entry = result["parsing"]

        # Each case pairs a category's entry with the tokens it should yield
        cases = (
            (
                stopwords_entry,
                [
                    "a", "stopword", "test", ".",
                    "a", "stopword", "test", "and",
                    "a", "unique", "test", ".",
                ],
            ),
            (
                parsing_entry,
                ["a", "parsing", "test", "."],
            ),
        )
        for entry, expected in cases:
            tokens = list(entry.words())
            # Check the length first so the indexed comparisons stay in range
            self.assertEqual(len(tokens), len(expected))
            for position, wanted in enumerate(expected):
                self.assertEqual(tokens[position], wanted)