def test_feed(self):
        """Fit a DatasetPreprocessor and check formatting, batching and inversion.

        Reads ``raw/corpus_multi.csv`` (tab-delimited; columns ``label``,
        ``review`` and ``comment``) from the directory of this test file,
        registers a text pipeline for ``review`` and a categorical-label
        pipeline for ``label``, then asserts on the formatted output, on each
        batch yielded by ``iterate`` and on the result of ``inverse``.
        """
        data_root = os.path.join(os.path.dirname(__file__), "./")
        frame = Storage(data_root).read(
            "raw/corpus_multi.csv",
            delimiter="\t",
            names=["label", "review", "comment"],
        )

        pipeline = DatasetPreprocessor()

        # Text pipeline for "review"; it is fitted on both text columns.
        (
            pipeline.process("review")
            .by(ct.text.UnicodeNormalizer())
            .by(ct.Tokenizer("en"))
            .by(ct.token.StopwordFilter("en"))
            .by(ct.Vocabulary(min_df=0, max_df=1.0))
            .by(ct.formatter.Padding(length=5))
            .fit(frame.loc[:, ["review", "comment"]])
        )

        # Label pipeline, given the "review" process as its reference.
        pipeline.process("label").by(
            ct.formatter.CategoricalLabel(),
            reference=pipeline.process("review"),
        )

        # The first formatted label must be as long as the review
        # vocabulary's count.
        formatted = pipeline(frame).preprocess().format().processed
        label_width = len(formatted["label"][0])
        vocab_count = pipeline.process("review").preprocessor.vocabulary.count
        self.assertEqual(label_width, vocab_count)

        # Iterate
        batches = pipeline(frame).preprocess().iterate(batch_size=1, epoch=1)
        for minibatch in batches:
            # Three entries per batch; reviews are padded to length 5.
            self.assertEqual(len(minibatch), 3)
            self.assertEqual(len(minibatch["review"][0]), 5)

            restored = pipeline.inverse(minibatch)
            restored_label = restored["label"][0]
            expected_label = np.argmax(minibatch["label"])
            self.assertEqual(restored_label, expected_label)
            # The inverted review may be shorter than the padded length.
            self.assertLessEqual(len(restored["review"][0]), 5)
Example #2
0
    def test_dataframe(self):
        """Round-trip a fitted Preprocessor through joblib and invert its output.

        Reads ``raw/corpus.csv`` (tab-delimited; columns ``summary`` and
        ``text``) from the ``data`` directory next to this test file, fits a
        ``Preprocessor`` on both columns, dumps it with ``joblib`` and loads
        it back. The reloaded instance transforms the frame and inverts the
        result; every original cell must equal the ``"".join`` of the
        corresponding inverted entry.

        The temporary pickle is removed in a ``finally`` block so that a
        failing assertion or an exception cannot leave
        ``test_preprocessor.pkl`` behind in the working directory.
        """
        path = os.path.join(os.path.dirname(__file__), "./data")
        storage = Storage(path)
        df = storage.read("raw/corpus.csv",
                          delimiter="\t",
                          names=["summary", "text"])

        preprocessor = Preprocessor(
            tokenizer=ct.Tokenizer("ja"),
            text_transformers=[ct.text.UnicodeNormalizer()],
            vocabulary=ct.Vocabulary(vocab_size=50))

        preprocessor.fit(df[["summary", "text"]])

        pickle_path = "test_preprocessor.pkl"
        try:
            joblib.dump(preprocessor, pickle_path)
            # Rebind to the reloaded object so everything below exercises
            # the deserialized preprocessor, not the in-memory one.
            preprocessor = joblib.load(pickle_path)
        finally:
            # The file is not needed once loaded; clean up even if the
            # dump or load raised part-way through.
            if os.path.exists(pickle_path):
                os.remove(pickle_path)

        transformed = preprocessor.transform(df)
        inversed = preprocessor.inverse_transform(transformed)

        for c in df.columns:
            for o, i in zip(df[c], inversed[c]):
                self.assertEqual(o, "".join(i))

        print(inversed)
Example #3
0
    def test_series(self):
        """Fit a Preprocessor on one column and transform another.

        Reads ``raw/corpus_multi.csv`` (tab-delimited; columns ``label``,
        ``review`` and ``comment``) from the directory of this test file,
        fits on the ``review`` column, transforms the ``comment`` column and
        asserts that the transformed result has length 3.
        """
        here = os.path.dirname(__file__)
        frame = Storage(os.path.join(here, "./")).read(
            "raw/corpus_multi.csv",
            delimiter="\t",
            names=["label", "review", "comment"],
        )

        # Build the components first, in the order the constructor lists them.
        tokenizer = ct.Tokenizer("en")
        text_steps = [ct.text.UnicodeNormalizer()]
        token_steps = [ct.token.StopwordFilter("en")]
        vocabulary = ct.Vocabulary(min_df=0, max_df=1.0)
        text_pp = Preprocessor(
            tokenizer=tokenizer,
            text_transformers=text_steps,
            token_transformers=token_steps,
            vocabulary=vocabulary,
        )

        # Fit and transform deliberately use different columns.
        text_pp.fit(frame["review"])
        encoded_comments = text_pp.transform(frame["comment"])
        self.assertEqual(len(encoded_comments), 3)