def test_feed(self):
    """Check formatting and batch iteration of a DatasetPreprocessor.

    A "review" pipeline (normalize, tokenize, stop-word filter,
    vocabulary, pad to length 5) is fitted on the "review" and "comment"
    columns, and a "label" pipeline is configured with the review
    pipeline as its reference. The test then verifies the width of the
    formatted label and the shape of every batch and its inverse.
    """
    root = os.path.join(os.path.dirname(__file__), "./")
    frame = Storage(root).read(
        "raw/corpus_multi.csv",
        delimiter="\t",
        names=["label", "review", "comment"],
    )

    dp = DatasetPreprocessor()
    (
        dp.process("review")
        .by(ct.text.UnicodeNormalizer())
        .by(ct.Tokenizer("en"))
        .by(ct.token.StopwordFilter("en"))
        .by(ct.Vocabulary(min_df=0, max_df=1.0))
        .by(ct.formatter.Padding(length=5))
        .fit(frame.loc[:, ["review", "comment"]])
    )
    dp.process("label").by(
        ct.formatter.CategoricalLabel(),
        reference=dp.process("review"),
    )

    # The formatted label must be as wide as the review vocabulary.
    formatted = dp(frame).preprocess().format().processed
    label_width = len(formatted["label"][0])
    vocabulary_count = dp.process("review").preprocessor.vocabulary.count
    self.assertEqual(label_width, vocabulary_count)

    # Walk the data one row at a time for a single epoch.
    batches = dp(frame).preprocess().iterate(batch_size=1, epoch=1)
    for minibatch in batches:
        self.assertEqual(len(minibatch), 3)
        # Padding(length=5) fixes the length of every review.
        self.assertEqual(len(minibatch["review"][0]), 5)

        restored = dp.inverse(minibatch)
        self.assertEqual(
            restored["label"][0],
            np.argmax(minibatch["label"]),
        )
        self.assertLessEqual(len(restored["review"][0]), 5)
def test_dataframe(self):
    """Round-trip a DataFrame through a fitted, re-loaded Preprocessor.

    The preprocessor is fitted on the "summary" and "text" columns,
    dumped to disk with joblib and loaded back. The loaded instance must
    then satisfy: joining the tokens produced by
    ``inverse_transform(transform(df))`` reproduces every original cell.
    """
    path = os.path.join(os.path.dirname(__file__), "./data")
    storage = Storage(path)
    df = storage.read("raw/corpus.csv", delimiter="\t",
                      names=["summary", "text"])

    preprocessor = Preprocessor(
        tokenizer=ct.Tokenizer("ja"),
        text_transformers=[ct.text.UnicodeNormalizer()],
        vocabulary=ct.Vocabulary(vocab_size=50))
    preprocessor.fit(df[["summary", "text"]])

    # Serialize and reload so the assertions below exercise the restored
    # object. The pickle is removed in ``finally`` so a failing dump/load
    # never leaves a stray file behind in the working directory.
    pickle_path = "test_preprocessor.pkl"
    try:
        joblib.dump(preprocessor, pickle_path)
        preprocessor = joblib.load(pickle_path)
    finally:
        # Guarded so a failed dump is not masked by FileNotFoundError.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)

    transformed = preprocessor.transform(df)
    inversed = preprocessor.inverse_transform(transformed)

    for c in df.columns:
        for o, i in zip(df[c], inversed[c]):
            # Each inversed cell is a token sequence; concatenated
            # without a separator it must equal the original string.
            self.assertEqual(o, "".join(i))

    print(inversed)
def test_series(self):
    """Fit a Preprocessor on one Series and transform another.

    The preprocessor is fitted on the "review" column and applied to the
    "comment" column; the transformed result must contain 3 entries.
    """
    root = os.path.join(os.path.dirname(__file__), "./")
    frame = Storage(root).read(
        "raw/corpus_multi.csv",
        delimiter="\t",
        names=["label", "review", "comment"],
    )

    # Build the collaborators in the order they are handed over.
    tokenizer = ct.Tokenizer("en")
    text_steps = [ct.text.UnicodeNormalizer()]
    token_steps = [ct.token.StopwordFilter("en")]
    vocabulary = ct.Vocabulary(min_df=0, max_df=1.0)
    preprocessor = Preprocessor(
        tokenizer=tokenizer,
        text_transformers=text_steps,
        token_transformers=token_steps,
        vocabulary=vocabulary,
    )

    preprocessor.fit(frame["review"])
    result = preprocessor.transform(frame["comment"])

    self.assertEqual(len(result), 3)