def test_feed(self):
    """End-to-end check: build a text pipeline, preprocess, format, iterate and invert.

    Reads a tab-separated corpus, fits a review-tokenizing pipeline plus a
    categorical label formatter, then verifies that formatted labels are
    one-hot over the fitted vocabulary and that batches can be inverted
    back to readable form.
    """
    base_dir = os.path.join(os.path.dirname(__file__), "./")
    store = Storage(base_dir)
    frame = store.read(
        "raw/corpus_multi.csv",
        delimiter="\t",
        names=["label", "review", "comment"],
    )

    preprocessor = DatasetPreprocessor()
    (preprocessor.process("review")
        .by(ct.text.UnicodeNormalizer())
        .by(ct.Tokenizer("en"))
        .by(ct.token.StopwordFilter("en"))
        .by(ct.Vocabulary(min_df=0, max_df=1.0))
        .by(ct.formatter.Padding(length=5))
        .fit(frame.loc[:, ["review", "comment"]]))
    # The label formatter sizes its one-hot encoding from the review vocabulary.
    (preprocessor.process("label")
        .by(ct.formatter.CategoricalLabel(),
            reference=preprocessor.process("review")))

    formatted = preprocessor(frame).preprocess().format().processed
    vocab_size = preprocessor.process("review").preprocessor.vocabulary.count
    self.assertEqual(len(formatted["label"][0]), vocab_size)

    # Feed the data batch by batch and invert each batch back.
    for batch in preprocessor(frame).preprocess().iterate(batch_size=1, epoch=1):
        self.assertEqual(len(batch), 3)
        self.assertEqual(len(batch["review"][0]), 5)
        restored = preprocessor.inverse(batch)
        self.assertEqual(restored["label"][0], np.argmax(batch["label"]))
        # Padding may be stripped on inversion, so only an upper bound holds.
        self.assertLessEqual(len(restored["review"][0]), 5)
def _make_dp(self):
    """Return a random two-column DataFrame and a DatasetPreprocessor
    wired with pre-fitted scalers.

    The "feature" column is processed twice to produce three outputs:
    feature_1 (standard + min-max scaled), feature_2 (standard scaled)
    and feature_3 (feature_2 run through ScalingFormatter).
    """
    frame = pd.DataFrame.from_dict({
        "label": np.random.uniform(size=100),
        "feature": np.random.uniform(size=100),
    })

    # Scalers expect 2-D input, so reshape each column to (n, 1) before fitting.
    def as_column(name):
        return frame[name].values.reshape(-1, 1)

    scaler_label = StandardScaler().fit(as_column("label"))
    scaler_feature = StandardScaler().fit(as_column("feature"))
    scaler_feature_minmax = MinMaxScaler().fit(as_column("feature"))

    dp = DatasetPreprocessor()
    dp.process("label").by(scaler_label)
    (dp.process("feature")
        .by(scaler_feature)
        .by(scaler_feature_minmax).as_name("feature_1"))
    (dp.process("feature")
        .by(scaler_feature).as_name("feature_2")
        .by(ScalingFormatter()).as_name("feature_3"))
    return frame, dp
def test_save_load(self):
    """A saved preprocessor should round-trip through load and still preprocess.

    After reloading from the archive, preprocessing must yield exactly the
    three preprocessed columns (formatter-only outputs are excluded).
    """
    frame, dp = self._make_dp()
    archive = os.path.join(os.path.dirname(__file__), "test_preprocess.tar.gz")
    dp.save(archive)

    restored = DatasetPreprocessor.load(archive)
    result = restored.preprocess(frame)

    self.assertEqual(len(result), 3)
    expected_names = ["label", "feature_1", "feature_2"]
    for name in result:
        self.assertIn(name, expected_names)
    os.remove(archive)
def _make_dp(self):
    """Return random column-vector data (as a dict) and a DatasetPreprocessor
    wired with pre-fitted scalers.

    Like the DataFrame variant, "feature" feeds three outputs: feature_1
    (standard + min-max scaled), feature_2 (standard scaled) and feature_3
    (feature_2 through ScalingFormatter).
    """
    # Draw "label" first, then "feature", matching the original RNG call order.
    samples = {
        name: np.random.uniform(size=100).reshape((-1, 1))
        for name in ("label", "feature")
    }

    scaler_label = StandardScaler().fit(samples["label"])
    scaler_feat_std = StandardScaler().fit(samples["feature"])
    scaler_feat_minmax = MinMaxScaler().fit(samples["feature"])

    dp = DatasetPreprocessor()
    dp.process("label").by(scaler_label)
    (dp.process("feature")
        .by(scaler_feat_std)
        .by(scaler_feat_minmax).as_name("feature_1"))
    (dp.process("feature")
        .by(scaler_feat_std).as_name("feature_2")
        .by(ScalingFormatter()).as_name("feature_3"))
    return samples, dp
def __init__(self, pattern, replacement, copy=True):
    """Regex-based text replacer (preprocessor subclass — base class not shown here).

    Args:
        pattern: regular expression to match in the input text.
        replacement: string substituted for every match.
        copy: forwarded to the base class initializer.
    """
    super().__init__(copy)
    self.pattern = pattern
    self.replacement = replacement

def apply(self, text):
    # Replace every substring matching `pattern` with `replacement`.
    return re.sub(self.pattern, self.replacement, text)

# --- Script: build and run a preprocessing pipeline over pickled news data ---
df = pd.read_pickle('news.pkl')
df = df[:10]  # keep a small sample for a quick run
pad_length = 300

dp = DatasetPreprocessor()
# Normalize, tokenize (Japanese), filter stopwords, build vocabulary and pad.
dp.process('news')\
    .by(ct.text.UnicodeNormalizer())\
    .by(ct.text.LowerNormalizer())\
    .by(ct.text.SymbolFilter())\
    .by(ct.Tokenizer('ja'))\
    .by(ct.token.StopwordFilter('ja'))\
    .by(ct.Vocabulary(min_df=2, max_df=0.8))\
    .by(Padding(length=pad_length))\
    .fit(df['news'])
# Encode the class column as a categorical label over 9 classes.
dp.process('class')\
    .by(ct.formatter.CategoricalLabel(num_class=9))
preprocessed = dp.preprocess(df)
print(preprocessed)