コード例 #1
0
    def test_feed(self):
        """Exercise formatting, batch iteration and inversion on a text corpus.

        Reads ``raw/corpus_multi.csv`` (tab-delimited) through ``Storage``,
        configures a ``review`` text pipeline and a ``label`` pipeline that
        references it, then checks the formatted output, the batches produced
        by ``iterate`` and the result of ``inverse``.
        """
        padding_length = 5
        column_names = ["label", "review", "comment"]

        root = os.path.join(os.path.dirname(__file__), "./")
        storage = Storage(root)
        df = storage.read("raw/corpus_multi.csv", delimiter="\t",
                          names=column_names)

        dp = DatasetPreprocessor()
        # The review pipeline is fitted on both text columns.
        (
            dp.process("review")
            .by(ct.text.UnicodeNormalizer())
            .by(ct.Tokenizer("en"))
            .by(ct.token.StopwordFilter("en"))
            .by(ct.Vocabulary(min_df=0, max_df=1.0))
            .by(ct.formatter.Padding(length=padding_length))
            .fit(df.loc[:, ["review", "comment"]])
        )
        dp.process("label").by(
            ct.formatter.CategoricalLabel(),
            reference=dp.process("review"),
        )

        # A formatted label must be as wide as the review vocabulary count.
        formatted = dp(df).preprocess().format().processed
        label_width = len(formatted["label"][0])
        vocabulary_count = dp.process("review").preprocessor.vocabulary.count
        self.assertEqual(label_width, vocabulary_count)

        # Iterate
        batches = dp(df).preprocess().iterate(batch_size=1, epoch=1)
        for batch in batches:
            self.assertEqual(len(batch), 3)
            self.assertEqual(len(batch["review"][0]), padding_length)

            restored = dp.inverse(batch)
            expected_label = np.argmax(batch["label"])
            self.assertEqual(restored["label"][0], expected_label)
            self.assertLessEqual(len(restored["review"][0]), padding_length)
コード例 #2
0
    def _make_dp(self):
        """Create a random DataFrame and a preprocessor configured for it.

        The frame has two columns, ``label`` and ``feature``, each holding
        100 uniform random values. Scalers are fitted on single-column 2-D
        views of those columns before being registered on the preprocessor.

        Returns:
            tuple: ``(df, dp)`` - the DataFrame and the ``DatasetPreprocessor``.
        """
        sample_count = 100
        df = pd.DataFrame.from_dict({
            "label": np.random.uniform(size=sample_count),
            "feature": np.random.uniform(size=sample_count),
        })

        def as_2d(column_name):
            # Reshape the 1-D column values into an (n, 1) array.
            return df[column_name].values.reshape(-1, 1)

        label_scaler = StandardScaler().fit(as_2d("label"))
        feature_scaler = StandardScaler().fit(as_2d("feature"))
        feature_scaler_2 = MinMaxScaler().fit(as_2d("feature"))

        dp = DatasetPreprocessor()
        dp.process("label").by(label_scaler)
        # "feature" is registered twice, each chain with its own output names.
        (
            dp.process("feature")
            .by(feature_scaler)
            .by(feature_scaler_2).as_name("feature_1")
        )
        (
            dp.process("feature")
            .by(feature_scaler).as_name("feature_2")
            .by(ScalingFormatter()).as_name("feature_3")
        )

        return df, dp
コード例 #3
0
    def test_save_load(self):
        """Round-trip a preprocessor through ``save``/``load`` and reuse it.

        Saves the preprocessor built by ``_make_dp`` to an archive next to
        this test file, loads it back, runs ``preprocess`` on the same data
        and checks the number and names of the resulting entries.
        """
        data, dp = self._make_dp()
        path = os.path.join(os.path.dirname(__file__), "test_preprocess.tar.gz")
        try:
            dp.save(path)

            _dp = DatasetPreprocessor.load(path)
            preprocessed = _dp.preprocess(data)
            self.assertEqual(len(preprocessed), 3)
            for c in preprocessed:
                # assertIn reports the offending name on failure.
                self.assertIn(c, ["label", "feature_1", "feature_2"])
        finally:
            # Clean up even when save/load or an assertion fails, so a failed
            # run does not leave a stale archive beside the tests. The
            # existence check avoids masking the original error if the file
            # was never written.
            if os.path.exists(path):
                os.remove(path)
コード例 #4
0
    def _make_dp(self):
        """Create random column arrays and a preprocessor configured for them.

        ``data`` maps ``label`` and ``feature`` to 100 uniform random values
        each, reshaped to a single column. Scalers are fitted on those arrays
        before being registered on the preprocessor.

        Returns:
            tuple: ``(data, dp)`` - the dict of arrays and the
            ``DatasetPreprocessor``.
        """
        # Keys are generated in the order "label", then "feature".
        data = {
            key: np.random.uniform(size=100).reshape((-1, 1))
            for key in ("label", "feature")
        }

        label_values = data["label"]
        feature_values = data["feature"]
        label_scaler = StandardScaler().fit(label_values)
        feature_scaler = StandardScaler().fit(feature_values)
        feature_scaler_2 = MinMaxScaler().fit(feature_values)

        dp = DatasetPreprocessor()
        dp.process("label").by(label_scaler)
        (
            dp.process("feature")
            .by(feature_scaler)
            .by(feature_scaler_2).as_name("feature_1")
        )
        (
            dp.process("feature")
            .by(feature_scaler).as_name("feature_2")
            .by(ScalingFormatter()).as_name("feature_3")
        )

        return data, dp
コード例 #5
0
ファイル: flow.py プロジェクト: naofumi1014/upura.hatenablog
    def __init__(self, pattern, replacement, copy=True):
        """Remember the pattern and its replacement for later substitution.

        Args:
            pattern: Pattern stored on the instance as ``self.pattern``.
            replacement: Replacement stored as ``self.replacement``.
            copy: Passed through unchanged to the parent initializer.
        """
        super().__init__(copy)
        self.pattern, self.replacement = pattern, replacement

    def apply(self, text):
        """Replace every substring of ``text`` matching the pattern.

        Args:
            text: String to run the substitution on.

        Returns:
            The result of ``re.sub`` with ``self.pattern`` and
            ``self.replacement`` applied to ``text``.
        """
        substituted = re.sub(self.pattern, self.replacement, text)
        return substituted


# Load the pickled frame and keep only its first ten rows.
df = pd.read_pickle('news.pkl')[:10]

pad_length = 300

dp = DatasetPreprocessor()

# Text pipeline for the 'news' column, fitted on that column.
(
    dp.process('news')
    .by(ct.text.UnicodeNormalizer())
    .by(ct.text.LowerNormalizer())
    .by(ct.text.SymbolFilter())
    .by(ct.Tokenizer('ja'))
    .by(ct.token.StopwordFilter('ja'))
    .by(ct.Vocabulary(min_df=2, max_df=0.8))
    .by(Padding(length=pad_length))
    .fit(df['news'])
)

# Label pipeline for the 'class' column, configured with nine classes.
dp.process('class').by(ct.formatter.CategoricalLabel(num_class=9))

preprocessed = dp.preprocess(df)
print(preprocessed)