Ejemplo n.º 1
0
 def test_filter_feat_grp(self):
     """Filtering by a list of feature groups keeps only matching columns."""
     raw = pd.read_csv("../examples/classification/train_classification.csv")
     generic = GenericDataFrame(raw,
                                feat_grp=["id"] + ["num"] * 2 + ["cat"] * 9)
     filtered = generic.filter_feat_grp(["num", "id"])
     # Result must still be a GenericDataFrame, with both grouping vectors
     # reduced to the selected columns (id first, then the two num columns).
     self.assertTrue(isinstance(filtered, GenericDataFrame))
     expected = pd.Series(["id", "num", "num"])
     self.assertTrue(np.all(filtered.origin_grp == expected))
     self.assertTrue(np.all(filtered.feat_grp == expected))
Ejemplo n.º 2
0
    def test_procedure(self):
        """End-to-end manual chain of fill -> one-hot -> SGD on three columns.

        Prints (does not assert) the validation, train and test accuracies.
        """
        df = pd.read_csv("../examples/classification/train_classification.csv")
        y = df.pop("Survived").values
        df = df.loc[:, ["Sex", "Cabin", "Age"]]
        # Two categorical columns with NaNs and one numeric column with NaNs.
        feat_grp = ["cat_nan", "cat_nan", "num_nan"]
        df_train, df_test, y_train, y_test = train_test_split(df,
                                                              y,
                                                              test_size=0.2,
                                                              random_state=10)
        df_train = GenericDataFrame(df_train, feat_grp=feat_grp)
        df_test = GenericDataFrame(df_test, feat_grp=feat_grp)
        cv = KFold(n_splits=5, random_state=10, shuffle=True)
        # Only the first of the five folds is used.
        train_ix, valid_ix = next(cv.split(df_train))

        df_train, df_valid = df_train.split([train_ix, valid_ix])
        # NOTE: y_valid must be sliced before y_train is overwritten below.
        y_valid = y_train[valid_ix]
        y_train = y_train[train_ix]

        # Impute categorical NaNs with a sentinel ("cat_nan" -> "cat").
        fill_cat = FillCat()
        fill_cat.in_feat_grp = "cat_nan"
        fill_cat.out_feat_grp = "cat"
        fill_cat.update_hyperparams({"strategy": "<NULL>"})

        # Impute numeric NaNs with the median ("num_nan" -> "num").
        fill_num = FillNum()
        fill_num.in_feat_grp = "num_nan"
        fill_num.out_feat_grp = "num"
        fill_num.update_hyperparams({"strategy": "median"})

        # One-hot encode the imputed categoricals ("cat" -> "num").
        ohe = OneHotEncoder()
        ohe.in_feat_grp = "cat"
        ohe.out_feat_grp = "num"

        sgd = SGD()
        sgd.in_feat_grp = "num"
        sgd.update_hyperparams({"loss": "log", "random_state": 10})

        # Each fit_transform returns a dict (X_train/X_valid/X_test, ...) that
        # is fed straight into the next step via **kwargs.
        ret1 = fill_cat.fit_transform(df_train, y_train, df_valid, y_valid,
                                      df_test)
        ret2 = fill_num.fit_transform(**ret1)
        ret3 = ohe.fit_transform(**ret2)
        sgd.fit(**ret3, y_train=y_train)

        y_pred = sgd.predict(ret3["X_valid"])
        acc_valid = accuracy_score(y_valid, y_pred)
        print(acc_valid)

        y_pred = sgd.predict(ret3["X_train"])
        acc_train = accuracy_score(y_train, y_pred)
        print(acc_train)

        y_pred = sgd.predict(ret3["X_test"])
        acc_test = accuracy_score(y_test, y_pred)
        print(acc_test)
Ejemplo n.º 3
0
    def test_replace_feat_grp(self):
        """replace_feat_grp must handle 1->2, 1->1 and 1->0 column mappings."""
        df = pd.read_csv("../examples/classification/train_classification.csv")
        suffix = ["num"] * 2 + ["cat"] * 2 + ["num"] * 5 + ["cat"] * 2
        feat_grp = ["id"] + suffix

        df2 = GenericDataFrame(df, feat_grp=feat_grp)

        # test 1->2: the single "id" column becomes two "id2" columns.
        id_values = df2.filter_feat_grp("id").values
        doubled = np.hstack([id_values, id_values])
        df3 = df2.replace_feat_grp("id", doubled, "id2")
        self.assertTrue(
            np.all(df3.feat_grp == pd.Series(suffix + ["id2"] * 2)))
        self.assertTrue(
            np.all(df3.origin_grp == pd.Series(suffix + ["id"] * 2)))

        # test 1->1: same width, only the feature-group label changes.
        single = np.hstack([df2.filter_feat_grp("id").values])
        df3 = df2.replace_feat_grp("id", single, "id2")
        self.assertTrue(np.all(df3.feat_grp == pd.Series(suffix + ["id2"])))
        self.assertTrue(np.all(df3.origin_grp == pd.Series(suffix + ["id"])))

        # test 1->0: a zero-width replacement drops the group entirely.
        n_rows = df2.filter_feat_grp("id").values.shape[0]
        empty = np.zeros([n_rows, 0])
        df3 = df2.replace_feat_grp("id", empty, "id2")
        self.assertTrue(np.all(df3.feat_grp == pd.Series(suffix)))
        self.assertTrue(np.all(df3.origin_grp == pd.Series(suffix)))
Ejemplo n.º 4
0
 def _transform(self, X_: np.ndarray, X: GenericDataFrame):
     """Run the concrete transform on X_ and splice the result back into X.

     Returns None when X_ is None (e.g. no valid/test split was supplied).
     """
     if X_ is None:
         return None
     # Pre-process, apply the subclass transform, then densify the output.
     # todo: switch densify to a conditional check instead of always applying?
     transformed = densify(self._transform_proc(self.before_trans_X(X_)))
     return X.replace_feat_grp(self.in_feat_grp, transformed,
                               self.out_feat_grp)
Ejemplo n.º 5
0
 def _transform(self, X: GenericDataFrame, y):
     """Transform (X, y) jointly and rebuild X as a GenericDataFrame.

     Column names and both group vectors are captured up front so the
     rebuilt frame keeps the original metadata.
     """
     cols = X.columns
     grp = X.feat_grp
     origin = X.origin_grp
     X_new, y_new = self._transform_proc(X, y)
     rebuilt = GenericDataFrame(pd.DataFrame(X_new, columns=cols),
                                feat_grp=grp,
                                origin_grp=origin)
     return rebuilt, y_new
Ejemplo n.º 6
0
 def process(
         self, X_origin: Optional[GenericDataFrame]
 ) -> Optional[GenericDataFrame]:
     """Partition the in_feat_grp columns into two new feature groups.

     Columns whose names were recorded under ``self.info[self.key1]`` at fit
     time go to the first (highR) group; all others go to the second (lowR)
     group.  The two partitions are rebuilt as GenericDataFrames, concatenated,
     and spliced back into ``X_origin`` in place of the original group.
     Returns None when ``X_origin`` is None.
     """
     if X_origin is None:
         return None
     X = X_origin.filter_feat_grp(self.in_feat_grp)
     # Output group names are hyperparameters with per-subclass defaults.
     highR = self.hyperparams.get(self.key1_hp_name, self.key1_default_name)
     lowR = self.hyperparams.get(self.key2_hp_name, self.key2_default_name)
     collection = {
         highR: defaultdict(list),
         lowR: defaultdict(list),
     }
     # Route every column (plus its name and group metadata) into one of the
     # two buckets, based on the column names memorised during fit.
     for i, (col_name, feat_grp, origin_grp) in enumerate(
             zip(X.columns, X.feat_grp, X.origin_grp)):
         col = X.iloc[:, i]
         if col_name in self.info[self.key1]["col_name"]:
             keyname = highR
         else:
             keyname = lowR
         collection[keyname]["X"].append(col)
         collection[keyname]["col_name"].append(col_name)
         collection[keyname]["feat_grp"].append(feat_grp)
         collection[keyname]["origin_grp"].append(origin_grp)
     dfs = []
     for feat_grp_name, dict_ in collection.items():
         # Columns were collected one-per-row; transpose back to (rows, cols).
         X = np.array(dict_["X"]).T
         origin_grp = dict_["origin_grp"]
         feat_grp = [feat_grp_name] * len(origin_grp)
         if X.shape == (0, ):
             # Empty bucket: keep a zero-width frame with the right row count.
             X = np.zeros([X_origin.shape[0], 0])
         df = GenericDataFrame(pd.DataFrame(X, columns=dict_["col_name"]),
                               feat_grp=feat_grp,
                               origin_grp=origin_grp)
         dfs.append(df)
     assert len(dfs) == 2
     # NOTE(review): dfs[0] is both the receiver and the first argument of
     # concat_two — confirm this matches the method's expected signature.
     df = dfs[0].concat_two(dfs[0], dfs[1])
     # NOTE(review): this 4-argument replace_feat_grp call differs from the
     # 3-argument form used elsewhere — presumably an overload that accepts
     # explicit feat_grp/origin_grp vectors; verify against the class.
     return X_origin.replace_feat_grp(self.in_feat_grp, df, df.feat_grp,
                                      df.origin_grp)
Ejemplo n.º 7
0
 def fit(self,
         X_train: GenericDataFrame,
         y_train=None,
         X_valid=None,
         y_valid=None,
         X_test=None,
         y_test=None):
     """Record, for each column of the selected group, which key it belongs to.

     Populates ``self.info`` with per-key lists of column names and group
     labels; only X_train is inspected.  Returns self.
     """
     self.threshold = self.hyperparams.get("threshold",
                                           self.default_threshold)
     info = {
         self.key1: defaultdict(list),
         self.key2: defaultdict(list),
     }
     X: GenericDataFrame = X_train.filter_feat_grp(self.in_feat_grp)
     n_rows = X.shape[0]
     # Classify every column into key1/key2 and remember its metadata.
     for idx, (name, grp, origin) in enumerate(
             zip(X.columns, X.feat_grp, X.origin_grp)):
         bucket = info[self.judge_keyname(X.iloc[:, idx], n_rows)]
         bucket["col_name"].append(name)
         bucket["feat_grp"].append(grp)
         bucket["origin_grp"].append(origin)
     self.info = info
     return self
Ejemplo n.º 8
0
from autopipeline.pipeline.components.preprocessing.encode.one_hot import OneHotEncoder
from autopipeline.pipeline.components.preprocessing.impute.fill_cat import FillCat
from autopipeline.pipeline.components.preprocessing.impute.fill_num import FillNum
from autopipeline.pipeline.dataframe import GenericDataFrame
from autopipeline.pipeline.pipeline import GenericPipeline

df = pd.read_csv("../examples/classification/train_classification.csv")
y = df.pop("Survived").values
df = df.loc[:, ["Sex", "Cabin", "Age"]]
# Two categorical columns with NaNs and one numeric column with NaNs.
feat_grp = ["cat_nan", "cat_nan", "num_nan"]
df_train, df_test, y_train, y_test = train_test_split(df,
                                                      y,
                                                      test_size=0.2,
                                                      random_state=10)

df_train = GenericDataFrame(df_train, feat_grp=feat_grp)
df_test = GenericDataFrame(df_test, feat_grp=feat_grp)
cv = KFold(n_splits=5, random_state=10, shuffle=True)
# Only the first of the five folds is used.
train_ix, valid_ix = next(cv.split(df_train))

df_train, df_valid = df_train.split([train_ix, valid_ix])
# NOTE: y_valid must be sliced before y_train is overwritten on the next line.
y_valid = y_train[valid_ix]
y_train = y_train[train_ix]

# Impute categorical NaNs with a sentinel value ("cat_nan" -> "cat").
fill_cat = FillCat()
fill_cat.in_feat_grp = "cat_nan"
fill_cat.out_feat_grp = "cat"
fill_cat.update_hyperparams({"strategy": "<NULL>"})

# Impute numeric NaNs; the snippet is truncated after this point.
fill_num = FillNum()
fill_num.in_feat_grp = "num_nan"
Ejemplo n.º 9
0
        # An "id" column followed by a fixed pattern of num/cat groups.
        suffix = ["num"] * 2 + ["cat"] * 2 + ["num"] * 5 + ["cat"] * 2
        feat_grp = ["id"] + suffix

        df2 = GenericDataFrame(df, feat_grp=feat_grp)
        # test 1->2: one "id" column replaced by two "id2" columns.
        selected = df2.filter_feat_grp("id").values
        selected = np.hstack([selected, selected])
        df3 = df2.replace_feat_grp("id", selected, "id2")
        self.assertTrue(np.all(df3.feat_grp == pd.Series(suffix +
                                                         ["id2"] * 2)))
        self.assertTrue(
            np.all(df3.origin_grp == pd.Series(suffix + ["id"] * 2)))
        # test 1->1: same width, only the feature-group label changes.
        selected = df2.filter_feat_grp("id").values
        selected = np.hstack([selected])
        df3 = df2.replace_feat_grp("id", selected, "id2")
        self.assertTrue(np.all(df3.feat_grp == pd.Series(suffix + ["id2"])))
        self.assertTrue(np.all(df3.origin_grp == pd.Series(suffix + ["id"])))
        # test 1->0: a zero-width replacement drops the group entirely.
        selected = df2.filter_feat_grp("id").values
        selected = np.zeros([selected.shape[0], 0])
        df3 = df2.replace_feat_grp("id", selected, "id2")
        self.assertTrue(np.all(df3.feat_grp == pd.Series(suffix)))
        self.assertTrue(np.all(df3.origin_grp == pd.Series(suffix)))


if __name__ == '__main__':
    # Smoke run: build a GenericDataFrame over the training csv and filter
    # it down to the "num" and "id" feature groups.
    df = pd.read_csv("../examples/classification/train_classification.csv")
    df2 = GenericDataFrame(df, feat_grp=["id"] + ["num"] * 2 + ["cat"] * 9)
    df3 = df2.filter_feat_grp(["num", "id"])
Ejemplo n.º 10
0
import pandas as pd


from autopipeline.pipeline.components.preprocessing.operate.drop import DropAll
from autopipeline.pipeline.components.preprocessing.operate.split.cat import SplitCat
from autopipeline.pipeline.dataframe import GenericDataFrame
from autopipeline.pipeline.pipeline import GenericPipeline

df = pd.read_csv("../examples/classification/train_classification.csv")
y = df.pop("Survived").values
df = df.loc[:, ["Sex", "Ticket", "Pclass"]]

df2 = GenericDataFrame(df, feat_grp=["cat", "cat", "num"])

# Split the "cat" group into two subgroups at a 0.5 threshold
# (presumably high- vs low-cardinality — confirm against SplitCat).
split_cat = SplitCat()
split_cat.in_feat_grp = "cat"
split_cat.update_hyperparams({
    "highR": "highR_cat",
    "lowR": "lowR_cat",
    "threshold": 0.5
})
ret = split_cat.fit_transform(df2)
print(ret)

# Second scenario: mark every column as dropped, then configure the
# categorical splitter again (the snippet is truncated here).
df2 = GenericDataFrame(df, feat_grp=["cat", "cat", "num"])
drop_all = DropAll()
drop_all.in_feat_grp = ["cat", "num"]
drop_all.out_feat_grp = "drop"  # fixed: PEP8 E225, spaces around "="

split_cat = SplitCat()
split_cat.in_feat_grp = "cat"
Ejemplo n.º 11
0
    def test_pipeline(self):
        """Exercise GenericPipeline end to end.

        Covers: fit/predict/predict_proba on a full pipeline, procedure(),
        transform vs fit_transform consistency on a preprocessing-only
        pipeline, and a classifier-only pipeline on pre-transformed frames.
        """
        df = pd.read_csv("../examples/classification/train_classification.csv")
        y = df.pop("Survived").values
        df = df.loc[:, ["Sex", "Cabin", "Age"]]
        # Two categorical columns with NaNs and one numeric column with NaNs.
        feat_grp = ["cat_nan", "cat_nan", "num_nan"]
        df_train, df_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=10)
        df_train = GenericDataFrame(df_train, feat_grp=feat_grp)
        df_test = GenericDataFrame(df_test, feat_grp=feat_grp)
        cv = KFold(n_splits=5, random_state=10, shuffle=True)
        # Only the first of the five folds is used.
        train_ix, valid_ix = next(cv.split(df_train))

        df_train, df_valid = df_train.split([train_ix, valid_ix])
        # NOTE: y_valid must be sliced before y_train is overwritten below.
        y_valid = y_train[valid_ix]
        y_train = y_train[train_ix]

        # "cat_nan" -> "cat": impute categorical NaNs with a sentinel.
        fill_cat = FillCat()
        fill_cat.in_feat_grp = "cat_nan"
        fill_cat.out_feat_grp = "cat"
        fill_cat.update_hyperparams({"strategy": "<NULL>"})

        # "num_nan" -> "num": impute numeric NaNs with the median.
        fill_num = FillNum()
        fill_num.in_feat_grp = "num_nan"
        fill_num.out_feat_grp = "num"
        fill_num.update_hyperparams({"strategy": "median"})

        # "cat" -> "num": one-hot encode the imputed categoricals.
        ohe = OneHotEncoder()
        ohe.in_feat_grp = "cat"
        ohe.out_feat_grp = "num"

        sgd = SGD()
        sgd.in_feat_grp = "num"
        sgd.update_hyperparams({"loss": "log", "random_state": 10})

        # Full pipeline: imputation + encoding + classifier.
        pipeline = GenericPipeline([
            ("fill_cat", fill_cat),
            ("fill_num", fill_num),
            ("ohe", ohe),
            ("sgd", sgd),
        ])

        pipeline.fit(df_train, y_train, df_valid, y_valid, df_test, y_test)
        pred_train = pipeline.predict(df_train)
        pred_test = pipeline.predict(df_test)
        pred_valid = pipeline.predict(df_valid)
        score_valid = pipeline.predict_proba(df_valid)
        print(accuracy_score(y_train, pred_train))
        print(accuracy_score(y_valid, pred_valid))
        print(accuracy_score(y_test, pred_test))
        # procedure() returns per-class probabilities for valid/test;
        # column 1 thresholded at 0.5 gives the positive-class prediction.
        ret = pipeline.procedure(constants.binary_classification_task, df_train, y_train, df_valid, y_valid, df_test,
                                 y_test)
        pred_test = ret["pred_test"]
        pred_valid = ret["pred_valid"]
        print(accuracy_score(y_valid, (pred_valid > .5).astype("int")[:, 1]))
        print(accuracy_score(y_test, (pred_test > .5).astype("int")[:, 1]))

        # Preprocessing-only pipeline: transform after fit must match
        # fit_transform on the same data.
        pipeline = GenericPipeline([
            ("fill_cat", fill_cat),
            ("fill_num", fill_num),
            ("ohe", ohe),
        ])

        pipeline.fit(df_train, y_train, df_valid, y_valid, df_test, y_test)
        ret1 = pipeline.transform(df_train, df_valid, df_test)
        ret2 = pipeline.fit_transform(df_train, y_train, df_valid, y_valid, df_test, y_test)
        for key in ["X_train", "X_valid", "X_test"]:
            assert np.all(ret1[key] == ret2[key])

        # Classifier-only pipeline fed with the pre-transformed frames.
        pipeline = GenericPipeline([
            ("sgd", sgd),
        ])

        ret = pipeline.procedure(constants.binary_classification_task, ret1["X_train"], y_train, ret1["X_valid"],
                                 y_valid,
                                 ret1["X_test"], y_test)
        pred_test = ret["pred_test"]
        pred_valid = ret["pred_valid"]
        print(accuracy_score(y_valid, (pred_valid > .5).astype("int")[:, 1]))
        print(accuracy_score(y_test, (pred_test > .5).astype("int")[:, 1]))