Code example #1
    def __init__(self,
                 X,
                 y=None,
                 X_test=None,
                 y_test=None,
                 dataset_metadata=frozenset(),
                 column_descriptions=None):
        dataset_metadata = dict(dataset_metadata)
        super(XYDataManager, self).__init__(
            dataset_metadata.get("dataset_name", "default_dataset_name"))
        X, y, X_test, y_test, feat_grp, column2feat_grp = self.parse_column_descriptions(
            column_descriptions, X, y, X_test, y_test)
        self.column2feat_grp = column2feat_grp
        self.ml_task: MLTask = get_task_from_y(y)
        self.X_train = GenericDataFrame(X, feat_grp=feat_grp)
        self.y_train = y
        self.X_test = GenericDataFrame(
            X_test, feat_grp=feat_grp) if X_test is not None else None
        self.y_test = y_test

        # todo: a user-defined validation set could be specified via RandomShuffle or mlxtend
        # fixme: multilabel is not supported
        if len(y.shape) > 2:
            raise ValueError('y must not have more than two dimensions, '
                             'but has %d.' % len(y.shape))

        if X.shape[0] != y.shape[0]:
            raise ValueError('X and y must have the same number of '
                             'datapoints, but have %d and %d.' %
                             (X.shape[0], y.shape[0]))
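
A minimal construction sketch for the manager above; this is an assumption-laden illustration, not project code: the import of XYDataManager is omitted because its module path is not shown here, and column_descriptions stays None since its format never appears in this excerpt.

    import numpy as np
    import pandas as pd

    # Hypothetical toy data. dataset_metadata must be hashable here
    # (a frozenset of key/value pairs); the constructor turns it back into a dict.
    X = pd.DataFrame({"age": [22.0, 38.0, 26.0], "fare": [7.25, 71.28, 7.92]})
    y = np.array([0, 1, 1])
    dm = XYDataManager(X, y,
                       dataset_metadata=frozenset({"dataset_name": "demo"}.items()))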
Code example #2
 def test_filter_feat_grp(self):
     df = pd.read_csv("../examples/classification/train_classification.csv")
     df2 = GenericDataFrame(df, feat_grp=["id"] + ["num"] * 2 + ["cat"] * 9)
     df3 = df2.filter_feat_grp(["num", "id"])
     self.assertTrue(isinstance(df3, GenericDataFrame))
     self.assertTrue(np.all(df3.origin_grp == pd.Series(["id", "num", "num"])))
     self.assertTrue(np.all(df3.feat_grp == pd.Series(["id", "num", "num"])))
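
Note that the filter argument selects feature groups without reordering columns: although the filter is given as ["num", "id"], both assertions show the result keeps the original ["id", "num", "num"] column order.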
Code example #3
File: delete_nan.py  Project: tqichun/HyperFlow
def is_nan_rejection(X: GenericDataFrame, y, nan_grp):
    # Keep only the rows in which none of the nan_grp columns is NaN,
    # and realign y to the surviving rows.
    selected_df = X.filter_feat_grp(nan_grp)
    deleted_index = selected_df.dropna().index  # despite the name, the index of rows that survive
    if not isinstance(y, (pd.DataFrame, pd.Series)):
        y = pd.Series(y)
    y.index = selected_df.index
    return next(X.split([deleted_index], type="loc")), y[deleted_index].values
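
Ignoring the GenericDataFrame wrapper and its split API, the row selection above boils down to the following plain-pandas sketch (the data and column names are hypothetical):

    import numpy as np
    import pandas as pd

    X = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [4.0, 5.0, np.nan]})
    y = np.array([0, 1, 1])
    kept = X[["a"]].dropna().index  # rows whose nan-group column "a" has no NaN
    X_kept, y_kept = X.loc[kept], pd.Series(y, index=X.index)[kept].values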
Code example #4
 def _transform(self, X_: np.ndarray, X: GenericDataFrame):
     if X_ is None:
         return None
     X_ = self.before_trans_X(X_)
     X_ = self._transform_proc(X_)
     X_ = densify(X_)  # todo: make this a conditional check instead?
     return X.replace_feat_grp(self.in_feat_grp, X_, self.out_feat_grp)
Code example #5
 def _transform(self, X: GenericDataFrame, y):
     columns = X.columns
     feat_grp = X.feat_grp
     origin_grp = X.origin_grp
     X_, y_ = self._transform_proc(X, y)
     X = GenericDataFrame(pd.DataFrame(X_, columns=columns),
                          feat_grp=feat_grp,
                          origin_grp=origin_grp)
     return X, y_
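
Since _transform_proc returns a plain array, the method rebuilds a GenericDataFrame from it so that the column names, feat_grp, and origin_grp captured before the transform carry over unchanged.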
Code example #6
    def test_replace_feat_grp(self):
        df = pd.read_csv("../examples/classification/train_classification.csv")
        suffix = ["num"] * 2 + ["cat"] * 2 + ["num"] * 5 + ["cat"] * 2
        feat_grp = ["id"] + suffix

        df2 = GenericDataFrame(df, feat_grp=feat_grp)
        # test 1->2
        selected = df2.filter_feat_grp("id").values
        selected = np.hstack([selected, selected])
        df3 = df2.replace_feat_grp("id", selected, "id2")
        self.assertTrue(np.all(df3.feat_grp == pd.Series(suffix + ["id2"] * 2)))
        self.assertTrue(np.all(df3.origin_grp == pd.Series(suffix + ["id"] * 2)))
        # test 1->1
        selected = df2.filter_feat_grp("id").values
        selected = np.hstack([selected])
        df3 = df2.replace_feat_grp("id", selected, "id2")
        self.assertTrue(np.all(df3.feat_grp == pd.Series(suffix + ["id2"])))
        self.assertTrue(np.all(df3.origin_grp == pd.Series(suffix + ["id"])))
        # test 1->0
        selected = df2.filter_feat_grp("id").values
        selected = np.zeros([selected.shape[0], 0])
        df3 = df2.replace_feat_grp("id", selected, "id2")
        self.assertTrue(np.all(df3.feat_grp == pd.Series(suffix)))
        self.assertTrue(np.all(df3.origin_grp == pd.Series(suffix)))
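
The three blocks pin down the cardinality behavior of replace_feat_grp: the replaced group may expand to two columns, map to one, or vanish entirely, and any replacement columns are appended at the end with feat_grp set to the new group name while origin_grp keeps recording the group they replaced.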
Code example #7
File: base.py  Project: tqichun/HyperFlow
 def process(
         self, X_origin: Optional[GenericDataFrame]
 ) -> Optional[GenericDataFrame]:
     if X_origin is None:
         return None
     X = X_origin.filter_feat_grp(self.in_feat_grp)
     highR = self.hyperparams.get(self.key1_hp_name, self.key1_default_name)
     lowR = self.hyperparams.get(self.key2_hp_name, self.key2_default_name)
     collection = {
         highR: defaultdict(list),
         lowR: defaultdict(list),
     }
     for i, (col_name, feat_grp, origin_grp) in enumerate(
             zip(X.columns, X.feat_grp, X.origin_grp)):
         col = X.iloc[:, i]
         if col_name in self.info[self.key1]["col_name"]:
             keyname = highR
         else:
             keyname = lowR
         collection[keyname]["X"].append(col)
         collection[keyname]["col_name"].append(col_name)
         collection[keyname]["feat_grp"].append(feat_grp)
         collection[keyname]["origin_grp"].append(origin_grp)
     dfs = []
     for feat_grp_name, dict_ in collection.items():
         X = np.array(dict_["X"]).T
         origin_grp = dict_["origin_grp"]
         feat_grp = [feat_grp_name] * len(origin_grp)
         if X.shape == (0, ):
             X = np.zeros([X_origin.shape[0], 0])
         df = GenericDataFrame(pd.DataFrame(X, columns=dict_["col_name"]),
                               feat_grp=feat_grp,
                               origin_grp=origin_grp)
         dfs.append(df)
     assert len(dfs) == 2
     df = dfs[0].concat_two(dfs[0], dfs[1])
     return X_origin.replace_feat_grp(self.in_feat_grp, df, df.feat_grp,
                                      df.origin_grp)
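
process() routes each column of the in_feat_grp into one of the two groups recorded in self.info during fit, materializes one GenericDataFrame per group (substituting a zero-width placeholder when a group received no columns), concatenates the pair, and splices the result back into X_origin in place of the original group.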
Code example #8
File: base.py  Project: tqichun/HyperFlow
 def fit(self,
         X_train: GenericDataFrame,
         y_train=None,
         X_valid=None,
         y_valid=None,
         X_test=None,
         y_test=None):
     self.threshold = self.hyperparams.get("threshold",
                                           self.default_threshold)
     info = {
         self.key1: defaultdict(list),
         self.key2: defaultdict(list),
     }
     X: GenericDataFrame = X_train.filter_feat_grp(self.in_feat_grp)
     rows = X.shape[0]
     for i, (col_name, feat_grp, origin_grp) in enumerate(
             zip(X.columns, X.feat_grp, X.origin_grp)):
         col = X.iloc[:, i]
         keyname = self.judge_keyname(col, rows)
         info[keyname]["col_name"].append(col_name)
         info[keyname]["feat_grp"].append(feat_grp)
         info[keyname]["origin_grp"].append(origin_grp)
     self.info = info
     return self
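
fit itself transforms nothing: it only classifies each in_feat_grp column via judge_keyname (which receives the column and the row count; presumably the threshold read just above decides the split) and records the resulting column names and group labels in self.info for process() to consume.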
Code example #9
if __name__ == '__main__':
    df = pd.read_csv("../examples/classification/train_classification.csv")
    df2 = GenericDataFrame(df, feat_grp=["id"] + ["num"] * 2 + ["cat"] * 9)
    df3 = df2.filter_feat_grp(["num", "id"])
Code example #10
import logging

import pandas as pd

from hyperflow.pipeline.components.preprocessing.operate.drop import DropAll
from hyperflow.pipeline.components.preprocessing.operate.split.cat import SplitCat
from hyperflow.pipeline.dataframe import GenericDataFrame
from hyperflow.pipeline.pipeline import GenericPipeline

df = pd.read_csv("../examples/classification/train_classification.csv")
y = df.pop("Survived").values
df = df.loc[:, ["Sex", "Ticket", "Pclass"]]

df2 = GenericDataFrame(df, feat_grp=["cat", "cat", "num"])

split_cat = SplitCat()
split_cat.in_feat_grp = "cat"
split_cat.update_hyperparams({
    "highR": "highR_cat",
    "lowR": "lowR_cat",
    "threshold": 0.5
})
result = split_cat.fit_transform(df2)
logging.info(result)

df2 = GenericDataFrame(df, feat_grp=["cat", "cat", "num"])
drop_all = DropAll()
drop_all.in_feat_grp = ["cat", "num"]
drop_all.out_feat_grp = "drop"

split_cat = SplitCat()
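
The script breaks off mid-construction of the second SplitCat. Judging from examples #7 and #8, the first fit_transform relabels each "cat" column as either "highR_cat" or "lowR_cat" against the 0.5 threshold; exactly which per-column statistic judge_keyname compares to that threshold is not shown in these excerpts.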
Code example #11
    def test_pipeline(self):
        self.logger = get_logger(__name__)
        df = pd.read_csv("../examples/classification/train_classification.csv")
        y = df.pop("Survived").values
        df = df.loc[:, ["Sex", "Cabin", "Age"]]
        feat_grp = ["cat_nan", "cat_nan", "num_nan"]
        df_train, df_test, y_train, y_test = train_test_split(df,
                                                              y,
                                                              test_size=0.2,
                                                              random_state=10)
        df_train = GenericDataFrame(df_train, feat_grp=feat_grp)
        df_test = GenericDataFrame(df_test, feat_grp=feat_grp)
        cv = KFold(n_splits=5, random_state=10, shuffle=True)
        train_ix, valid_ix = next(cv.split(df_train))

        df_train, df_valid = df_train.split([train_ix, valid_ix])
        y_valid = y_train[valid_ix]
        y_train = y_train[train_ix]

        fill_cat = FillCat()
        fill_cat.in_feat_grp = "cat_nan"
        fill_cat.out_feat_grp = "cat"
        fill_cat.update_hyperparams({"strategy": "<NULL>"})

        fill_num = FillNum()
        fill_num.in_feat_grp = "num_nan"
        fill_num.out_feat_grp = "num"
        fill_num.update_hyperparams({"strategy": "median"})

        ohe = OneHotEncoder()
        ohe.in_feat_grp = "cat"
        ohe.out_feat_grp = "num"

        sgd = SGD()
        sgd.in_feat_grp = "num"
        sgd.update_hyperparams({"loss": "log", "random_state": 10})

        pipeline = GenericPipeline([
            ("fill_cat", fill_cat),
            ("fill_num", fill_num),
            ("ohe", ohe),
            ("sgd", sgd),
        ])

        pipeline.fit(df_train, y_train, df_valid, y_valid, df_test, y_test)
        pred_train = pipeline.predict(df_train)
        pred_test = pipeline.predict(df_test)
        pred_valid = pipeline.predict(df_valid)
        score_valid = pipeline.predict_proba(df_valid)
        self.logger.info(accuracy_score(y_train, pred_train))
        self.logger.info(accuracy_score(y_valid, pred_valid))
        self.logger.info(accuracy_score(y_test, pred_test))
        result = pipeline.procedure(constants.binary_classification_task,
                                    df_train, y_train, df_valid, y_valid,
                                    df_test, y_test)
        pred_test = result["pred_test"]
        pred_valid = result["pred_valid"]
        self.logger.info(
            accuracy_score(y_valid, (pred_valid > .5).astype("int")[:, 1]))
        self.logger.info(
            accuracy_score(y_test, (pred_test > .5).astype("int")[:, 1]))

        pipeline = GenericPipeline([
            ("fill_cat", fill_cat),
            ("fill_num", fill_num),
            ("ohe", ohe),
        ])

        pipeline.fit(df_train, y_train, df_valid, y_valid, df_test, y_test)
        ret1 = pipeline.transform(df_train, df_valid, df_test)
        ret2 = pipeline.fit_transform(df_train, y_train, df_valid, y_valid,
                                      df_test, y_test)
        for key in ["X_train", "X_valid", "X_test"]:
            assert np.all(ret1[key] == ret2[key])

        pipeline = GenericPipeline([
            ("sgd", sgd),
        ])

        result = pipeline.procedure(constants.binary_classification_task,
                                    ret1["X_train"], y_train, ret1["X_valid"],
                                    y_valid, ret1["X_test"], y_test)
        pred_test = result["pred_test"]
        pred_valid = result["pred_valid"]
        self.logger.info(
            accuracy_score(y_valid, (pred_valid > .5).astype("int")[:, 1]))
        self.logger.info(
            accuracy_score(y_test, (pred_test > .5).astype("int")[:, 1]))
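
Note that procedure returns class-probability arrays rather than labels, which is why pred_valid and pred_test are thresholded at 0.5 and sliced at column 1 (the positive class) before being passed to accuracy_score.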