def __init__(self, X, y=None, X_test=None, y_test=None,
             dataset_metadata=frozenset(), column_descriptions=None):
    dataset_metadata = dict(dataset_metadata)
    super(XYDataManager, self).__init__(
        dataset_metadata.get("dataset_name", "default_dataset_name"))
    X, y, X_test, y_test, feat_grp, column2feat_grp = self.parse_column_descriptions(
        column_descriptions, X, y, X_test, y_test)
    self.column2feat_grp = column2feat_grp
    self.ml_task: MLTask = get_task_from_y(y)
    self.X_train = GenericDataFrame(X, feat_grp=feat_grp)
    self.y_train = y
    self.X_test = GenericDataFrame(X_test, feat_grp=feat_grp) if X_test is not None else None
    self.y_test = y_test if y_test is not None else None
    # todo: allow a user-defined validation set, specified via RandomShuffle or mlxtend
    # fixme: multilabel is not supported
    if len(y.shape) > 2:
        raise ValueError('y must not have more than two dimensions, '
                         'but has %d.' % len(y.shape))
    if X.shape[0] != y.shape[0]:
        raise ValueError('X and y must have the same number of '
                         'datapoints, but have %d and %d.' % (X.shape[0], y.shape[0]))
def test_filter_feat_grp(self):
    df = pd.read_csv("../examples/classification/train_classification.csv")
    df2 = GenericDataFrame(df, feat_grp=["id"] + ["num"] * 2 + ["cat"] * 9)
    df3 = df2.filter_feat_grp(["num", "id"])
    self.assertTrue(isinstance(df3, GenericDataFrame))
    self.assertTrue(np.all(df3.origin_grp == pd.Series(["id", "num", "num"])))
    self.assertTrue(np.all(df3.feat_grp == pd.Series(["id", "num", "num"])))
def is_nan_rejection(X: GenericDataFrame, y, nan_grp):
    selected_df = X.filter_feat_grp(nan_grp)
    # note: despite its name, this index holds the rows that are *kept*
    # after dropping NaNs within the selected feature group
    deleted_index = selected_df.dropna().index
    if not isinstance(y, (pd.DataFrame, pd.Series)):
        y = pd.Series(y)
        y.index = selected_df.index
    return next(X.split([deleted_index], type="loc")), y[deleted_index].values
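# Usage sketch for is_nan_rejection. The toy frame and the "nan" group label below
# are illustrative assumptions, not taken from the library's examples; only
# GenericDataFrame, filter_feat_grp and the function above come from the source.
if __name__ == '__main__':
    import numpy as np
    import pandas as pd

    raw = pd.DataFrame({"Age": [22.0, None, 38.0], "Fare": [7.25, 71.28, 8.05]})
    X_demo = GenericDataFrame(raw, feat_grp=["nan", "num"])
    y_demo = np.array([0, 1, 1])
    # the second row has a NaN in the "nan"-group column, so it is rejected
    # from both X and y
    X_kept, y_kept = is_nan_rejection(X_demo, y_demo, "nan")
    print(X_kept.shape, y_kept)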
def _transform(self, X_: np.ndarray, X: GenericDataFrame):
    if X_ is None:
        return None
    X_ = self.before_trans_X(X_)
    X_ = self._transform_proc(X_)
    X_ = densify(X_)  # todo: make this a conditional check (only densify when needed)?
    return X.replace_feat_grp(self.in_feat_grp, X_, self.out_feat_grp)
def _transform(self, X: GenericDataFrame, y):
    columns = X.columns
    feat_grp = X.feat_grp
    origin_grp = X.origin_grp
    X_, y_ = self._transform_proc(X, y)
    X = GenericDataFrame(pd.DataFrame(X_, columns=columns),
                         feat_grp=feat_grp, origin_grp=origin_grp)
    return X, y_
def test_replace_feat_grp(self):
    df = pd.read_csv("../examples/classification/train_classification.csv")
    suffix = ["num"] * 2 + ["cat"] * 2 + ["num"] * 5 + ["cat"] * 2
    feat_grp = ["id"] + suffix
    df2 = GenericDataFrame(df, feat_grp=feat_grp)
    # test 1->2
    selected = df2.filter_feat_grp("id").values
    selected = np.hstack([selected, selected])
    df3 = df2.replace_feat_grp("id", selected, "id2")
    self.assertTrue(np.all(df3.feat_grp == pd.Series(suffix + ["id2"] * 2)))
    self.assertTrue(np.all(df3.origin_grp == pd.Series(suffix + ["id"] * 2)))
    # test 1->1
    selected = df2.filter_feat_grp("id").values
    selected = np.hstack([selected])
    df3 = df2.replace_feat_grp("id", selected, "id2")
    self.assertTrue(np.all(df3.feat_grp == pd.Series(suffix + ["id2"])))
    self.assertTrue(np.all(df3.origin_grp == pd.Series(suffix + ["id"])))
    # test 1->0
    selected = df2.filter_feat_grp("id").values
    selected = np.zeros([selected.shape[0], 0])
    df3 = df2.replace_feat_grp("id", selected, "id2")
    self.assertTrue(np.all(df3.feat_grp == pd.Series(suffix)))
    self.assertTrue(np.all(df3.origin_grp == pd.Series(suffix)))
def process(
        self, X_origin: Optional[GenericDataFrame]
) -> Optional[GenericDataFrame]:
    if X_origin is None:
        return None
    X = X_origin.filter_feat_grp(self.in_feat_grp)
    highR = self.hyperparams.get(self.key1_hp_name, self.key1_default_name)
    lowR = self.hyperparams.get(self.key2_hp_name, self.key2_default_name)
    collection = {
        highR: defaultdict(list),
        lowR: defaultdict(list),
    }
    # route every column to the highR or lowR bucket decided during fit
    for i, (col_name, feat_grp, origin_grp) in enumerate(
            zip(X.columns, X.feat_grp, X.origin_grp)):
        col = X.iloc[:, i]
        if col_name in self.info[self.key1]["col_name"]:
            keyname = highR
        else:
            keyname = lowR
        collection[keyname]["X"].append(col)
        collection[keyname]["col_name"].append(col_name)
        collection[keyname]["feat_grp"].append(feat_grp)
        collection[keyname]["origin_grp"].append(origin_grp)
    dfs = []
    for feat_grp_name, dict_ in collection.items():
        X = np.array(dict_["X"]).T
        origin_grp = dict_["origin_grp"]
        feat_grp = [feat_grp_name] * len(origin_grp)
        if X.shape == (0,):
            X = np.zeros([X_origin.shape[0], 0])
        df = GenericDataFrame(pd.DataFrame(X, columns=dict_["col_name"]),
                              feat_grp=feat_grp, origin_grp=origin_grp)
        dfs.append(df)
    assert len(dfs) == 2
    df = dfs[0].concat_two(dfs[0], dfs[1])
    return X_origin.replace_feat_grp(self.in_feat_grp, df, df.feat_grp, df.origin_grp)
def fit(self, X_train: GenericDataFrame, y_train=None,
        X_valid=None, y_valid=None, X_test=None, y_test=None):
    self.threshold = self.hyperparams.get("threshold", self.default_threshold)
    info = {
        self.key1: defaultdict(list),
        self.key2: defaultdict(list),
    }
    X: GenericDataFrame = X_train.filter_feat_grp(self.in_feat_grp)
    rows = X.shape[0]
    for i, (col_name, feat_grp, origin_grp) in enumerate(
            zip(X.columns, X.feat_grp, X.origin_grp)):
        col = X.iloc[:, i]
        keyname = self.judge_keyname(col, rows)
        info[keyname]["col_name"].append(col_name)
        info[keyname]["feat_grp"].append(feat_grp)
        info[keyname]["origin_grp"].append(origin_grp)
    self.info = info
    return self
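# A plausible sketch of judge_keyname, assuming it assigns a column to the
# high-cardinality-ratio bucket (self.key1) or the low-cardinality-ratio bucket
# (self.key2) by comparing its unique-value ratio against self.threshold. This
# is an assumption for illustration, not the library's actual implementation.
def judge_keyname(self, col: pd.Series, rows: int) -> str:
    ratio = col.nunique() / rows  # fraction of distinct values in the column
    return self.key1 if ratio > self.threshold else self.key2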
if __name__ == '__main__':
    df = pd.read_csv("../examples/classification/train_classification.csv")
    df2 = GenericDataFrame(df, feat_grp=["id"] + ["num"] * 2 + ["cat"] * 9)
    df3 = df2.filter_feat_grp(["num", "id"])
import logging

import pandas as pd

from hyperflow.pipeline.components.preprocessing.operate.drop import DropAll
from hyperflow.pipeline.components.preprocessing.operate.split.cat import SplitCat
from hyperflow.pipeline.dataframe import GenericDataFrame
from hyperflow.pipeline.pipeline import GenericPipeline

df = pd.read_csv("../examples/classification/train_classification.csv")
y = df.pop("Survived").values
df = df.loc[:, ["Sex", "Ticket", "Pclass"]]
df2 = GenericDataFrame(df, feat_grp=["cat", "cat", "num"])

split_cat = SplitCat()
split_cat.in_feat_grp = "cat"
split_cat.update_hyperparams({
    "highR": "highR_cat",
    "lowR": "lowR_cat",
    "threshold": 0.5
})
result = split_cat.fit_transform(df2)
logging.info(result)

df2 = GenericDataFrame(df, feat_grp=["cat", "cat", "num"])
drop_all = DropAll()
drop_all.in_feat_grp = ["cat", "num"]
drop_all.out_feat_grp = "drop"
split_cat = SplitCat()
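# Illustrative sketch, an assumption rather than a continuation of the original
# script: applying drop_all on its own, mirroring the split_cat call above, so
# that every "cat" and "num" column is routed to the "drop" output group.
result = drop_all.fit_transform(df2)
logging.info(result)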
def test_pipeline(self):
    self.logger = get_logger(__name__)
    df = pd.read_csv("../examples/classification/train_classification.csv")
    y = df.pop("Survived").values
    df = df.loc[:, ["Sex", "Cabin", "Age"]]
    feat_grp = ["cat_nan", "cat_nan", "num_nan"]
    df_train, df_test, y_train, y_test = train_test_split(
        df, y, test_size=0.2, random_state=10)
    df_train = GenericDataFrame(df_train, feat_grp=feat_grp)
    df_test = GenericDataFrame(df_test, feat_grp=feat_grp)
    cv = KFold(n_splits=5, random_state=10, shuffle=True)
    train_ix, valid_ix = next(cv.split(df_train))
    df_train, df_valid = df_train.split([train_ix, valid_ix])
    y_valid = y_train[valid_ix]
    y_train = y_train[train_ix]

    fill_cat = FillCat()
    fill_cat.in_feat_grp = "cat_nan"
    fill_cat.out_feat_grp = "cat"
    fill_cat.update_hyperparams({"strategy": "<NULL>"})

    fill_num = FillNum()
    fill_num.in_feat_grp = "num_nan"
    fill_num.out_feat_grp = "num"
    fill_num.update_hyperparams({"strategy": "median"})

    ohe = OneHotEncoder()
    ohe.in_feat_grp = "cat"
    ohe.out_feat_grp = "num"

    sgd = SGD()
    sgd.in_feat_grp = "num"
    sgd.update_hyperparams({"loss": "log", "random_state": 10})

    pipeline = GenericPipeline([
        ("fill_cat", fill_cat),
        ("fill_num", fill_num),
        ("ohe", ohe),
        ("sgd", sgd),
    ])
    pipeline.fit(df_train, y_train, df_valid, y_valid, df_test, y_test)
    pred_train = pipeline.predict(df_train)
    pred_test = pipeline.predict(df_test)
    pred_valid = pipeline.predict(df_valid)
    score_valid = pipeline.predict_proba(df_valid)
    self.logger.info(accuracy_score(y_train, pred_train))
    self.logger.info(accuracy_score(y_valid, pred_valid))
    self.logger.info(accuracy_score(y_test, pred_test))

    result = pipeline.procedure(constants.binary_classification_task,
                                df_train, y_train, df_valid, y_valid,
                                df_test, y_test)
    pred_test = result["pred_test"]
    pred_valid = result["pred_valid"]
    self.logger.info(
        accuracy_score(y_valid, (pred_valid > .5).astype("int")[:, 1]))
    self.logger.info(
        accuracy_score(y_test, (pred_test > .5).astype("int")[:, 1]))

    pipeline = GenericPipeline([
        ("fill_cat", fill_cat),
        ("fill_num", fill_num),
        ("ohe", ohe),
    ])
    pipeline.fit(df_train, y_train, df_valid, y_valid, df_test, y_test)
    ret1 = pipeline.transform(df_train, df_valid, df_test)
    ret2 = pipeline.fit_transform(df_train, y_train, df_valid, y_valid,
                                  df_test, y_test)
    for key in ["X_train", "X_valid", "X_test"]:
        assert np.all(ret1[key] == ret2[key])

    pipeline = GenericPipeline([
        ("sgd", sgd),
    ])
    result = pipeline.procedure(constants.binary_classification_task,
                                ret1["X_train"], y_train, ret1["X_valid"],
                                y_valid, ret1["X_test"], y_test)
    pred_test = result["pred_test"]
    pred_valid = result["pred_valid"]
    self.logger.info(
        accuracy_score(y_valid, (pred_valid > .5).astype("int")[:, 1]))
    self.logger.info(
        accuracy_score(y_test, (pred_test > .5).astype("int")[:, 1]))