def test_transform(self):
    # training data
    d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]}
    df = pd.DataFrame(data=d)
    # fit binarizer
    lb1 = LabelBinarizer()
    lb1.fit(df, "col1")
    lb2 = LabelBinarizer()
    lb2.fit(df, "col2")
    # test data
    d_test = {"col1": ["c", "c", "a"], "col2": ["e", "d", "w"], "col3": [2, 3, 4]}
    df_test = pd.DataFrame(data=d_test)
    # transform
    df_test = lb1.transform(df_test, "col1")
    df_test = lb2.transform(df_test, "col2")
    # for a binary column, only one value is left and the old column should be deleted
    self.assertTrue("col1_c" in df_test.columns)
    self.assertTrue("col1" not in df_test.columns)
    self.assertEqual(2, np.sum(df_test["col1_c"]))
    # for a multi-value column, all columns should be added
    self.assertTrue("col2_w" in df_test.columns)
    self.assertTrue("col2_e" in df_test.columns)
    self.assertTrue("col2_d" in df_test.columns)
    self.assertTrue("col2" not in df_test.columns)
    self.assertEqual(1, np.sum(df_test["col2_w"]))
    self.assertEqual(1, np.sum(df_test["col2_e"]))
    self.assertEqual(1, np.sum(df_test["col2_d"]))
    # do not touch the continuous attribute
    self.assertTrue("col3" in df_test.columns)
def test_transform_with_new_values(self):
    # training data
    d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]}
    df = pd.DataFrame(data=d)
    # fit binarizer
    lb1 = LabelBinarizer()
    lb1.fit(df, "col1")
    lb2 = LabelBinarizer()
    lb2.fit(df, "col2")
    # test data
    d_test = {"col1": ["c", "d", "d"], "col2": ["g", "e", "f"], "col3": [2, 3, 4]}
    df_test = pd.DataFrame(data=d_test)
    # transform
    df_test = lb1.transform(df_test, "col1")
    df_test = lb2.transform(df_test, "col2")
    self.assertTrue("col1_c" in df_test.columns)
    self.assertTrue("col1_d" not in df_test.columns)
    self.assertTrue("col2_w" in df_test.columns)
    self.assertTrue("col2_e" in df_test.columns)
    self.assertTrue("col2_d" in df_test.columns)
    self.assertTrue("col2_g" not in df_test.columns)
    self.assertTrue("col2_f" not in df_test.columns)
    self.assertEqual(df_test["col1_c"][0], 1)
    self.assertEqual(df_test["col1_c"][1], 0)
    self.assertEqual(df_test["col1_c"][2], 0)
    self.assertEqual(np.sum(df_test["col2_w"]), 0)
    self.assertEqual(np.sum(df_test["col2_d"]), 0)
    self.assertEqual(df_test["col2_e"][0], 0)
    self.assertEqual(df_test["col2_e"][1], 1)
    self.assertEqual(df_test["col2_e"][2], 0)
def test_fit(self):
    # training data
    d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]}
    df = pd.DataFrame(data=d)
    lb = LabelBinarizer()
    # check first column
    lb.fit(df, "col1")
    data_json = lb.to_json()
    self.assertTrue("new_columns" in data_json)
    # we take alphabetical order
    self.assertTrue("col1_c" in data_json["new_columns"])
    self.assertTrue("col1_a" not in data_json["new_columns"])
    self.assertTrue("unique_values" in data_json)
    self.assertTrue("a" in data_json["unique_values"])
    self.assertTrue("c" in data_json["unique_values"])
    lb = LabelBinarizer()
    # check second column
    lb.fit(df, "col2")
    data_json = lb.to_json()
    self.assertTrue("new_columns" in data_json)
    self.assertTrue("col2_w" in data_json["new_columns"])
    self.assertTrue("col2_e" in data_json["new_columns"])
    self.assertTrue("col2_d" in data_json["new_columns"])
    self.assertTrue("unique_values" in data_json)
    self.assertTrue("w" in data_json["unique_values"])
    self.assertTrue("e" in data_json["unique_values"])
    self.assertTrue("d" in data_json["unique_values"])
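# A minimal sketch (for illustration; not the project's actual implementation)
# of the LabelBinarizer contract the tests above exercise. It assumes: unique
# values are kept in sorted (alphabetical) order, a binary column yields a
# single 0/1 column named after the greater value, a multi-value column yields
# one column per value, and values unseen during fit map to all-zero rows.
import numpy as np
import pandas as pd


class LabelBinarizerSketch:
    def __init__(self):
        self._old_column = None
        self._new_columns = []
        self._unique_values = []

    def fit(self, X, column):
        self._old_column = column
        # stringify so booleans serialize cleanly ("col2_True"), then sort alphabetically
        self._unique_values = sorted(X[column].astype(str).unique())
        if len(self._unique_values) == 2:
            # binary column -> keep only the indicator for the second value
            self._new_columns = [f"{column}_{self._unique_values[1]}"]
        else:
            self._new_columns = [f"{column}_{v}" for v in self._unique_values]

    def transform(self, X, column):
        for new_col in self._new_columns:
            value = new_col[len(column) + 1:]
            # rows with values unseen during fit get 0 in every indicator column
            X[new_col] = (X[column].astype(str) == value).astype(int)
        return X.drop(columns=[column])

    def inverse_transform(self, X):
        if len(self._unique_values) == 2:
            X[self._old_column] = np.where(
                X[self._new_columns[0]] == 1,
                self._unique_values[1],
                self._unique_values[0],
            )
        else:
            # per row, recover the value whose indicator column is set
            X[self._old_column] = (
                X[self._new_columns].idxmax(axis=1).str[len(self._old_column) + 1:]
            )
        return X.drop(columns=self._new_columns)

    def to_json(self):
        # plain dict, so json.dumps()/json.loads() round-trips it
        return {
            "new_columns": self._new_columns,
            "unique_values": self._unique_values,
            "old_column": self._old_column,
        }

    def from_json(self, data_json):
        self._new_columns = data_json.get("new_columns", [])
        self._unique_values = data_json.get("unique_values", [])
        self._old_column = data_json.get("old_column")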
def from_json(self, data_json):
    if "remove_columns" in data_json:
        self._remove_columns = data_json.get("remove_columns", [])
    if "missing_values" in data_json:
        self._missing_values = []
        for mv_data in data_json["missing_values"]:
            mv = PreprocessingMissingValues()
            mv.from_json(mv_data)
            self._missing_values += [mv]
    if "categorical" in data_json:
        self._categorical = []
        for cat_data in data_json["categorical"]:
            cat = PreprocessingCategorical()
            cat.from_json(cat_data)
            self._categorical += [cat]
    if "scale" in data_json:
        self._scale = []
        for scale_data in data_json["scale"]:
            sc = Scale()
            sc.from_json(scale_data)
            self._scale += [sc]
    if "categorical_y" in data_json:
        # only a serialized LabelBinarizer carries "new_columns",
        # so its presence tells the two target encoders apart
        if "new_columns" in data_json["categorical_y"]:
            self._categorical_y = LabelBinarizer()
        else:
            self._categorical_y = LabelEncoder()
        self._categorical_y.from_json(data_json["categorical_y"])
    if "scale_y" in data_json:
        self._scale_y = Scale()
        self._scale_y.from_json(data_json["scale_y"])
    if "ml_task" in data_json:
        self._params["ml_task"] = data_json["ml_task"]
def from_json(self, data_json):
    self._params = data_json.get("params", self._params)
    if "remove_columns" in data_json:
        self._remove_columns = data_json.get("remove_columns", [])
    if "missing_values" in data_json:
        self._missing_values = []
        for mv_data in data_json["missing_values"]:
            mv = PreprocessingMissingValues()
            mv.from_json(mv_data)
            self._missing_values += [mv]
    if "categorical" in data_json:
        self._categorical = []
        for cat_data in data_json["categorical"]:
            cat = PreprocessingCategorical()
            cat.from_json(cat_data)
            self._categorical += [cat]
    if "datetime_transforms" in data_json:
        self._datetime_transforms = []
        for dtt_params in data_json["datetime_transforms"]:
            dtt = DateTimeTransformer()
            dtt.from_json(dtt_params)
            self._datetime_transforms += [dtt]
    if "text_transforms" in data_json:
        self._text_transforms = []
        for tt_params in data_json["text_transforms"]:
            tt = TextTransformer()
            tt.from_json(tt_params)
            self._text_transforms += [tt]
    if "golden_features" in data_json:
        self._golden_features = GoldenFeaturesTransformer()
        self._golden_features.from_json(data_json["golden_features"])
    if "scale" in data_json:
        self._scale = []
        for scale_data in data_json["scale"]:
            sc = Scale()
            sc.from_json(scale_data)
            self._scale += [sc]
    if "categorical_y" in data_json:
        if "new_columns" in data_json["categorical_y"]:
            self._categorical_y = LabelBinarizer()
        else:
            self._categorical_y = LabelEncoder()
        self._categorical_y.from_json(data_json["categorical_y"])
    if "scale_y" in data_json:
        self._scale_y = Scale()
        self._scale_y.from_json(data_json["scale_y"])
    if "ml_task" in data_json:
        self._params["ml_task"] = data_json["ml_task"]
    self._add_random_feature = data_json.get("add_random_feature", False)
    self._drop_features = data_json.get("drop_features", [])
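# A hedged usage sketch of the round-trip that from_json above supports. It
# assumes a matching Preprocessing.to_json() exists (the LabelBinarizer tests
# round-trip through json.dumps/json.loads the same way); `params`, `X`, and
# `y` are placeholders, and the constructor signature is an assumption.
import json

preprocessing = Preprocessing(params)  # assumed constructor
X, y = preprocessing.fit_and_transform(X, y)

# serialize to JSON text and rebuild a fresh object from it
data_json = json.loads(json.dumps(preprocessing.to_json()))
restored = Preprocessing(params)
restored.from_json(data_json)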
def inverse_transform(self, X):
    for column, lbl_params in self._convert_params.items():
        if "unique_values" in lbl_params and "new_columns" in lbl_params:
            # convert to one hot
            lbl = LabelBinarizer()
            lbl.from_json(lbl_params)
            X = lbl.inverse_transform(X, column)  # should raise exception
        else:
            # convert to integer
            lbl = LabelEncoder()
            lbl.from_json(lbl_params)
            X.loc[:, column] = lbl.inverse_transform(X.loc[:, column])
    return X
def transform(self, X):
    if (
        self._convert_method == PreprocessingCategorical.CONVERT_LOO
        and self._columns
    ):
        # the leave-one-out encoder was fit on all columns at once,
        # so it transforms the whole frame in one call
        return self._enc.transform(X)
    else:
        for column, lbl_params in self._convert_params.items():
            if "unique_values" in lbl_params and "new_columns" in lbl_params:
                # convert to one hot
                lbl = LabelBinarizer()
                lbl.from_json(lbl_params)
                X = lbl.transform(X, column)
            else:
                # convert to integer
                lbl = LabelEncoder()
                lbl.from_json(lbl_params)
                X.loc[:, column] = lbl.transform(X.loc[:, column])
        return X
def _fit_categorical_convert(self, X):
    for column in self._columns:
        if PreprocessingUtils.get_type(X[column]) != PreprocessingUtils.CATEGORICAL:
            # no need to convert, already a number
            continue
        # limit categories - it is needed when doing one-hot encoding
        # this code is also used in predict.py file
        # and transform_utils.py
        # TODO it needs refactoring !!!
        too_much_categories = len(np.unique(list(X[column].values))) > 200
        lbl = None
        if (
            self._convert_method == PreprocessingCategorical.CONVERT_ONE_HOT
            and not too_much_categories
        ):
            lbl = LabelBinarizer()
            lbl.fit(X, column)
        else:
            lbl = LabelEncoder()
            lbl.fit(X[column])
        if lbl is not None:
            self._convert_params[column] = lbl.to_json()
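# A hedged illustration of the cardinality gate above: even with
# CONVERT_ONE_HOT requested, a column with more than 200 distinct values
# silently falls back to integer label encoding. The column names and the
# 300-row frame are made up; the fit signature follows the older
# fit_and_transform usage shown later in this section.
import pandas as pd

df = pd.DataFrame(
    {
        "low_card": ["a", "b", "c"] * 100,              # 3 categories -> one-hot
        "high_card": [f"id_{i}" for i in range(300)],   # 300 categories -> integers
    }
)
convert = PreprocessingCategorical(
    ["low_card", "high_card"], PreprocessingCategorical.CONVERT_ONE_HOT
)
convert.fit(df)
df = convert.transform(df)
# expected: low_card expanded into 0/1 columns, high_card replaced by integer codes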
def test_inverse_transform(self):
    d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]}
    df = pd.DataFrame(data=d)
    lb = LabelBinarizer()
    # check first column
    lb.fit(df, "col1")
    bb = lb.transform(df, "col1")
    self.assertTrue("col1_c" in bb.columns)
    self.assertTrue(np.sum(bb["col1_c"]) == 1)
    bb = lb.inverse_transform(bb)
    self.assertTrue("col1_c" not in bb.columns)
    # check second column
    lb = LabelBinarizer()
    lb.fit(df, "col2")
    bb = lb.transform(df, "col2")
    self.assertTrue("col2_w" in bb.columns)
    self.assertTrue("col2_e" in bb.columns)
    self.assertTrue("col2_d" in bb.columns)
    self.assertTrue(np.sum(bb["col2_w"]) == 1)
    bb = lb.inverse_transform(bb)
    self.assertTrue("col2_w" not in bb.columns)
def test_to_and_from_json_booleans(self):
    # training data
    d = {"col1": ["a", "a", "c"], "col2": [True, True, False]}
    df = pd.DataFrame(data=d)
    # fit binarizer
    lb1 = LabelBinarizer()
    lb1.fit(df, "col1")
    lb2 = LabelBinarizer()
    lb2.fit(df, "col2")
    # test data
    d_test = {
        "col1": ["c", "c", "a"],
        "col2": [False, False, True],
        "col3": [2, 3, 4],
    }
    df_test = pd.DataFrame(data=d_test)
    # to json and from json
    new_lb1 = LabelBinarizer()
    new_lb2 = LabelBinarizer()
    new_lb1.from_json(lb1.to_json())
    new_lb2.from_json(json.loads(json.dumps(lb2.to_json(), indent=4)))
    # transform
    df_test = new_lb1.transform(df_test, "col1")
    df_test = new_lb2.transform(df_test, "col2")
    # for a binary column, only one value is left and the old column should be deleted
    self.assertTrue("col1_c" in df_test.columns)
    self.assertTrue("col1" not in df_test.columns)
    self.assertEqual(2, np.sum(df_test["col1_c"]))
    # the boolean column is binary too, so only the indicator for True is kept
    self.assertTrue("col2_True" in df_test.columns)
    self.assertTrue("col2" not in df_test.columns)
    self.assertEqual(1, np.sum(df_test["col2_True"]))
    # do not touch the continuous attribute
    self.assertTrue("col3" in df_test.columns)
def fit_and_transform(self, X_train, y_train):
    logger.debug("Preprocessing.fit_and_transform")

    if y_train is not None:
        # target preprocessing
        # this must be used first, maybe we will drop some rows because of missing target values
        target_preprocessing = self._params.get("target_preprocessing")
        logger.debug("target_preprocessing params: {}".format(target_preprocessing))

        X_train, y_train = ExcludeRowsMissingTarget.transform(X_train, y_train)

        if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
            logger.debug("Convert target to integer")
            self._categorical_y = LabelEncoder()
            self._categorical_y.fit(y_train)
            y_train = pd.Series(self._categorical_y.transform(y_train))

        if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
            logger.debug("Convert target to one-hot coding")
            self._categorical_y = LabelBinarizer()
            self._categorical_y.fit(pd.DataFrame({"target": y_train}), "target")
            y_train = self._categorical_y.transform(
                pd.DataFrame({"target": y_train}), "target"
            )

        if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
            logger.debug("Scale log and normal")
            self._scale_y = Scale(["target"], scale_method=Scale.SCALE_LOG_AND_NORMAL)
            y_train = pd.DataFrame({"target": y_train})
            self._scale_y.fit(y_train)
            y_train = self._scale_y.transform(y_train)
            y_train = y_train["target"]

        if Scale.SCALE_NORMAL in target_preprocessing:
            logger.debug("Scale normal")
            self._scale_y = Scale(["target"], scale_method=Scale.SCALE_NORMAL)
            y_train = pd.DataFrame({"target": y_train})
            self._scale_y.fit(y_train)
            y_train = self._scale_y.transform(y_train)
            y_train = y_train["target"]

    # columns preprocessing
    columns_preprocessing = self._params.get("columns_preprocessing")
    for column in columns_preprocessing:
        transforms = columns_preprocessing[column]
        # logger.debug("Preprocess column {} with: {}".format(column, transforms))

    # remove empty or constant columns
    cols_to_remove = list(
        filter(
            lambda k: "remove_column" in columns_preprocessing[k],
            columns_preprocessing,
        )
    )
    if X_train is not None:
        X_train.drop(cols_to_remove, axis=1, inplace=True)
    self._remove_columns = cols_to_remove

    for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
        cols_to_process = list(
            filter(
                lambda k: missing_method in columns_preprocessing[k],
                columns_preprocessing,
            )
        )
        missing = PreprocessingMissingValues(cols_to_process, missing_method)
        missing.fit(X_train)
        X_train = missing.transform(X_train)
        self._missing_values += [missing]

    for convert_method in [PreprocessingCategorical.CONVERT_INTEGER]:
        cols_to_process = list(
            filter(
                lambda k: convert_method in columns_preprocessing[k],
                columns_preprocessing,
            )
        )
        convert = PreprocessingCategorical(cols_to_process, convert_method)
        convert.fit(X_train)
        X_train = convert.transform(X_train)
        self._categorical += [convert]

    # SCALE
    for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]:
        cols_to_process = list(
            filter(
                lambda k: scale_method in columns_preprocessing[k],
                columns_preprocessing,
            )
        )
        if len(cols_to_process):
            scale = Scale(cols_to_process)
            scale.fit(X_train)
            X_train = scale.transform(X_train)
            self._scale += [scale]

    return X_train, y_train
def fit_and_transform(self, X_train, y_train, sample_weight=None):
    logger.debug("Preprocessing.fit_and_transform")

    if y_train is not None:
        # target preprocessing
        # this must be used first, maybe we will drop some rows because of missing target values
        target_preprocessing = self._params.get("target_preprocessing")
        logger.debug("target_preprocessing params: {}".format(target_preprocessing))

        X_train, y_train, sample_weight = ExcludeRowsMissingTarget.transform(
            X_train, y_train, sample_weight
        )

        if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
            logger.debug("Convert target to integer")
            self._categorical_y = LabelEncoder(try_to_fit_numeric=True)
            self._categorical_y.fit(y_train)
            y_train = pd.Series(self._categorical_y.transform(y_train))

        if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
            logger.debug("Convert target to one-hot coding")
            self._categorical_y = LabelBinarizer()
            self._categorical_y.fit(pd.DataFrame({"target": y_train}), "target")
            y_train = self._categorical_y.transform(
                pd.DataFrame({"target": y_train}), "target"
            )

        if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
            logger.debug("Scale log and normal")
            self._scale_y = Scale(["target"], scale_method=Scale.SCALE_LOG_AND_NORMAL)
            y_train = pd.DataFrame({"target": y_train})
            self._scale_y.fit(y_train)
            y_train = self._scale_y.transform(y_train)
            y_train = y_train["target"]

        if Scale.SCALE_NORMAL in target_preprocessing:
            logger.debug("Scale normal")
            self._scale_y = Scale(["target"], scale_method=Scale.SCALE_NORMAL)
            y_train = pd.DataFrame({"target": y_train})
            self._scale_y.fit(y_train)
            y_train = self._scale_y.transform(y_train)
            y_train = y_train["target"]

    # columns preprocessing
    columns_preprocessing = self._params.get("columns_preprocessing")
    for column in columns_preprocessing:
        transforms = columns_preprocessing[column]
        # logger.debug("Preprocess column {} with: {}".format(column, transforms))

    # remove empty or constant columns
    cols_to_remove = list(
        filter(
            lambda k: "remove_column" in columns_preprocessing[k],
            columns_preprocessing,
        )
    )
    if X_train is not None:
        X_train.drop(cols_to_remove, axis=1, inplace=True)
    self._remove_columns = cols_to_remove

    # get numeric cols before text transformations
    # needed for golden features
    numeric_cols = []
    if X_train is not None and (
        "golden_features" in self._params or "kmeans_features" in self._params
    ):
        numeric_cols = X_train.select_dtypes(include="number").columns.tolist()

    # there can be missing values in the text data,
    # but we don't want to handle it by fill missing methods
    # zeros will be imputed by text_transform method
    cols_to_process = list(
        filter(
            lambda k: "text_transform" in columns_preprocessing[k],
            columns_preprocessing,
        )
    )
    new_text_columns = []
    for col in cols_to_process:
        t = TextTransformer()
        t.fit(X_train, col)
        X_train = t.transform(X_train)
        self._text_transforms += [t]
        new_text_columns += t._new_columns
    # end of text transform

    for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
        cols_to_process = list(
            filter(
                lambda k: missing_method in columns_preprocessing[k],
                columns_preprocessing,
            )
        )
        missing = PreprocessingMissingValues(cols_to_process, missing_method)
        missing.fit(X_train)
        X_train = missing.transform(X_train)
        self._missing_values += [missing]

    # golden features
    golden_columns = []
    if "golden_features" in self._params:
        results_path = self._params["golden_features"]["results_path"]
        ml_task = self._params["golden_features"]["ml_task"]
        self._golden_features = GoldenFeaturesTransformer(results_path, ml_task)
        self._golden_features.fit(X_train[numeric_cols], y_train)
        X_train = self._golden_features.transform(X_train)
        golden_columns = self._golden_features._new_columns

    kmeans_columns = []
    if "kmeans_features" in self._params:
        results_path = self._params["kmeans_features"]["results_path"]
        self._kmeans = KMeansTransformer(results_path, self._model_name, self._k_fold)
        self._kmeans.fit(X_train[numeric_cols], y_train)
        X_train = self._kmeans.transform(X_train)
        kmeans_columns = self._kmeans._new_features

    for convert_method in [
        PreprocessingCategorical.CONVERT_INTEGER,
        PreprocessingCategorical.CONVERT_ONE_HOT,
        PreprocessingCategorical.CONVERT_LOO,
    ]:
        cols_to_process = list(
            filter(
                lambda k: convert_method in columns_preprocessing[k],
                columns_preprocessing,
            )
        )
        convert = PreprocessingCategorical(cols_to_process, convert_method)
        convert.fit(X_train, y_train)
        X_train = convert.transform(X_train)
        self._categorical += [convert]

    # datetime transform
    cols_to_process = list(
        filter(
            lambda k: "datetime_transform" in columns_preprocessing[k],
            columns_preprocessing,
        )
    )
    new_datetime_columns = []
    for col in cols_to_process:
        t = DateTimeTransformer()
        t.fit(X_train, col)
        X_train = t.transform(X_train)
        self._datetime_transforms += [t]
        new_datetime_columns += t._new_columns

    # SCALE
    for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]:
        cols_to_process = list(
            filter(
                lambda k: scale_method in columns_preprocessing[k],
                columns_preprocessing,
            )
        )
        if (
            len(cols_to_process)
            and len(new_datetime_columns)
            and scale_method == Scale.SCALE_NORMAL
        ):
            cols_to_process += new_datetime_columns
        if (
            len(cols_to_process)
            and len(new_text_columns)
            and scale_method == Scale.SCALE_NORMAL
        ):
            cols_to_process += new_text_columns
        if (
            len(cols_to_process)
            and len(golden_columns)
            and scale_method == Scale.SCALE_NORMAL
        ):
            cols_to_process += golden_columns
        if (
            len(cols_to_process)
            and len(kmeans_columns)
            and scale_method == Scale.SCALE_NORMAL
        ):
            cols_to_process += kmeans_columns
        if len(cols_to_process):
            scale = Scale(cols_to_process)
            scale.fit(X_train)
            X_train = scale.transform(X_train)
            self._scale += [scale]

    if self._add_random_feature:
        # random feature in range [-1, 1] with 0 mean
        X_train["random_feature"] = np.random.rand(X_train.shape[0]) * 2.0 - 1.0

    if self._drop_features:
        available_cols = X_train.columns.tolist()
        drop_cols = [c for c in self._drop_features if c in available_cols]
        if len(drop_cols) == X_train.shape[1]:
            raise AutoMLException(
                "All features are dropped! Your data looks like random data."
            )
        if drop_cols:
            X_train.drop(drop_cols, axis=1, inplace=True)
        self._drop_features = drop_cols

    if X_train is not None:
        # there can be categorical columns (in CatBoost) which can't be clipped
        numeric_cols = X_train.select_dtypes(include="number").columns.tolist()
        X_train[numeric_cols] = X_train[numeric_cols].clip(
            lower=np.finfo(np.float32).min + 1000,
            upper=np.finfo(np.float32).max - 1000,
        )

    return X_train, y_train, sample_weight
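# A hedged usage sketch of fit_and_transform above. The params layout is
# inferred from the lookups in the method ("target_preprocessing" is a list of
# transform names; "columns_preprocessing" maps each column to its list of
# transforms); the column names, the exact schema, and the Preprocessing
# constructor are illustrative assumptions, and X_train, y_train,
# sample_weight are placeholders.
params = {
    "target_preprocessing": [PreprocessingCategorical.CONVERT_INTEGER],
    "columns_preprocessing": {
        "age": [PreprocessingMissingValues.FILL_NA_MEDIAN, Scale.SCALE_NORMAL],
        "city": [PreprocessingCategorical.CONVERT_ONE_HOT],
        "notes": ["text_transform"],
        "signup_date": ["datetime_transform"],
        "constant_col": ["remove_column"],
    },
}
preprocessing = Preprocessing(params)  # assumed constructor
X_train, y_train, sample_weight = preprocessing.fit_and_transform(
    X_train, y_train, sample_weight
)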
def fit_and_transform(self, X_train, y_train):
    logger.debug("Preprocessing.fit_and_transform")

    if y_train is not None:
        # target preprocessing
        # this must be used first, maybe we will drop some rows because of missing target values
        target_preprocessing = self._params.get("target_preprocessing")
        logger.debug("target_preprocessing params: {}".format(target_preprocessing))

        X_train, y_train = ExcludeRowsMissingTarget.transform(X_train, y_train)

        if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
            logger.debug("Convert target to integer")
            self._categorical_y = LabelEncoder()
            self._categorical_y.fit(y_train)
            y_train = pd.Series(self._categorical_y.transform(y_train))

        if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
            logger.debug("Convert target to one-hot coding")
            self._categorical_y = LabelBinarizer()
            self._categorical_y.fit(pd.DataFrame({"target": y_train}), "target")
            y_train = self._categorical_y.transform(
                pd.DataFrame({"target": y_train}), "target"
            )

        if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
            logger.debug("Scale log and normal")
            self._scale_y = Scale(["target"], scale_method=Scale.SCALE_LOG_AND_NORMAL)
            y_train = pd.DataFrame({"target": y_train})
            self._scale_y.fit(y_train)
            y_train = self._scale_y.transform(y_train)
            y_train = y_train["target"]

        if Scale.SCALE_NORMAL in target_preprocessing:
            logger.debug("Scale normal")
            self._scale_y = Scale(["target"], scale_method=Scale.SCALE_NORMAL)
            y_train = pd.DataFrame({"target": y_train})
            self._scale_y.fit(y_train)
            y_train = self._scale_y.transform(y_train)
            y_train = y_train["target"]

    # columns preprocessing
    columns_preprocessing = self._params.get("columns_preprocessing")
    for column in columns_preprocessing:
        transforms = columns_preprocessing[column]
        # logger.debug("Preprocess column {} with: {}".format(column, transforms))

    # remove empty or constant columns
    cols_to_remove = list(
        filter(
            lambda k: "remove_column" in columns_preprocessing[k],
            columns_preprocessing,
        )
    )
    if X_train is not None:
        X_train.drop(cols_to_remove, axis=1, inplace=True)
    self._remove_columns = cols_to_remove

    # there can be missing values in the text data,
    # but we don't want to handle it by fill missing methods
    # zeros will be imputed by text_transform method
    cols_to_process = list(
        filter(
            lambda k: "text_transform" in columns_preprocessing[k],
            columns_preprocessing,
        )
    )
    new_text_columns = []
    for col in cols_to_process:
        t = TextTransformer()
        t.fit(X_train, col)
        X_train = t.transform(X_train)
        self._text_transforms += [t]
        new_text_columns += t._new_columns
    # end of text transform

    for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
        cols_to_process = list(
            filter(
                lambda k: missing_method in columns_preprocessing[k],
                columns_preprocessing,
            )
        )
        missing = PreprocessingMissingValues(cols_to_process, missing_method)
        missing.fit(X_train)
        X_train = missing.transform(X_train)
        self._missing_values += [missing]

    for convert_method in [
        PreprocessingCategorical.CONVERT_INTEGER,
        PreprocessingCategorical.CONVERT_ONE_HOT,
    ]:
        cols_to_process = list(
            filter(
                lambda k: convert_method in columns_preprocessing[k],
                columns_preprocessing,
            )
        )
        convert = PreprocessingCategorical(cols_to_process, convert_method)
        convert.fit(X_train)
        X_train = convert.transform(X_train)
        self._categorical += [convert]

    # datetime transform
    cols_to_process = list(
        filter(
            lambda k: "datetime_transform" in columns_preprocessing[k],
            columns_preprocessing,
        )
    )
    new_datetime_columns = []
    for col in cols_to_process:
        t = DateTimeTransformer()
        t.fit(X_train, col)
        X_train = t.transform(X_train)
        self._datetime_transforms += [t]
        new_datetime_columns += t._new_columns

    # SCALE
    for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]:
        cols_to_process = list(
            filter(
                lambda k: scale_method in columns_preprocessing[k],
                columns_preprocessing,
            )
        )
        if (
            len(cols_to_process)
            and len(new_datetime_columns)
            and scale_method == Scale.SCALE_NORMAL
        ):
            cols_to_process += new_datetime_columns
        if (
            len(cols_to_process)
            and len(new_text_columns)
            and scale_method == Scale.SCALE_NORMAL
        ):
            cols_to_process += new_text_columns
        if len(cols_to_process):
            scale = Scale(cols_to_process)
            scale.fit(X_train)
            X_train = scale.transform(X_train)
            self._scale += [scale]

    return X_train, y_train