Example #1
    def test_transform(self):
        # training data
        d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]}
        df = pd.DataFrame(data=d)
        # fit binarizer
        lb1 = LabelBinarizer()
        lb1.fit(df, "col1")
        lb2 = LabelBinarizer()
        lb2.fit(df, "col2")
        # test data
        d_test = {"col1": ["c", "c", "a"], "col2": ["e", "d", "w"], "col3": [2, 3, 4]}
        df_test = pd.DataFrame(data=d_test)
        # transform
        df_test = lb1.transform(df_test, "col1")
        df_test = lb2.transform(df_test, "col2")
        # for binary column, only one value is left, old column should be deleted
        self.assertTrue("col1_c" in df_test.columns)
        self.assertTrue("col1" not in df_test.columns)
        self.assertEqual(2, np.sum(df_test["col1_c"]))
        # for multiple value column, all columns should be added
        self.assertTrue("col2_w" in df_test.columns)
        self.assertTrue("col2_e" in df_test.columns)
        self.assertTrue("col2_d" in df_test.columns)
        self.assertTrue("col2" not in df_test.columns)
        self.assertEqual(1, np.sum(df_test["col2_w"]))
        self.assertEqual(1, np.sum(df_test["col2_e"]))
        self.assertEqual(1, np.sum(df_test["col2_d"]))
        # do not touch continuous attribute
        self.assertTrue("col3" in df_test.columns)
Example #2
    def test_transform_with_new_values(self):
        # training data
        d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]}
        df = pd.DataFrame(data=d)
        # fit binarizer
        lb1 = LabelBinarizer()
        lb1.fit(df, "col1")
        lb2 = LabelBinarizer()
        lb2.fit(df, "col2")
        # test data
        d_test = {"col1": ["c", "d", "d"], "col2": ["g", "e", "f"], "col3": [2, 3, 4]}
        df_test = pd.DataFrame(data=d_test)
        # transform
        df_test = lb1.transform(df_test, "col1")
        df_test = lb2.transform(df_test, "col2")
        self.assertTrue("col1_c" in df_test.columns)
        self.assertTrue("col1_d" not in df_test.columns)
        self.assertTrue("col2_w" in df_test.columns)
        self.assertTrue("col2_e" in df_test.columns)
        self.assertTrue("col2_d" in df_test.columns)
        self.assertTrue("col2_g" not in df_test.columns)
        self.assertTrue("col2_f" not in df_test.columns)
        self.assertEqual(df_test["col1_c"][0], 1)
        self.assertEqual(df_test["col1_c"][1], 0)
        self.assertEqual(df_test["col1_c"][2], 0)
        self.assertEqual(np.sum(df_test["col2_w"]), 0)
        self.assertEqual(np.sum(df_test["col2_d"]), 0)
        self.assertEqual(df_test["col2_e"][0], 0)
        self.assertEqual(df_test["col2_e"][1], 1)
        self.assertEqual(df_test["col2_e"][2], 0)
Example #3
    def test_fit(self):
        # training data
        d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]}
        df = pd.DataFrame(data=d)
        lb = LabelBinarizer()
        # check first column
        lb.fit(df, "col1")
        data_json = lb.to_json()
        self.assertTrue("new_columns" in data_json)
        # we take alphabetical order
        self.assertTrue("col1_c" in data_json["new_columns"])
        self.assertTrue("col1_a" not in data_json["new_columns"])
        self.assertTrue("unique_values" in data_json)
        self.assertTrue("a" in data_json["unique_values"])
        self.assertTrue("c" in data_json["unique_values"])

        lb = LabelBinarizer()
        # check second column
        lb.fit(df, "col2")
        data_json = lb.to_json()
        self.assertTrue("new_columns" in data_json)
        self.assertTrue("col2_w" in data_json["new_columns"])
        self.assertTrue("col2_e" in data_json["new_columns"])
        self.assertTrue("col2_d" in data_json["new_columns"])
        self.assertTrue("unique_values" in data_json)
        self.assertTrue("w" in data_json["unique_values"])
        self.assertTrue("e" in data_json["unique_values"])
        self.assertTrue("d" in data_json["unique_values"])
Example #4
    def from_json(self, data_json):

        if "remove_columns" in data_json:
            self._remove_columns = data_json.get("remove_columns", [])
        if "missing_values" in data_json:
            self._missing_values = []
            for mv_data in data_json["missing_values"]:
                mv = PreprocessingMissingValues()
                mv.from_json(mv_data)
                self._missing_values += [mv]
        if "categorical" in data_json:
            self._categorical = []
            for cat_data in data_json["categorical"]:
                cat = PreprocessingCategorical()
                cat.from_json(cat_data)
                self._categorical += [cat]
        if "scale" in data_json:
            self._scale = []
            for scale_data in data_json["scale"]:
                sc = Scale()
                sc.from_json(scale_data)
                self._scale += [sc]
        if "categorical_y" in data_json:
            if "new_columns" in data_json["categorical_y"]:
                self._categorical_y = LabelBinarizer()
            else:
                self._categorical_y = LabelEncoder()

            self._categorical_y.from_json(data_json["categorical_y"])
        if "scale_y" in data_json:
            self._scale_y = Scale()
            self._scale_y.from_json(data_json["scale_y"])
        if "ml_task" in data_json:
            self._params["ml_task"] = data_json["ml_task"]
Example #5
    def from_json(self, data_json):

        self._params = data_json.get("params", self._params)

        if "remove_columns" in data_json:
            self._remove_columns = data_json.get("remove_columns", [])
        if "missing_values" in data_json:
            self._missing_values = []
            for mv_data in data_json["missing_values"]:
                mv = PreprocessingMissingValues()
                mv.from_json(mv_data)
                self._missing_values += [mv]
        if "categorical" in data_json:
            self._categorical = []
            for cat_data in data_json["categorical"]:
                cat = PreprocessingCategorical()
                cat.from_json(cat_data)
                self._categorical += [cat]

        if "datetime_transforms" in data_json:
            self._datetime_transforms = []
            for dtt_params in data_json["datetime_transforms"]:
                dtt = DateTimeTransformer()
                dtt.from_json(dtt_params)
                self._datetime_transforms += [dtt]

        if "text_transforms" in data_json:
            self._text_transforms = []
            for tt_params in data_json["text_transforms"]:
                tt = TextTransformer()
                tt.from_json(tt_params)
                self._text_transforms += [tt]

        if "golden_features" in data_json:
            self._golden_features = GoldenFeaturesTransformer()
            self._golden_features.from_json(data_json["golden_features"])

        if "scale" in data_json:
            self._scale = []
            for scale_data in data_json["scale"]:
                sc = Scale()
                sc.from_json(scale_data)
                self._scale += [sc]
        if "categorical_y" in data_json:
            if "new_columns" in data_json["categorical_y"]:
                self._categorical_y = LabelBinarizer()
            else:
                self._categorical_y = LabelEncoder()

            self._categorical_y.from_json(data_json["categorical_y"])
        if "scale_y" in data_json:
            self._scale_y = Scale()
            self._scale_y.from_json(data_json["scale_y"])
        if "ml_task" in data_json:
            self._params["ml_task"] = data_json["ml_task"]

        self._add_random_feature = data_json.get("add_random_feature", False)
        self._drop_features = data_json.get("drop_features", [])
Example #6
    def inverse_transform(self, X):
        for column, lbl_params in self._convert_params.items():
            if "unique_values" in lbl_params and "new_columns" in lbl_params:
                # convert to one hot
                lbl = LabelBinarizer()
                lbl.from_json(lbl_params)
                X = lbl.inverse_transform(X, column)  # should raise exception
            else:
                # convert to integer
                lbl = LabelEncoder()
                lbl.from_json(lbl_params)
                X.loc[:, column] = lbl.inverse_transform(X.loc[:, column])

        return X
Example #7
    def transform(self, X):
        if (self._convert_method == PreprocessingCategorical.CONVERT_LOO
                and self._columns):
            return self._enc.transform(X)
        else:
            for column, lbl_params in self._convert_params.items():
                if "unique_values" in lbl_params and "new_columns" in lbl_params:
                    # convert to one hot
                    lbl = LabelBinarizer()
                    lbl.from_json(lbl_params)
                    X = lbl.transform(X, column)
                else:
                    # convert to integer
                    lbl = LabelEncoder()
                    lbl.from_json(lbl_params)
                    X.loc[:, column] = lbl.transform(X.loc[:, column])

            return X
Example #8
    def _fit_categorical_convert(self, X):
        for column in self._columns:

            if PreprocessingUtils.get_type(
                    X[column]) != PreprocessingUtils.CATEGORICAL:
                # no need to convert, already a number
                continue
            # limit categories - it is needed when doing one hot encoding
            # this code is also used in predict.py file
            # and transform_utils.py
            # TODO it needs refactoring !!!
            too_much_categories = len(np.unique(list(X[column].values))) > 200
            lbl = None
            if (self._convert_method
                    == PreprocessingCategorical.CONVERT_ONE_HOT
                    and not too_much_categories):
                lbl = LabelBinarizer()
                lbl.fit(X, column)
            else:
                lbl = LabelEncoder()
                lbl.fit(X[column])

            if lbl is not None:
                self._convert_params[column] = lbl.to_json()
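
After this loop, self._convert_params maps each converted column to the JSON of its fitted encoder. A rough illustration for the toy frame used in the tests above: the LabelBinarizer entry follows the shape asserted in Example #3, while the LabelEncoder entry has no "new_columns" key (which is what the from_json/transform dispatch relies on); its exact contents are an assumption.

# Illustration only; the exact LabelEncoder JSON contents are assumed.
convert_params = {
    "col1": {"new_columns": ["col1_c"], "unique_values": ["a", "c"]},  # one-hot encoded
    "col2": {"unique_values": ["d", "e", "w"]},                        # integer encoded
}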
Example #9
    def test_inverse_transform(self):
        d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]}
        df = pd.DataFrame(data=d)
        lb = LabelBinarizer()
        # check first column
        lb.fit(df, "col1")
        bb = lb.transform(df, "col1")
        self.assertTrue("col1_c" in bb.columns)
        self.assertTrue(np.sum(bb["col1_c"]) == 1)
        bb = lb.inverse_transform(bb)
        self.assertTrue("col1_c" not in bb.columns)
        # check second column
        lb = LabelBinarizer()
        lb.fit(df, "col2")
        bb = lb.transform(df, "col2")
        self.assertTrue("col2_w" in bb.columns)
        self.assertTrue("col2_e" in bb.columns)
        self.assertTrue("col2_d" in bb.columns)
        self.assertTrue(np.sum(bb["col2_w"]) == 1)
        bb = lb.inverse_transform(bb)
        self.assertTrue("col2_w" not in bb.columns)
Example #10
    def test_to_and_from_json_booleans(self):
        # training data
        d = {"col1": ["a", "a", "c"], "col2": [True, True, False]}
        df = pd.DataFrame(data=d)
        # fit binarizer
        lb1 = LabelBinarizer()
        lb1.fit(df, "col1")
        lb2 = LabelBinarizer()
        lb2.fit(df, "col2")
        # test data
        d_test = {
            "col1": ["c", "c", "a"],
            "col2": [False, False, True],
            "col3": [2, 3, 4],
        }
        df_test = pd.DataFrame(data=d_test)
        # to json and from json
        new_lb1 = LabelBinarizer()
        new_lb2 = LabelBinarizer()
        new_lb1.from_json(lb1.to_json())
        new_lb2.from_json(json.loads(json.dumps(lb2.to_json(), indent=4)))

        # transform
        df_test = new_lb1.transform(df_test, "col1")
        df_test = new_lb2.transform(df_test, "col2")
        # for binary column, only one value is left, old column should be deleted
        self.assertTrue("col1_c" in df_test.columns)
        self.assertTrue("col1" not in df_test.columns)
        self.assertEqual(2, np.sum(df_test["col1_c"]))
        # for multiple value column, all columns should be added
        self.assertTrue("col2_True" in df_test.columns)
        self.assertTrue("col2" not in df_test.columns)
        self.assertEqual(1, np.sum(df_test["col2_True"]))
        # do not touch continuous attribute
        self.assertTrue("col3" in df_test.columns)
Example #11
    def fit_and_transform(self, X_train, y_train):
        logger.debug("Preprocessing.fit_and_transform")

        if y_train is not None:
            # target preprocessing
            # this must be used first, maybe we will drop some rows because of missing target values
            target_preprocessing = self._params.get("target_preprocessing")
            logger.debug(
                "target_preprocessing params: {}".format(target_preprocessing))

            X_train, y_train = ExcludeRowsMissingTarget.transform(
                X_train, y_train)

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                logger.debug("Convert target to integer")
                self._categorical_y = LabelEncoder()
                self._categorical_y.fit(y_train)
                y_train = pd.Series(self._categorical_y.transform(y_train))

            if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
                logger.debug("Convert target to one-hot coding")
                self._categorical_y = LabelBinarizer()
                self._categorical_y.fit(pd.DataFrame({"target": y_train}),
                                        "target")
                y_train = self._categorical_y.transform(
                    pd.DataFrame({"target": y_train}), "target")

            if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                logger.debug("Scale log and normal")

                self._scale_y = Scale(["target"],
                                      scale_method=Scale.SCALE_LOG_AND_NORMAL)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

            if Scale.SCALE_NORMAL in target_preprocessing:
                logger.debug("Scale normal")

                self._scale_y = Scale(["target"],
                                      scale_method=Scale.SCALE_NORMAL)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

        # columns preprocessing
        columns_preprocessing = self._params.get("columns_preprocessing")
        for column in columns_preprocessing:
            transforms = columns_preprocessing[column]
            # logger.debug("Preprocess column {} with: {}".format(column, transforms))

        # remove empty or constant columns
        cols_to_remove = list(
            filter(
                lambda k: "remove_column" in columns_preprocessing[k],
                columns_preprocessing,
            ))

        if X_train is not None:
            X_train.drop(cols_to_remove, axis=1, inplace=True)
        self._remove_columns = cols_to_remove

        for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
            cols_to_process = list(
                filter(
                    lambda k: missing_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            missing = PreprocessingMissingValues(cols_to_process,
                                                 missing_method)
            missing.fit(X_train)
            X_train = missing.transform(X_train)
            self._missing_values += [missing]

        for convert_method in [PreprocessingCategorical.CONVERT_INTEGER]:
            cols_to_process = list(
                filter(
                    lambda k: convert_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            convert = PreprocessingCategorical(cols_to_process, convert_method)
            convert.fit(X_train)
            X_train = convert.transform(X_train)
            self._categorical += [convert]

        # SCALE
        for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]:
            cols_to_process = list(
                filter(
                    lambda k: scale_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            if len(cols_to_process):
                scale = Scale(cols_to_process)
                scale.fit(X_train)
                X_train = scale.transform(X_train)
                self._scale += [scale]

        return X_train, y_train
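
Every loop in this method is driven by a plain dict read through self._params. A sketch of the layout it expects, inferred from the keys and constants the method looks up; the column names are placeholders and the import paths are assumed (the Preprocessing constructor itself is not shown here):

# Hypothetical params layout inferred from the lookups in fit_and_transform above.
# Import paths assumed for the mljar-supervised package.
from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical
from supervised.preprocessing.preprocessing_missing import PreprocessingMissingValues
from supervised.preprocessing.scale import Scale

params = {
    # applied to y_train first (rows with a missing target are dropped before this)
    "target_preprocessing": [PreprocessingCategorical.CONVERT_INTEGER],
    # per-column lists of transforms; each loop above filters on membership
    "columns_preprocessing": {
        "constant_col": ["remove_column"],
        "col_with_missing": [PreprocessingMissingValues.FILL_NA_MEDIAN],
        "categorical_col": [PreprocessingCategorical.CONVERT_INTEGER],
        "numeric_col": [Scale.SCALE_NORMAL],
    },
}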
Example #12
    def fit_and_transform(self, X_train, y_train, sample_weight=None):
        logger.debug("Preprocessing.fit_and_transform")

        if y_train is not None:
            # target preprocessing
            # this must be used first, maybe we will drop some rows because of missing target values
            target_preprocessing = self._params.get("target_preprocessing")
            logger.debug(
                "target_preprocessing params: {}".format(target_preprocessing))

            X_train, y_train, sample_weight = ExcludeRowsMissingTarget.transform(
                X_train, y_train, sample_weight)

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                logger.debug("Convert target to integer")
                self._categorical_y = LabelEncoder(try_to_fit_numeric=True)
                self._categorical_y.fit(y_train)
                y_train = pd.Series(self._categorical_y.transform(y_train))

            if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
                logger.debug("Convert target to one-hot coding")
                self._categorical_y = LabelBinarizer()
                self._categorical_y.fit(pd.DataFrame({"target": y_train}),
                                        "target")
                y_train = self._categorical_y.transform(
                    pd.DataFrame({"target": y_train}), "target")

            if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                logger.debug("Scale log and normal")

                self._scale_y = Scale(["target"],
                                      scale_method=Scale.SCALE_LOG_AND_NORMAL)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

            if Scale.SCALE_NORMAL in target_preprocessing:
                logger.debug("Scale normal")

                self._scale_y = Scale(["target"],
                                      scale_method=Scale.SCALE_NORMAL)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

        # columns preprocessing
        columns_preprocessing = self._params.get("columns_preprocessing")
        for column in columns_preprocessing:
            transforms = columns_preprocessing[column]
            # logger.debug("Preprocess column {} with: {}".format(column, transforms))

        # remove empty or constant columns
        cols_to_remove = list(
            filter(
                lambda k: "remove_column" in columns_preprocessing[k],
                columns_preprocessing,
            ))

        if X_train is not None:
            X_train.drop(cols_to_remove, axis=1, inplace=True)
        self._remove_columns = cols_to_remove

        numeric_cols = []  # get numeric cols before text transformations
        # needed for golden features
        if X_train is not None and ("golden_features" in self._params
                                    or "kmeans_features" in self._params):
            numeric_cols = X_train.select_dtypes(
                include="number").columns.tolist()

        # there can be missing values in the text data,
        # but we don't want to handle it by fill missing methods
        # zeros will be imputed by text_transform method
        cols_to_process = list(
            filter(
                lambda k: "text_transform" in columns_preprocessing[k],
                columns_preprocessing,
            ))

        new_text_columns = []
        for col in cols_to_process:
            t = TextTransformer()
            t.fit(X_train, col)
            X_train = t.transform(X_train)
            self._text_transforms += [t]
            new_text_columns += t._new_columns
        # end of text transform

        for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
            cols_to_process = list(
                filter(
                    lambda k: missing_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            missing = PreprocessingMissingValues(cols_to_process,
                                                 missing_method)
            missing.fit(X_train)
            X_train = missing.transform(X_train)
            self._missing_values += [missing]

        # golden features
        golden_columns = []
        if "golden_features" in self._params:
            results_path = self._params["golden_features"]["results_path"]
            ml_task = self._params["golden_features"]["ml_task"]
            self._golden_features = GoldenFeaturesTransformer(
                results_path, ml_task)
            self._golden_features.fit(X_train[numeric_cols], y_train)
            X_train = self._golden_features.transform(X_train)
            golden_columns = self._golden_features._new_columns

        kmeans_columns = []
        if "kmeans_features" in self._params:
            results_path = self._params["kmeans_features"]["results_path"]
            self._kmeans = KMeansTransformer(results_path, self._model_name,
                                             self._k_fold)
            self._kmeans.fit(X_train[numeric_cols], y_train)
            X_train = self._kmeans.transform(X_train)
            kmeans_columns = self._kmeans._new_features

        for convert_method in [
                PreprocessingCategorical.CONVERT_INTEGER,
                PreprocessingCategorical.CONVERT_ONE_HOT,
                PreprocessingCategorical.CONVERT_LOO,
        ]:
            cols_to_process = list(
                filter(
                    lambda k: convert_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            convert = PreprocessingCategorical(cols_to_process, convert_method)
            convert.fit(X_train, y_train)
            X_train = convert.transform(X_train)
            self._categorical += [convert]

        # datetime transform
        cols_to_process = list(
            filter(
                lambda k: "datetime_transform" in columns_preprocessing[k],
                columns_preprocessing,
            ))

        new_datetime_columns = []
        for col in cols_to_process:

            t = DateTimeTransformer()
            t.fit(X_train, col)
            X_train = t.transform(X_train)
            self._datetime_transforms += [t]
            new_datetime_columns += t._new_columns

        # SCALE
        for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]:
            cols_to_process = list(
                filter(
                    lambda k: scale_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            if (len(cols_to_process) and len(new_datetime_columns)
                    and scale_method == Scale.SCALE_NORMAL):
                cols_to_process += new_datetime_columns
            if (len(cols_to_process) and len(new_text_columns)
                    and scale_method == Scale.SCALE_NORMAL):
                cols_to_process += new_text_columns

            if (len(cols_to_process) and len(golden_columns)
                    and scale_method == Scale.SCALE_NORMAL):
                cols_to_process += golden_columns

            if (len(cols_to_process) and len(kmeans_columns)
                    and scale_method == Scale.SCALE_NORMAL):
                cols_to_process += kmeans_columns

            if len(cols_to_process):
                scale = Scale(cols_to_process)
                scale.fit(X_train)
                X_train = scale.transform(X_train)
                self._scale += [scale]

        if self._add_random_feature:
            # -1, 1, with 0 mean
            X_train["random_feature"] = np.random.rand(
                X_train.shape[0]) * 2.0 - 1.0

        if self._drop_features:
            available_cols = X_train.columns.tolist()
            drop_cols = [c for c in self._drop_features if c in available_cols]
            if len(drop_cols) == X_train.shape[1]:
                raise AutoMLException(
                    "All features are droppped! Your data looks like random data."
                )
            if drop_cols:
                X_train.drop(drop_cols, axis=1, inplace=True)
            self._drop_features = drop_cols

        if X_train is not None:
            # there can be categorical columns (in CatBoost) which can't be clipped
            numeric_cols = X_train.select_dtypes(
                include="number").columns.tolist()
            X_train[numeric_cols] = X_train[numeric_cols].clip(
                lower=np.finfo(np.float32).min + 1000,
                upper=np.finfo(np.float32).max - 1000,
            )

        return X_train, y_train, sample_weight
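
This newer version reads a few extra sections from the same params dict. The additional keys it looks up, again sketched with placeholder values (the actual task names, paths, and column names are not shown in the snippet and are assumptions):

# Hypothetical additions on top of the params sketch after Example #11;
# keys inferred from the lookups in this version of fit_and_transform.
extra_params = {
    "golden_features": {
        "results_path": "results/",          # handed to GoldenFeaturesTransformer
        "ml_task": "binary_classification",  # example value only
    },
    "kmeans_features": {
        "results_path": "results/",          # handed to KMeansTransformer
    },
    "columns_preprocessing": {
        "text_col": ["text_transform"],      # routed to TextTransformer
        "date_col": ["datetime_transform"],  # routed to DateTimeTransformer
    },
}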
Example #13
    def fit_and_transform(self, X_train, y_train):
        logger.debug("Preprocessing.fit_and_transform")

        if y_train is not None:
            # target preprocessing
            # this must be used first, maybe we will drop some rows because of missing target values
            target_preprocessing = self._params.get("target_preprocessing")
            logger.debug("target_preprocessing params: {}".format(target_preprocessing))

            X_train, y_train = ExcludeRowsMissingTarget.transform(X_train, y_train)

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                logger.debug("Convert target to integer")
                self._categorical_y = LabelEncoder()
                self._categorical_y.fit(y_train)
                y_train = pd.Series(self._categorical_y.transform(y_train))

            if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
                logger.debug("Convert target to one-hot coding")
                self._categorical_y = LabelBinarizer()
                self._categorical_y.fit(pd.DataFrame({"target": y_train}), "target")
                y_train = self._categorical_y.transform(
                    pd.DataFrame({"target": y_train}), "target"
                )

            if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                logger.debug("Scale log and normal")

                self._scale_y = Scale(
                    ["target"], scale_method=Scale.SCALE_LOG_AND_NORMAL
                )
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

            if Scale.SCALE_NORMAL in target_preprocessing:
                logger.debug("Scale normal")

                self._scale_y = Scale(["target"], scale_method=Scale.SCALE_NORMAL)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

        # columns preprocessing
        columns_preprocessing = self._params.get("columns_preprocessing")
        for column in columns_preprocessing:
            transforms = columns_preprocessing[column]
            # logger.debug("Preprocess column {} with: {}".format(column, transforms))

        # remove empty or constant columns
        cols_to_remove = list(
            filter(
                lambda k: "remove_column" in columns_preprocessing[k],
                columns_preprocessing,
            )
        )

        if X_train is not None:
            X_train.drop(cols_to_remove, axis=1, inplace=True)
        self._remove_columns = cols_to_remove

        # there can be missing values in the text data,
        # but we don't want to handle it by fill missing methods
        # zeros will be imputed by text_transform method
        cols_to_process = list(
            filter(
                lambda k: "text_transform" in columns_preprocessing[k],
                columns_preprocessing,
            )
        )

        new_text_columns = []
        for col in cols_to_process:
            t = TextTransformer()
            t.fit(X_train, col)
            X_train = t.transform(X_train)
            self._text_transforms += [t]
            new_text_columns += t._new_columns
        # end of text transform

        for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
            cols_to_process = list(
                filter(
                    lambda k: missing_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            missing = PreprocessingMissingValues(cols_to_process, missing_method)
            missing.fit(X_train)
            X_train = missing.transform(X_train)
            self._missing_values += [missing]

        for convert_method in [
            PreprocessingCategorical.CONVERT_INTEGER,
            PreprocessingCategorical.CONVERT_ONE_HOT,
        ]:
            cols_to_process = list(
                filter(
                    lambda k: convert_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            convert = PreprocessingCategorical(cols_to_process, convert_method)
            convert.fit(X_train)
            X_train = convert.transform(X_train)
            self._categorical += [convert]

        # datetime transform
        cols_to_process = list(
            filter(
                lambda k: "datetime_transform" in columns_preprocessing[k],
                columns_preprocessing,
            )
        )

        new_datetime_columns = []
        for col in cols_to_process:

            t = DateTimeTransformer()
            t.fit(X_train, col)
            X_train = t.transform(X_train)
            self._datetime_transforms += [t]
            new_datetime_columns += t._new_columns

        # SCALE
        for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]:
            cols_to_process = list(
                filter(
                    lambda k: scale_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            if (
                len(cols_to_process)
                and len(new_datetime_columns)
                and scale_method == Scale.SCALE_NORMAL
            ):
                cols_to_process += new_datetime_columns
            if (
                len(cols_to_process)
                and len(new_text_columns)
                and scale_method == Scale.SCALE_NORMAL
            ):
                cols_to_process += new_text_columns

            if len(cols_to_process):
                scale = Scale(cols_to_process)
                scale.fit(X_train)
                X_train = scale.transform(X_train)
                self._scale += [scale]

        return X_train, y_train