Example 1
    def _initial_prep(self,
                      X_train,
                      y_train,
                      X_validation=None,
                      y_validation=None):

        if not isinstance(X_train, pd.DataFrame):
            X_train = pd.DataFrame(X_train)

        if not isinstance(X_train.columns[0], str):
            X_train.columns = [str(c) for c in X_train.columns]

        X_train.reset_index(drop=True, inplace=True)

        if isinstance(y_train, pd.DataFrame):
            if "target" not in y_train.columns:
                raise AutoMLException(
                    "y_train should be Numpy array, Pandas Series or DataFrame with column 'target' "
                )
            else:
                y_train = y_train["target"]
        y_train = pd.Series(np.array(y_train), name="target")

        X_train, y_train = ExcludeRowsMissingTarget.transform(X_train,
                                                              y_train,
                                                              warn=True)

        return X_train, y_train, X_validation, y_validation
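
The target handling above accepts a NumPy array, a pandas Series, or a DataFrame with a "target" column. A standalone sketch of just that normalization step, on hypothetical data:

    import numpy as np
    import pandas as pd

    y = pd.DataFrame({"target": [0.0, 1.0, np.nan]})
    if isinstance(y, pd.DataFrame):
        y = y["target"]  # slice out the target column
    y = pd.Series(np.array(y), name="target")  # plain RangeIndex, named "target"
    print(y.isnull().sum())  # 1 -> this row is dropped by ExcludeRowsMissingTarget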
Example 2
    def test_transform_with_sample_weight(self):
        d_test = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "sample_weight": [1, 2, 3, 4],
            "y": [np.nan, 1, np.nan, 2],
        }
        df_test = pd.DataFrame(data=d_test)
        X = df_test.loc[:, ["col1", "col2", "col3", "col4"]]
        y = df_test.loc[:, "y"]
        sample_weight = df_test.loc[:, "sample_weight"]

        self.assertEqual(X.shape[0], 4)
        self.assertEqual(y.shape[0], 4)
        X, y, sw = ExcludeRowsMissingTarget.transform(X, y, sample_weight)
        self.assertEqual(X.shape[0], 2)
        self.assertEqual(y.shape[0], 2)
        self.assertEqual(sw.shape[0], 2)

        self.assertEqual(y[0], 1)
        self.assertEqual(y[1], 2)
        self.assertEqual(sw[0], 2)
        self.assertEqual(sw[1], 4)
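
This test pins down the transform's contract: rows with a missing target are dropped from X, y, and sample_weight, and all indices are reset. A minimal sketch of an ExcludeRowsMissingTarget consistent with this test (and with test_transform in Example 5 below); the real mljar-supervised implementation may differ in details. Note the examples mix two library versions: the older ones unpack two return values, the newer ones three; the sketch follows the newer, three-value contract:

    import warnings

    import numpy as np
    import pandas as pd

    class ExcludeRowsMissingTarget:
        """Drop rows whose target value is missing (sketch inferred from the tests)."""

        @staticmethod
        def transform(X=None, y=None, sample_weight=None, warn=False):
            if y is None:
                return X, y, sample_weight
            missing = pd.isnull(y)  # boolean mask of rows with a missing target
            if np.sum(np.array(missing)) == 0:
                return X, y, sample_weight  # nothing to drop
            if warn:
                warnings.warn("Samples with missing target values were excluded.")
            y = y.drop(y.index[missing]).reset_index(drop=True)
            if X is not None:
                X = X.drop(X.index[missing]).reset_index(drop=True)
            if sample_weight is not None:
                sample_weight = sample_weight.drop(
                    sample_weight.index[missing]).reset_index(drop=True)
            return X, y, sample_weight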
Example 3
    def _build_dataframe(self, X, y=None):
        # If inputs are not pandas DataFrames, run scikit-learn validation on the X array
        if not isinstance(X, pd.DataFrame):
            # Validate X as array
            X = check_array(X, ensure_2d=False)
            # Force X to be 2D
            X = np.atleast_2d(X)
            # Create a pandas DataFrame from the array; columns are named with the schema feature_{index}
            X = pd.DataFrame(
                X,
                columns=["feature_" + str(i) for i in range(1,
                                                            len(X[0]) + 1)])

        # Enforce string column names
        X.columns = X.columns.astype(str)

        X.reset_index(drop=True, inplace=True)

        if y is None:
            return X

        # Check if y is np.ndarray, transform to pd.Series
        if isinstance(y, np.ndarray):
            y = check_array(y, ensure_2d=False)
            y = pd.Series(np.array(y), name="target")
        # if pd.DataFrame, slice first column
        elif isinstance(y, pd.DataFrame):
            y = np.array(y.iloc[:, 0])
            y = check_array(y, ensure_2d=False)
            y = pd.Series(np.array(y), name="target")

        X, y = ExcludeRowsMissingTarget.transform(X, y, warn=True)

        return X, y
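
For array inputs the naming scheme above yields columns feature_1 … feature_n. A quick standalone illustration of just the naming logic:

    import numpy as np
    import pandas as pd

    X = np.atleast_2d(np.array([[0.5, 1.5, 2.5], [3.5, 4.5, 5.5]]))
    df = pd.DataFrame(
        X, columns=["feature_" + str(i) for i in range(1, len(X[0]) + 1)])
    print(df.columns.tolist())  # ['feature_1', 'feature_2', 'feature_3']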
Example 4
    def _initial_prep(self, X_train, y_train, X_validation=None, y_validation=None):

        if not isinstance(X_train, pd.DataFrame):
            X_train = pd.DataFrame(X_train)

        if not isinstance(X_train.columns[0], str):
            X_train.columns = [str(c) for c in X_train.columns]

        X_train.reset_index(drop=True, inplace=True)

        y_train = pd.Series(np.array(y_train), name="target")

        X_train, y_train = ExcludeRowsMissingTarget.transform(
            X_train, y_train, warn=True
        )

        return X_train, y_train, X_validation, y_validation
Example 5
    def test_transform(self):
        d_test = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [np.nan, 1, np.nan, 2],
        }
        df_test = pd.DataFrame(data=d_test)
        X = df_test.loc[:, ["col1", "col2", "col3", "col4"]]
        y = df_test.loc[:, "y"]

        self.assertEqual(X.shape[0], 4)
        self.assertEqual(y.shape[0], 4)
        X, y = ExcludeRowsMissingTarget.transform(X, y)
        self.assertEqual(X.shape[0], 2)
        self.assertEqual(y.shape[0], 2)
        self.assertEqual(y[0], 1)
        self.assertEqual(y[1], 2)
Example 6
    def fit_and_transform(self, X_train, y_train):
        logger.debug("Preprocessing.fit_and_transform")

        if y_train is not None:
            # target preprocessing
            # this must run first, because rows with missing target values may be dropped
            target_preprocessing = self._params.get("target_preprocessing")
            logger.debug(
                "target_preprocessing params: {}".format(target_preprocessing))

            X_train, y_train = ExcludeRowsMissingTarget.transform(
                X_train, y_train)

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                logger.debug("Convert target to integer")
                self._categorical_y = LabelEncoder()
                self._categorical_y.fit(y_train)
                y_train = pd.Series(self._categorical_y.transform(y_train))

            if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
                logger.debug("Convert target to one-hot coding")
                self._categorical_y = LabelBinarizer()
                self._categorical_y.fit(pd.DataFrame({"target": y_train}),
                                        "target")
                y_train = self._categorical_y.transform(
                    pd.DataFrame({"target": y_train}), "target")

            if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                logger.debug("Scale log and normal")

                self._scale_y = Scale(["target"],
                                      scale_method=Scale.SCALE_LOG_AND_NORMAL)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

            if Scale.SCALE_NORMAL in target_preprocessing:
                logger.debug("Scale normal")

                self._scale_y = Scale(["target"],
                                      scale_method=Scale.SCALE_NORMAL)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

        # columns preprocessing
        columns_preprocessing = self._params.get("columns_preprocessing")
        # each entry maps a column name to its list of transforms
        # logger.debug("Preprocess column {} with: {}".format(column, columns_preprocessing[column]))

        # remove empty or constant columns
        cols_to_remove = list(
            filter(
                lambda k: "remove_column" in columns_preprocessing[k],
                columns_preprocessing,
            ))

        if X_train is not None:
            X_train.drop(cols_to_remove, axis=1, inplace=True)
        self._remove_columns = cols_to_remove

        for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
            cols_to_process = list(
                filter(
                    lambda k: missing_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            missing = PreprocessingMissingValues(cols_to_process,
                                                 missing_method)
            missing.fit(X_train)
            X_train = missing.transform(X_train)
            self._missing_values += [missing]

        for convert_method in [PreprocessingCategorical.CONVERT_INTEGER]:
            cols_to_process = list(
                filter(
                    lambda k: convert_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            convert = PreprocessingCategorical(cols_to_process, convert_method)
            convert.fit(X_train)
            X_train = convert.transform(X_train)
            self._categorical += [convert]

        # SCALE
        for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]:
            cols_to_process = list(
                filter(
                    lambda k: scale_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            if len(cols_to_process):
                scale = Scale(cols_to_process)
                scale.fit(X_train)
                X_train = scale.transform(X_train)
                self._scale += [scale]

        return X_train, y_train
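
fit_and_transform is driven entirely by self._params. A hypothetical params dict consistent with the lookups in this method; the "remove_column" marker and the constant names are taken from the code above, while the column names are illustrative:

    params = {
        "target_preprocessing": [PreprocessingCategorical.CONVERT_INTEGER],
        "columns_preprocessing": {
            "age": [PreprocessingMissingValues.FILL_NA_MEDIAN, Scale.SCALE_NORMAL],
            "city": [PreprocessingCategorical.CONVERT_INTEGER],
            "constant_col": ["remove_column"],  # dropped as empty/constant
        },
    }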
Example 7
    def transform(self, X_validation, y_validation):
        logger.debug("Preprocessing.transform")

        # make a copy to avoid SettingWithCopyWarning
        if X_validation is not None:
            X_validation = X_validation.copy(deep=False)
        if y_validation is not None:
            y_validation = y_validation.copy(deep=False)

        # target preprocessing
        # this must run first, because rows with missing target values may be dropped
        if y_validation is not None:
            target_preprocessing = self._params.get("target_preprocessing")
            logger.debug(
                "target_preprocessing -> {}".format(target_preprocessing))

            X_validation, y_validation = ExcludeRowsMissingTarget.transform(
                X_validation, y_validation)

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                if y_validation is not None and self._categorical_y is not None:
                    y_validation = pd.Series(
                        self._categorical_y.transform(y_validation))

            if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
                if y_validation is not None and self._categorical_y is not None:
                    y_validation = self._categorical_y.transform(
                        pd.DataFrame({"target": y_validation}), "target")

            if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                if self._scale_y is not None and y_validation is not None:
                    logger.debug("Transform log and normalize")
                    y_validation = pd.DataFrame({"target": y_validation})
                    y_validation = self._scale_y.transform(y_validation)
                    y_validation = y_validation["target"]

            if Scale.SCALE_NORMAL in target_preprocessing:
                if self._scale_y is not None and y_validation is not None:
                    logger.debug("Transform normalize")
                    y_validation = pd.DataFrame({"target": y_validation})
                    y_validation = self._scale_y.transform(y_validation)
                    y_validation = y_validation["target"]

        # columns preprocessing
        if len(self._remove_columns) and X_validation is not None:
            cols_to_remove = [
                col for col in X_validation.columns
                if col in self._remove_columns
            ]
            X_validation.drop(cols_to_remove, axis=1, inplace=True)

        for missing in self._missing_values:
            if X_validation is not None and missing is not None:
                X_validation = missing.transform(X_validation)

        # make sure all missing values are filled;
        # new data can contain gaps!
        if (X_validation is not None
                and np.sum(np.sum(pd.isnull(X_validation))) > 0
                and len(self._params["columns_preprocessing"]) > 0):
            # something is missing, fill it
            # and notify the user about it
            warnings.warn(
                "There are columns {} with missing values which didnt have missing values in train dataset."
                .format(
                    list(X_validation.columns[np.where(
                        np.sum(pd.isnull(X_validation)))])))
            missing = PreprocessingMissingValues(
                X_validation.columns,
                PreprocessingMissingValues.FILL_NA_MEDIAN)
            missing.fit(X_validation)
            X_validation = missing.transform(X_validation)
        for convert in self._categorical:
            if X_validation is not None and convert is not None:
                X_validation = convert.transform(X_validation)
        for scale in self._scale:
            if X_validation is not None and scale is not None:
                X_validation = scale.transform(X_validation)

        return X_validation, y_validation
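
Together with Example 6, these two methods form a fit/apply pair: fit_and_transform learns fill values, encoders, and scalers on the training data, and transform replays them on validation data. A hypothetical call sequence (the Preprocessing constructor signature is an assumption):

    pre = Preprocessing(params)  # hypothetical: pass the params dict shown after Example 6
    X_train, y_train = pre.fit_and_transform(X_train, y_train)
    X_validation, y_validation = pre.transform(X_validation, y_validation)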
Example 8
    def fit_and_transform(self, X_train, y_train, sample_weight=None):
        logger.debug("Preprocessing.fit_and_transform")

        if y_train is not None:
            # target preprocessing
            # this must run first, because rows with missing target values may be dropped
            target_preprocessing = self._params.get("target_preprocessing")
            logger.debug(
                "target_preprocessing params: {}".format(target_preprocessing))

            X_train, y_train, sample_weight = ExcludeRowsMissingTarget.transform(
                X_train, y_train, sample_weight)

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                logger.debug("Convert target to integer")
                self._categorical_y = LabelEncoder(try_to_fit_numeric=True)
                self._categorical_y.fit(y_train)
                y_train = pd.Series(self._categorical_y.transform(y_train))

            if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
                logger.debug("Convert target to one-hot coding")
                self._categorical_y = LabelBinarizer()
                self._categorical_y.fit(pd.DataFrame({"target": y_train}),
                                        "target")
                y_train = self._categorical_y.transform(
                    pd.DataFrame({"target": y_train}), "target")

            if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                logger.debug("Scale log and normal")

                self._scale_y = Scale(["target"],
                                      scale_method=Scale.SCALE_LOG_AND_NORMAL)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

            if Scale.SCALE_NORMAL in target_preprocessing:
                logger.debug("Scale normal")

                self._scale_y = Scale(["target"],
                                      scale_method=Scale.SCALE_NORMAL)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

        # columns preprocessing
        columns_preprocessing = self._params.get("columns_preprocessing")
        # each entry maps a column name to its list of transforms
        # logger.debug("Preprocess column {} with: {}".format(column, columns_preprocessing[column]))

        # remove empty or constant columns
        cols_to_remove = list(
            filter(
                lambda k: "remove_column" in columns_preprocessing[k],
                columns_preprocessing,
            ))

        if X_train is not None:
            X_train.drop(cols_to_remove, axis=1, inplace=True)
        self._remove_columns = cols_to_remove

        numeric_cols = []  # get numeric cols before text transformations
        # needed for golden features
        if X_train is not None and ("golden_features" in self._params
                                    or "kmeans_features" in self._params):
            numeric_cols = X_train.select_dtypes(
                include="number").columns.tolist()

        # there can be missing values in the text data,
        # but we don't want to handle them with the fill-missing methods;
        # zeros are imputed by the text_transform method
        cols_to_process = list(
            filter(
                lambda k: "text_transform" in columns_preprocessing[k],
                columns_preprocessing,
            ))

        new_text_columns = []
        for col in cols_to_process:
            t = TextTransformer()
            t.fit(X_train, col)
            X_train = t.transform(X_train)
            self._text_transforms += [t]
            new_text_columns += t._new_columns
        # end of text transform

        for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
            cols_to_process = list(
                filter(
                    lambda k: missing_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            missing = PreprocessingMissingValues(cols_to_process,
                                                 missing_method)
            missing.fit(X_train)
            X_train = missing.transform(X_train)
            self._missing_values += [missing]

        # golden features
        golden_columns = []
        if "golden_features" in self._params:
            results_path = self._params["golden_features"]["results_path"]
            ml_task = self._params["golden_features"]["ml_task"]
            self._golden_features = GoldenFeaturesTransformer(
                results_path, ml_task)
            self._golden_features.fit(X_train[numeric_cols], y_train)
            X_train = self._golden_features.transform(X_train)
            golden_columns = self._golden_features._new_columns

        kmeans_columns = []
        if "kmeans_features" in self._params:
            results_path = self._params["kmeans_features"]["results_path"]
            self._kmeans = KMeansTransformer(results_path, self._model_name,
                                             self._k_fold)
            self._kmeans.fit(X_train[numeric_cols], y_train)
            X_train = self._kmeans.transform(X_train)
            kmeans_columns = self._kmeans._new_features

        for convert_method in [
                PreprocessingCategorical.CONVERT_INTEGER,
                PreprocessingCategorical.CONVERT_ONE_HOT,
                PreprocessingCategorical.CONVERT_LOO,
        ]:
            cols_to_process = list(
                filter(
                    lambda k: convert_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            convert = PreprocessingCategorical(cols_to_process, convert_method)
            convert.fit(X_train, y_train)
            X_train = convert.transform(X_train)
            self._categorical += [convert]

        # datetime transform
        cols_to_process = list(
            filter(
                lambda k: "datetime_transform" in columns_preprocessing[k],
                columns_preprocessing,
            ))

        new_datetime_columns = []
        for col in cols_to_process:

            t = DateTimeTransformer()
            t.fit(X_train, col)
            X_train = t.transform(X_train)
            self._datetime_transforms += [t]
            new_datetime_columns += t._new_columns

        # SCALE
        for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]:
            cols_to_process = list(
                filter(
                    lambda k: scale_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            if (len(cols_to_process) and len(new_datetime_columns)
                    and scale_method == Scale.SCALE_NORMAL):
                cols_to_process += new_datetime_columns
            if (len(cols_to_process) and len(new_text_columns)
                    and scale_method == Scale.SCALE_NORMAL):
                cols_to_process += new_text_columns

            if (len(cols_to_process) and len(golden_columns)
                    and scale_method == Scale.SCALE_NORMAL):
                cols_to_process += golden_columns

            if (len(cols_to_process) and len(kmeans_columns)
                    and scale_method == Scale.SCALE_NORMAL):
                cols_to_process += kmeans_columns

            if len(cols_to_process):
                scale = Scale(cols_to_process)
                scale.fit(X_train)
                X_train = scale.transform(X_train)
                self._scale += [scale]

        if self._add_random_feature and X_train is not None:
            # uniform in [-1, 1) with zero mean
            X_train["random_feature"] = np.random.rand(
                X_train.shape[0]) * 2.0 - 1.0

        if self._drop_features:
            available_cols = X_train.columns.tolist()
            drop_cols = [c for c in self._drop_features if c in available_cols]
            if len(drop_cols) == X_train.shape[1]:
                raise AutoMLException(
                    "All features are droppped! Your data looks like random data."
                )
            if drop_cols:
                X_train.drop(drop_cols, axis=1, inplace=True)
            self._drop_features = drop_cols

        if X_train is not None:
            # there can be categorical columns (in CatBoost) which can't be clipped
            numeric_cols = X_train.select_dtypes(
                include="number").columns.tolist()
            X_train[numeric_cols] = X_train[numeric_cols].clip(
                lower=np.finfo(np.float32).min + 1000,
                upper=np.finfo(np.float32).max - 1000,
            )

        return X_train, y_train, sample_weight
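
The final clipping step keeps every numeric value safely representable as float32 (with a margin of 1000), protecting downstream models from overflow on extreme values. A standalone illustration:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1.0, 1e40, -1e40], "b": ["x", "y", "z"]})
    numeric_cols = df.select_dtypes(include="number").columns.tolist()
    df[numeric_cols] = df[numeric_cols].clip(
        lower=np.finfo(np.float32).min + 1000,
        upper=np.finfo(np.float32).max - 1000,
    )
    print(df["a"].tolist())  # the huge values are clipped near the float32 limits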
Example 9
    def transform(self,
                  X_validation,
                  y_validation,
                  sample_weight_validation=None):
        logger.debug("Preprocessing.transform")

        # make a copy to avoid SettingWithCopyWarning
        if X_validation is not None:
            X_validation = X_validation.copy(deep=False)
        if y_validation is not None:
            y_validation = y_validation.copy(deep=False)

        # target preprocessing
        # this must run first, because rows with missing target values may be dropped
        if y_validation is not None:
            target_preprocessing = self._params.get("target_preprocessing")
            logger.debug(
                "target_preprocessing -> {}".format(target_preprocessing))

            (
                X_validation,
                y_validation,
                sample_weight_validation,
            ) = ExcludeRowsMissingTarget.transform(X_validation, y_validation,
                                                   sample_weight_validation)

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                if y_validation is not None and self._categorical_y is not None:
                    y_validation = pd.Series(
                        self._categorical_y.transform(y_validation))

            if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
                if y_validation is not None and self._categorical_y is not None:
                    y_validation = self._categorical_y.transform(
                        pd.DataFrame({"target": y_validation}), "target")

            if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                if self._scale_y is not None and y_validation is not None:
                    logger.debug("Transform log and normalize")
                    y_validation = pd.DataFrame({"target": y_validation})
                    y_validation = self._scale_y.transform(y_validation)
                    y_validation = y_validation["target"]

            if Scale.SCALE_NORMAL in target_preprocessing:
                if self._scale_y is not None and y_validation is not None:
                    logger.debug("Transform normalize")
                    y_validation = pd.DataFrame({"target": y_validation})
                    y_validation = self._scale_y.transform(y_validation)
                    y_validation = y_validation["target"]

        # columns preprocessing
        if len(self._remove_columns) and X_validation is not None:
            cols_to_remove = [
                col for col in X_validation.columns
                if col in self._remove_columns
            ]
            X_validation.drop(cols_to_remove, axis=1, inplace=True)

        # text transform
        for tt in self._text_transforms:
            if X_validation is not None and tt is not None:
                X_validation = tt.transform(X_validation)

        for missing in self._missing_values:
            if X_validation is not None and missing is not None:
                X_validation = missing.transform(X_validation)

        # make sure all missing values are filled;
        # new data can contain gaps!
        if (X_validation is not None
                and np.sum(np.sum(pd.isnull(X_validation))) > 0
                and len(self._params["columns_preprocessing"]) > 0):
            # something is missing, fill it
            # and notify the user about it;
            # warnings should go to a separate file ...
            # warnings.warn(
            #    "There are columns {} with missing values which didnt have missing values in train dataset.".format(
            #        list(
            #            X_validation.columns[np.where(np.sum(pd.isnull(X_validation)))]
            #        )
            #    )
            # )
            missing = PreprocessingMissingValues(
                X_validation.columns,
                PreprocessingMissingValues.FILL_NA_MEDIAN)
            missing.fit(X_validation)
            X_validation = missing.transform(X_validation)

        # golden features
        if self._golden_features is not None:
            X_validation = self._golden_features.transform(X_validation)

        if self._kmeans is not None:
            X_validation = self._kmeans.transform(X_validation)

        for convert in self._categorical:
            if X_validation is not None and convert is not None:
                X_validation = convert.transform(X_validation)

        for dtt in self._datetime_transforms:
            if X_validation is not None and dtt is not None:
                X_validation = dtt.transform(X_validation)

        for scale in self._scale:
            if X_validation is not None and scale is not None:
                X_validation = scale.transform(X_validation)

        if self._add_random_feature and X_validation is not None:
            # uniform in [-1, 1) with zero mean
            X_validation["random_feature"] = (
                np.random.rand(X_validation.shape[0]) * 2.0 - 1.0)

        if self._drop_features and X_validation is not None:
            X_validation.drop(self._drop_features, axis=1, inplace=True)

        if X_validation is not None:
            # there can be categorical columns (in CatBoost) which can't be clipped
            numeric_cols = X_validation.select_dtypes(
                include="number").columns.tolist()
            X_validation[numeric_cols] = X_validation[numeric_cols].clip(
                lower=np.finfo(np.float32).min + 1000,
                upper=np.finfo(np.float32).max - 1000,
            )

        return X_validation, y_validation, sample_weight_validation
Example 10
    def fit_and_transform(self, X_train, y_train):
        logger.debug("Preprocessing.fit_and_transform")

        if y_train is not None:
            # target preprocessing
            # this must run first, because rows with missing target values may be dropped
            target_preprocessing = self._params.get("target_preprocessing")
            logger.debug("target_preprocessing params: {}".format(target_preprocessing))

            X_train, y_train = ExcludeRowsMissingTarget.transform(X_train, y_train)

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                logger.debug("Convert target to integer")
                self._categorical_y = LabelEncoder()
                self._categorical_y.fit(y_train)
                y_train = pd.Series(self._categorical_y.transform(y_train))

            if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
                logger.debug("Convert target to one-hot coding")
                self._categorical_y = LabelBinarizer()
                self._categorical_y.fit(pd.DataFrame({"target": y_train}), "target")
                y_train = self._categorical_y.transform(
                    pd.DataFrame({"target": y_train}), "target"
                )

            if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                logger.debug("Scale log and normal")

                self._scale_y = Scale(
                    ["target"], scale_method=Scale.SCALE_LOG_AND_NORMAL
                )
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

            if Scale.SCALE_NORMAL in target_preprocessing:
                logger.debug("Scale normal")

                self._scale_y = Scale(["target"], scale_method=Scale.SCALE_NORMAL)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

        # columns preprocessing
        columns_preprocessing = self._params.get("columns_preprocessing")
        # each entry maps a column name to its list of transforms
        # logger.debug("Preprocess column {} with: {}".format(column, columns_preprocessing[column]))

        # remove empty or constant columns
        cols_to_remove = list(
            filter(
                lambda k: "remove_column" in columns_preprocessing[k],
                columns_preprocessing,
            )
        )

        if X_train is not None:
            X_train.drop(cols_to_remove, axis=1, inplace=True)
        self._remove_columns = cols_to_remove

        # there can be missing values in the text data,
        # but we don't want to handle them with the fill-missing methods;
        # zeros are imputed by the text_transform method
        cols_to_process = list(
            filter(
                lambda k: "text_transform" in columns_preprocessing[k],
                columns_preprocessing,
            )
        )

        new_text_columns = []
        for col in cols_to_process:
            t = TextTransformer()
            t.fit(X_train, col)
            X_train = t.transform(X_train)
            self._text_transforms += [t]
            new_text_columns += t._new_columns
        # end of text transform

        for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
            cols_to_process = list(
                filter(
                    lambda k: missing_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            missing = PreprocessingMissingValues(cols_to_process, missing_method)
            missing.fit(X_train)
            X_train = missing.transform(X_train)
            self._missing_values += [missing]

        for convert_method in [
            PreprocessingCategorical.CONVERT_INTEGER,
            PreprocessingCategorical.CONVERT_ONE_HOT,
        ]:
            cols_to_process = list(
                filter(
                    lambda k: convert_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            convert = PreprocessingCategorical(cols_to_process, convert_method)
            convert.fit(X_train)
            X_train = convert.transform(X_train)
            self._categorical += [convert]

        # datetime transform
        cols_to_process = list(
            filter(
                lambda k: "datetime_transform" in columns_preprocessing[k],
                columns_preprocessing,
            )
        )

        new_datetime_columns = []
        for col in cols_to_process:

            t = DateTimeTransformer()
            t.fit(X_train, col)
            X_train = t.transform(X_train)
            self._datetime_transforms += [t]
            new_datetime_columns += t._new_columns

        # SCALE
        for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]:
            cols_to_process = list(
                filter(
                    lambda k: scale_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            if (
                len(cols_to_process)
                and len(new_datetime_columns)
                and scale_method == Scale.SCALE_NORMAL
            ):
                cols_to_process += new_datetime_columns
            if (
                len(cols_to_process)
                and len(new_text_columns)
                and scale_method == Scale.SCALE_NORMAL
            ):
                cols_to_process += new_text_columns

            if len(cols_to_process):
                scale = Scale(cols_to_process)
                scale.fit(X_train)
                X_train = scale.transform(X_train)
                self._scale += [scale]

        return X_train, y_train