Exemple #1
0
    def fit_and_transform(self, X_train, y_train):
        logger.debug("Preprocessing.fit_and_transform")

        if y_train is not None:
            # target preprocessing
            # this must be used first, maybe we will drop some rows because of missing target values
            target_preprocessing = self._params.get("target_preprocessing")
            logger.debug("target_preprocessing params: {}".format(target_preprocessing))

            X_train, y_train = ExcludeRowsMissingTarget.transform(X_train, y_train)

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                logger.debug("Convert target to integer")
                self._categorical_y = LabelEncoder()
                self._categorical_y.fit(y_train)
                y_train = pd.Series(self._categorical_y.transform(y_train))

            if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
                logger.debug("Convert target to one-hot coding")
                self._categorical_y = LabelBinarizer()
                self._categorical_y.fit(pd.DataFrame({"target": y_train}), "target")
                y_train = self._categorical_y.transform(
                    pd.DataFrame({"target": y_train}), "target"
                )

            if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                logger.debug("Scale log and normal")

                self._scale_y = Scale(
                    ["target"], scale_method=Scale.SCALE_LOG_AND_NORMAL
                )
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

            if Scale.SCALE_NORMAL in target_preprocessing:
                logger.debug("Scale normal")

                self._scale_y = Scale(["target"], scale_method=Scale.SCALE_NORMAL)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

        # columns preprocessing
        columns_preprocessing = self._params.get("columns_preprocessing")
        for column in columns_preprocessing:
            transforms = columns_preprocessing[column]
            # logger.debug("Preprocess column {} with: {}".format(column, transforms))

        # remove empty or constant columns
        cols_to_remove = list(
            filter(
                lambda k: "remove_column" in columns_preprocessing[k],
                columns_preprocessing,
            )
        )

        if X_train is not None:
            X_train.drop(cols_to_remove, axis=1, inplace=True)
        self._remove_columns = cols_to_remove

        numeric_cols = []  # get numeric cols before text transformations
        # needed for golden features
        if X_train is not None and "golden_features" in self._params:
            numeric_cols = X_train.select_dtypes(include="number").columns.tolist()

        # there can be missing values in the text data,
        # but we don't want to handle it by fill missing methods
        # zeros will be imputed by text_transform method
        cols_to_process = list(
            filter(
                lambda k: "text_transform" in columns_preprocessing[k],
                columns_preprocessing,
            )
        )

        new_text_columns = []
        for col in cols_to_process:
            t = TextTransformer()
            t.fit(X_train, col)
            X_train = t.transform(X_train)
            self._text_transforms += [t]
            new_text_columns += t._new_columns
        # end of text transform

        for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
            cols_to_process = list(
                filter(
                    lambda k: missing_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            missing = PreprocessingMissingValues(cols_to_process, missing_method)
            missing.fit(X_train)
            X_train = missing.transform(X_train)
            self._missing_values += [missing]

        # golden features
        golden_columns = []
        if "golden_features" in self._params:
            results_path = self._params["golden_features"]["results_path"]
            ml_task = self._params["golden_features"]["ml_task"]
            # if ml_task in [BINARY_CLASSIFICATION]:
            self._golden_features = GoldenFeaturesTransformer(results_path, ml_task)
            self._golden_features.fit(X_train[numeric_cols], y_train)
            X_train = self._golden_features.transform(X_train)
            golden_columns = self._golden_features._new_columns

        for convert_method in [
            PreprocessingCategorical.CONVERT_INTEGER,
            PreprocessingCategorical.CONVERT_ONE_HOT,
        ]:
            cols_to_process = list(
                filter(
                    lambda k: convert_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            convert = PreprocessingCategorical(cols_to_process, convert_method)
            convert.fit(X_train)
            X_train = convert.transform(X_train)
            self._categorical += [convert]

        # datetime transform
        cols_to_process = list(
            filter(
                lambda k: "datetime_transform" in columns_preprocessing[k],
                columns_preprocessing,
            )
        )

        new_datetime_columns = []
        for col in cols_to_process:

            t = DateTimeTransformer()
            t.fit(X_train, col)
            X_train = t.transform(X_train)
            self._datetime_transforms += [t]
            new_datetime_columns += t._new_columns

        # SCALE
        for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]:
            cols_to_process = list(
                filter(
                    lambda k: scale_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            if (
                len(cols_to_process)
                and len(new_datetime_columns)
                and scale_method == Scale.SCALE_NORMAL
            ):
                cols_to_process += new_datetime_columns
            if (
                len(cols_to_process)
                and len(new_text_columns)
                and scale_method == Scale.SCALE_NORMAL
            ):
                cols_to_process += new_text_columns

            if (
                len(cols_to_process)
                and len(golden_columns)
                and scale_method == Scale.SCALE_NORMAL
            ):
                cols_to_process += golden_columns

            if len(cols_to_process):
                scale = Scale(cols_to_process)
                scale.fit(X_train)
                X_train = scale.transform(X_train)
                self._scale += [scale]

        if self._add_random_feature:
            # -1, 1, with 0 mean
            X_train["random_feature"] = np.random.rand(X_train.shape[0]) * 2.0 - 1.0

        if self._drop_features:
            available_cols = X_train.columns.tolist()
            drop_cols = [c for c in self._drop_features if c in available_cols]
            if drop_cols:
                X_train.drop(drop_cols, axis=1, inplace=True)
            self._drop_features = drop_cols

        return X_train, y_train
Exemple #2
0
    def from_json(self, data_json, results_path):

        self._params = data_json.get("params", self._params)

        if "remove_columns" in data_json:
            self._remove_columns = data_json.get("remove_columns", [])
        if "missing_values" in data_json:
            self._missing_values = []
            for mv_data in data_json["missing_values"]:
                mv = PreprocessingMissingValues()
                mv.from_json(mv_data)
                self._missing_values += [mv]
        if "categorical" in data_json:
            self._categorical = []
            for cat_data in data_json["categorical"]:
                cat = PreprocessingCategorical()
                cat.from_json(cat_data)
                self._categorical += [cat]

        if "datetime_transforms" in data_json:
            self._datetime_transforms = []
            for dtt_params in data_json["datetime_transforms"]:
                dtt = DateTimeTransformer()
                dtt.from_json(dtt_params)
                self._datetime_transforms += [dtt]

        if "text_transforms" in data_json:
            self._text_transforms = []
            for tt_params in data_json["text_transforms"]:
                tt = TextTransformer()
                tt.from_json(tt_params)
                self._text_transforms += [tt]

        if "golden_features" in data_json:
            self._golden_features = GoldenFeaturesTransformer()
            self._golden_features.from_json(data_json["golden_features"],
                                            results_path)

        if "kmeans" in data_json:
            self._kmeans = KMeansTransformer()
            self._kmeans.from_json(data_json["kmeans"], results_path)

        if "scale" in data_json:
            self._scale = []
            for scale_data in data_json["scale"]:
                sc = Scale()
                sc.from_json(scale_data)
                self._scale += [sc]
        if "categorical_y" in data_json:
            if "new_columns" in data_json["categorical_y"]:
                self._categorical_y = LabelBinarizer()
            else:
                self._categorical_y = LabelEncoder()

            self._categorical_y.from_json(data_json["categorical_y"])
        if "scale_y" in data_json:
            self._scale_y = Scale()
            self._scale_y.from_json(data_json["scale_y"])
        if "ml_task" in data_json:
            self._params["ml_task"] = data_json["ml_task"]

        self._add_random_feature = data_json.get("add_random_feature", False)
        self._drop_features = data_json.get("drop_features", [])
Exemple #3
0
class Preprocessing(object):
    def __init__(
        self,
        preprocessing_params={"target_preprocessing": [], "columns_preprocessing": {}},
    ):
        self._params = preprocessing_params

        if "target_preprocessing" not in preprocessing_params:
            self._params["target_preprocessing"] = []
        if "columns_preprocessing" not in preprocessing_params:
            self._params["columns_preprocessing"] = {}

        # preprocssing step attributes
        self._categorical_y = None
        self._scale_y = None
        self._missing_values = []
        self._categorical = []
        self._scale = []
        self._remove_columns = []
        self._datetime_transforms = []
        self._text_transforms = []
        self._golden_features = None
        self._add_random_feature = self._params.get("add_random_feature", False)
        self._drop_features = self._params.get("drop_features", [])

    def _exclude_missing_targets(self, X=None, y=None):
        # check if there are missing values in target column
        if y is None:
            return X, y
        y_missing = pd.isnull(y)
        if np.sum(np.array(y_missing)) == 0:
            return X, y
        y = y.drop(y.index[y_missing])
        y.index = range(y.shape[0])
        if X is not None:
            X = X.drop(X.index[y_missing])
            X.index = range(X.shape[0])
        return X, y

    # fit and transform
    def fit_and_transform(self, X_train, y_train):
        logger.debug("Preprocessing.fit_and_transform")

        if y_train is not None:
            # target preprocessing
            # this must be used first, maybe we will drop some rows because of missing target values
            target_preprocessing = self._params.get("target_preprocessing")
            logger.debug("target_preprocessing params: {}".format(target_preprocessing))

            X_train, y_train = ExcludeRowsMissingTarget.transform(X_train, y_train)

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                logger.debug("Convert target to integer")
                self._categorical_y = LabelEncoder()
                self._categorical_y.fit(y_train)
                y_train = pd.Series(self._categorical_y.transform(y_train))

            if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
                logger.debug("Convert target to one-hot coding")
                self._categorical_y = LabelBinarizer()
                self._categorical_y.fit(pd.DataFrame({"target": y_train}), "target")
                y_train = self._categorical_y.transform(
                    pd.DataFrame({"target": y_train}), "target"
                )

            if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                logger.debug("Scale log and normal")

                self._scale_y = Scale(
                    ["target"], scale_method=Scale.SCALE_LOG_AND_NORMAL
                )
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

            if Scale.SCALE_NORMAL in target_preprocessing:
                logger.debug("Scale normal")

                self._scale_y = Scale(["target"], scale_method=Scale.SCALE_NORMAL)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

        # columns preprocessing
        columns_preprocessing = self._params.get("columns_preprocessing")
        for column in columns_preprocessing:
            transforms = columns_preprocessing[column]
            # logger.debug("Preprocess column {} with: {}".format(column, transforms))

        # remove empty or constant columns
        cols_to_remove = list(
            filter(
                lambda k: "remove_column" in columns_preprocessing[k],
                columns_preprocessing,
            )
        )

        if X_train is not None:
            X_train.drop(cols_to_remove, axis=1, inplace=True)
        self._remove_columns = cols_to_remove

        numeric_cols = []  # get numeric cols before text transformations
        # needed for golden features
        if X_train is not None and "golden_features" in self._params:
            numeric_cols = X_train.select_dtypes(include="number").columns.tolist()

        # there can be missing values in the text data,
        # but we don't want to handle it by fill missing methods
        # zeros will be imputed by text_transform method
        cols_to_process = list(
            filter(
                lambda k: "text_transform" in columns_preprocessing[k],
                columns_preprocessing,
            )
        )

        new_text_columns = []
        for col in cols_to_process:
            t = TextTransformer()
            t.fit(X_train, col)
            X_train = t.transform(X_train)
            self._text_transforms += [t]
            new_text_columns += t._new_columns
        # end of text transform

        for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
            cols_to_process = list(
                filter(
                    lambda k: missing_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            missing = PreprocessingMissingValues(cols_to_process, missing_method)
            missing.fit(X_train)
            X_train = missing.transform(X_train)
            self._missing_values += [missing]

        # golden features
        golden_columns = []
        if "golden_features" in self._params:
            results_path = self._params["golden_features"]["results_path"]
            ml_task = self._params["golden_features"]["ml_task"]
            # if ml_task in [BINARY_CLASSIFICATION]:
            self._golden_features = GoldenFeaturesTransformer(results_path, ml_task)
            self._golden_features.fit(X_train[numeric_cols], y_train)
            X_train = self._golden_features.transform(X_train)
            golden_columns = self._golden_features._new_columns

        for convert_method in [
            PreprocessingCategorical.CONVERT_INTEGER,
            PreprocessingCategorical.CONVERT_ONE_HOT,
        ]:
            cols_to_process = list(
                filter(
                    lambda k: convert_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            convert = PreprocessingCategorical(cols_to_process, convert_method)
            convert.fit(X_train)
            X_train = convert.transform(X_train)
            self._categorical += [convert]

        # datetime transform
        cols_to_process = list(
            filter(
                lambda k: "datetime_transform" in columns_preprocessing[k],
                columns_preprocessing,
            )
        )

        new_datetime_columns = []
        for col in cols_to_process:

            t = DateTimeTransformer()
            t.fit(X_train, col)
            X_train = t.transform(X_train)
            self._datetime_transforms += [t]
            new_datetime_columns += t._new_columns

        # SCALE
        for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]:
            cols_to_process = list(
                filter(
                    lambda k: scale_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            if (
                len(cols_to_process)
                and len(new_datetime_columns)
                and scale_method == Scale.SCALE_NORMAL
            ):
                cols_to_process += new_datetime_columns
            if (
                len(cols_to_process)
                and len(new_text_columns)
                and scale_method == Scale.SCALE_NORMAL
            ):
                cols_to_process += new_text_columns

            if (
                len(cols_to_process)
                and len(golden_columns)
                and scale_method == Scale.SCALE_NORMAL
            ):
                cols_to_process += golden_columns

            if len(cols_to_process):
                scale = Scale(cols_to_process)
                scale.fit(X_train)
                X_train = scale.transform(X_train)
                self._scale += [scale]

        if self._add_random_feature:
            # -1, 1, with 0 mean
            X_train["random_feature"] = np.random.rand(X_train.shape[0]) * 2.0 - 1.0

        if self._drop_features:
            available_cols = X_train.columns.tolist()
            drop_cols = [c for c in self._drop_features if c in available_cols]
            if drop_cols:
                X_train.drop(drop_cols, axis=1, inplace=True)
            self._drop_features = drop_cols

        return X_train, y_train

    def transform(self, X_validation, y_validation):
        logger.debug("Preprocessing.transform")

        # doing copy to avoid SettingWithCopyWarning
        if X_validation is not None:
            X_validation = X_validation.copy(deep=False)
        if y_validation is not None:
            y_validation = y_validation.copy(deep=False)

        # target preprocessing
        # this must be used first, maybe we will drop some rows because of missing target values
        if y_validation is not None:
            target_preprocessing = self._params.get("target_preprocessing")
            logger.debug("target_preprocessing -> {}".format(target_preprocessing))

            X_validation, y_validation = ExcludeRowsMissingTarget.transform(
                X_validation, y_validation
            )

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                if y_validation is not None and self._categorical_y is not None:
                    y_validation = pd.Series(
                        self._categorical_y.transform(y_validation)
                    )

            if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
                if y_validation is not None and self._categorical_y is not None:
                    y_validation = self._categorical_y.transform(
                        pd.DataFrame({"target": y_validation}), "target"
                    )

            if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                if self._scale_y is not None and y_validation is not None:
                    logger.debug("Transform log and normalize")
                    y_validation = pd.DataFrame({"target": y_validation})
                    y_validation = self._scale_y.transform(y_validation)
                    y_validation = y_validation["target"]

            if Scale.SCALE_NORMAL in target_preprocessing:
                if self._scale_y is not None and y_validation is not None:
                    logger.debug("Transform normalize")
                    y_validation = pd.DataFrame({"target": y_validation})
                    y_validation = self._scale_y.transform(y_validation)
                    y_validation = y_validation["target"]

        # columns preprocessing
        if len(self._remove_columns) and X_validation is not None:
            cols_to_remove = [
                col for col in X_validation.columns if col in self._remove_columns
            ]
            X_validation.drop(cols_to_remove, axis=1, inplace=True)

        # text transform
        for tt in self._text_transforms:
            if X_validation is not None and tt is not None:
                X_validation = tt.transform(X_validation)

        for missing in self._missing_values:
            if X_validation is not None and missing is not None:
                X_validation = missing.transform(X_validation)

        # to be sure that all missing are filled
        # in case new data there can be gaps!
        if (
            X_validation is not None
            and np.sum(np.sum(pd.isnull(X_validation))) > 0
            and len(self._params["columns_preprocessing"]) > 0
        ):
            # there is something missing, fill it
            # we should notice user about it!
            # warnings should go to the separate file ...
            # warnings.warn(
            #    "There are columns {} with missing values which didnt have missing values in train dataset.".format(
            #        list(
            #            X_validation.columns[np.where(np.sum(pd.isnull(X_validation)))]
            #        )
            #    )
            # )
            missing = PreprocessingMissingValues(
                X_validation.columns, PreprocessingMissingValues.FILL_NA_MEDIAN
            )
            missing.fit(X_validation)
            X_validation = missing.transform(X_validation)

        # golden features
        if self._golden_features is not None:
            X_validation = self._golden_features.transform(X_validation)

        for convert in self._categorical:
            if X_validation is not None and convert is not None:
                X_validation = convert.transform(X_validation)

        for dtt in self._datetime_transforms:
            if X_validation is not None and dtt is not None:
                X_validation = dtt.transform(X_validation)

        for scale in self._scale:
            if X_validation is not None and scale is not None:
                X_validation = scale.transform(X_validation)

        if self._add_random_feature:
            # -1, 1, with 0 mean
            X_validation["random_feature"] = (
                np.random.rand(X_validation.shape[0]) * 2.0 - 1.0
            )

        if self._drop_features and X_validation is not None:
            X_validation.drop(self._drop_features, axis=1, inplace=True)

        return X_validation, y_validation

    def inverse_scale_target(self, y):
        if self._scale_y is not None:
            y = pd.DataFrame({"target": y})
            y = self._scale_y.inverse_transform(y)
            y = y["target"]
        return y

    def inverse_categorical_target(self, y):
        if self._categorical_y is not None:
            y = self._categorical_y.inverse_transform(
                pd.DataFrame({"target": np.array(y)})
            )
            y = y.astype(str)
        return y

    def get_target_class_names(self):
        pos_label, neg_label = "1", "0"
        if self._categorical_y is not None:
            if self._params["ml_task"] == BINARY_CLASSIFICATION:
                # binary classification
                for label, value in self._categorical_y.to_json().items():
                    if value == 1:
                        pos_label = label
                    else:
                        neg_label = label
                return [neg_label, pos_label]
            else:
                # multiclass classification
                # logger.debug(self._categorical_y.to_json())
                if "unique_values" not in self._categorical_y.to_json():
                    labels = dict(
                        (v, k) for k, v in self._categorical_y.to_json().items()
                    )
                else:
                    labels = {
                        i: v
                        for i, v in enumerate(
                            self._categorical_y.to_json()["unique_values"]
                        )
                    }

                return list(labels.values())

        else:  # self._categorical_y is None
            if "ml_task" in self._params:
                if self._params["ml_task"] == BINARY_CLASSIFICATION:
                    return ["0", "1"]
        return []

    def prepare_target_labels(self, y):
        pos_label, neg_label = "1", "0"

        if self._categorical_y is not None:
            if len(y.shape) == 1:
                # binary classification
                for label, value in self._categorical_y.to_json().items():
                    if value == 1:
                        pos_label = label
                    else:
                        neg_label = label
                # threshold is applied in AutoML class
                return pd.DataFrame(
                    {
                        "prediction_{}".format(neg_label): 1 - y,
                        "prediction_{}".format(pos_label): y,
                    }
                )
            else:
                # multiclass classification

                if "unique_values" not in self._categorical_y.to_json():
                    labels = dict(
                        (v, k) for k, v in self._categorical_y.to_json().items()
                    )
                else:
                    labels = {
                        i: v
                        for i, v in enumerate(
                            self._categorical_y.to_json()["unique_values"]
                        )
                    }
                d = {}
                cols = []
                for i in range(y.shape[1]):
                    d["prediction_{}".format(labels[i])] = y[:, i]
                    cols += ["prediction_{}".format(labels[i])]
                df = pd.DataFrame(d)
                df["label"] = np.argmax(np.array(df[cols]), axis=1)

                df["label"] = df["label"].map(labels)

                return df
        else:  # self._categorical_y is None
            if "ml_task" in self._params:
                if self._params["ml_task"] == BINARY_CLASSIFICATION:
                    return pd.DataFrame({"prediction_0": 1 - y, "prediction_1": y})
                elif self._params["ml_task"] == MULTICLASS_CLASSIFICATION:
                    return pd.DataFrame(
                        data=y,
                        columns=["prediction_{}".format(i) for i in range(y.shape[1])],
                    )

        return pd.DataFrame({"prediction": y})

    def to_json(self):
        preprocessing_params = {}
        if self._remove_columns:
            preprocessing_params["remove_columns"] = self._remove_columns
        if self._missing_values is not None and len(self._missing_values):
            mvs = []  # refactor
            for mv in self._missing_values:
                if mv.to_json():
                    mvs += [mv.to_json()]
            if mvs:
                preprocessing_params["missing_values"] = mvs
        if self._categorical is not None and len(self._categorical):
            cats = []  # refactor
            for cat in self._categorical:
                if cat.to_json():
                    cats += [cat.to_json()]
            if cats:
                preprocessing_params["categorical"] = cats

        if self._datetime_transforms is not None and len(self._datetime_transforms):
            dtts = []
            for dtt in self._datetime_transforms:
                dtts += [dtt.to_json()]
            if dtts:
                preprocessing_params["datetime_transforms"] = dtts

        if self._text_transforms is not None and len(self._text_transforms):
            tts = []
            for tt in self._text_transforms:
                tts += [tt.to_json()]
            if tts:
                preprocessing_params["text_transforms"] = tts

        if self._golden_features is not None:
            preprocessing_params["golden_features"] = self._golden_features.to_json()

        if self._scale is not None and len(self._scale):
            scs = [sc.to_json() for sc in self._scale if sc.to_json()]
            if scs:
                preprocessing_params["scale"] = scs
        if self._categorical_y is not None:
            cat_y = self._categorical_y.to_json()
            if cat_y:
                preprocessing_params["categorical_y"] = cat_y
        if self._scale_y is not None:
            preprocessing_params["scale_y"] = self._scale_y.to_json()

        if "ml_task" in self._params:
            preprocessing_params["ml_task"] = self._params["ml_task"]

        if self._add_random_feature:
            preprocessing_params["add_random_feature"] = True

        if self._drop_features:
            preprocessing_params["drop_features"] = self._drop_features

        return preprocessing_params

    def from_json(self, data_json):

        if "remove_columns" in data_json:
            self._remove_columns = data_json.get("remove_columns", [])
        if "missing_values" in data_json:
            self._missing_values = []
            for mv_data in data_json["missing_values"]:
                mv = PreprocessingMissingValues()
                mv.from_json(mv_data)
                self._missing_values += [mv]
        if "categorical" in data_json:
            self._categorical = []
            for cat_data in data_json["categorical"]:
                cat = PreprocessingCategorical()
                cat.from_json(cat_data)
                self._categorical += [cat]

        if "datetime_transforms" in data_json:
            self._datetime_transforms = []
            for dtt_params in data_json["datetime_transforms"]:
                dtt = DateTimeTransformer()
                dtt.from_json(dtt_params)
                self._datetime_transforms += [dtt]

        if "text_transforms" in data_json:
            self._text_transforms = []
            for tt_params in data_json["text_transforms"]:
                tt = TextTransformer()
                tt.from_json(tt_params)
                self._text_transforms += [tt]

        if "golden_features" in data_json:
            self._golden_features = GoldenFeaturesTransformer()
            self._golden_features.from_json(data_json["golden_features"])

        if "scale" in data_json:
            self._scale = []
            for scale_data in data_json["scale"]:
                sc = Scale()
                sc.from_json(scale_data)
                self._scale += [sc]
        if "categorical_y" in data_json:
            if "new_columns" in data_json["categorical_y"]:
                self._categorical_y = LabelBinarizer()
            else:
                self._categorical_y = LabelEncoder()

            self._categorical_y.from_json(data_json["categorical_y"])
        if "scale_y" in data_json:
            self._scale_y = Scale()
            self._scale_y.from_json(data_json["scale_y"])
        if "ml_task" in data_json:
            self._params["ml_task"] = data_json["ml_task"]

        self._add_random_feature = data_json.get("add_random_feature", False)
        self._drop_features = data_json.get("drop_features", [])
Exemple #4
0
    def fit_and_transform(self, X_train, y_train):
        logger.debug("Preprocessing.fit_and_transform")

        if y_train is not None:
            # target preprocessing
            # this must be used first, maybe we will drop some rows because of missing target values
            target_preprocessing = self._params.get("target_preprocessing")
            logger.debug("target_preprocessing params: {}".format(target_preprocessing))

            X_train, y_train = ExcludeRowsMissingTarget.transform(X_train, y_train)

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                logger.debug("Convert target to integer")
                self._categorical_y = LabelEncoder()
                self._categorical_y.fit(y_train)
                y_train = pd.Series(self._categorical_y.transform(y_train))

            if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
                logger.debug("Convert target to one-hot coding")
                self._categorical_y = LabelBinarizer()
                self._categorical_y.fit(pd.DataFrame({"target": y_train}), "target")
                y_train = self._categorical_y.transform(
                    pd.DataFrame({"target": y_train}), "target"
                )

            if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                logger.debug("Scale log and normal")

                self._scale_y = Scale(
                    ["target"], scale_method=Scale.SCALE_LOG_AND_NORMAL
                )
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

            if Scale.SCALE_NORMAL in target_preprocessing:
                logger.debug("Scale normal")

                self._scale_y = Scale(["target"], scale_method=Scale.SCALE_NORMAL)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

        # columns preprocessing
        columns_preprocessing = self._params.get("columns_preprocessing")
        for column in columns_preprocessing:
            transforms = columns_preprocessing[column]
            # logger.debug("Preprocess column {} with: {}".format(column, transforms))

        # remove empty or constant columns
        cols_to_remove = list(
            filter(
                lambda k: "remove_column" in columns_preprocessing[k],
                columns_preprocessing,
            )
        )

        if X_train is not None:
            X_train.drop(cols_to_remove, axis=1, inplace=True)
        self._remove_columns = cols_to_remove

        # there can be missing values in the text data,
        # but we don't want to handle it by fill missing methods
        # zeros will be imputed by text_transform method
        cols_to_process = list(
            filter(
                lambda k: "text_transform" in columns_preprocessing[k],
                columns_preprocessing,
            )
        )

        new_text_columns = []
        for col in cols_to_process:
            t = TextTransformer()
            t.fit(X_train, col)
            X_train = t.transform(X_train)
            self._text_transforms += [t]
            new_text_columns += t._new_columns
        # end of text transform

        for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
            cols_to_process = list(
                filter(
                    lambda k: missing_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            missing = PreprocessingMissingValues(cols_to_process, missing_method)
            missing.fit(X_train)
            X_train = missing.transform(X_train)
            self._missing_values += [missing]

        for convert_method in [
            PreprocessingCategorical.CONVERT_INTEGER,
            PreprocessingCategorical.CONVERT_ONE_HOT,
        ]:
            cols_to_process = list(
                filter(
                    lambda k: convert_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            convert = PreprocessingCategorical(cols_to_process, convert_method)
            convert.fit(X_train)
            X_train = convert.transform(X_train)
            self._categorical += [convert]

        # datetime transform
        cols_to_process = list(
            filter(
                lambda k: "datetime_transform" in columns_preprocessing[k],
                columns_preprocessing,
            )
        )

        new_datetime_columns = []
        for col in cols_to_process:

            t = DateTimeTransformer()
            t.fit(X_train, col)
            X_train = t.transform(X_train)
            self._datetime_transforms += [t]
            new_datetime_columns += t._new_columns

        # SCALE
        for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]:
            cols_to_process = list(
                filter(
                    lambda k: scale_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            if (
                len(cols_to_process)
                and len(new_datetime_columns)
                and scale_method == Scale.SCALE_NORMAL
            ):
                cols_to_process += new_datetime_columns
            if (
                len(cols_to_process)
                and len(new_text_columns)
                and scale_method == Scale.SCALE_NORMAL
            ):
                cols_to_process += new_text_columns

            if len(cols_to_process):
                scale = Scale(cols_to_process)
                scale.fit(X_train)
                X_train = scale.transform(X_train)
                self._scale += [scale]

        return X_train, y_train
    def fit_and_transform(self, X_train, y_train):
        logger.debug("Preprocessing.fit_and_transform")

        if y_train is not None:
            # target preprocessing
            # this must be used first, maybe we will drop some rows because of missing target values
            target_preprocessing = self._params.get("target_preprocessing")
            logger.debug("target_preprocessing params: {}".format(target_preprocessing))

            X_train, y_train = ExcludeRowsMissingTarget.transform(X_train, y_train)

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                logger.debug("Convert target to integer")
                self._categorical_y = LabelEncoder()
                self._categorical_y.fit(y_train)
                y_train = pd.Series(self._categorical_y.transform(y_train))

            if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
                logger.debug("Convert target to one-hot coding")
                self._categorical_y = LabelBinarizer()
                self._categorical_y.fit(pd.DataFrame({"target": y_train}), "target")
                y_train = self._categorical_y.transform(
                    pd.DataFrame({"target": y_train}), "target"
                )

            if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                logger.debug("Scale log and normal")

                self._scale_y = Scale(
                    ["target"], scale_method=Scale.SCALE_LOG_AND_NORMAL
                )
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

            if Scale.SCALE_NORMAL in target_preprocessing:
                logger.error("not implemented SCALE_NORMAL")
                raise Exception("not implemented SCALE_NORMAL")

        # columns preprocessing
        columns_preprocessing = self._params.get("columns_preprocessing")
        for column in columns_preprocessing:
            transforms = columns_preprocessing[column]
            # logger.debug("Preprocess column {} with: {}".format(column, transforms))

        # remove empty or constant columns
        cols_to_remove = list(
            filter(
                lambda k: "remove_column" in columns_preprocessing[k],
                columns_preprocessing,
            )
        )

        if X_train is not None:
            X_train.drop(cols_to_remove, axis=1, inplace=True)
        self._remove_columns = cols_to_remove

        for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
            cols_to_process = list(
                filter(
                    lambda k: missing_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            missing = PreprocessingMissingValues(cols_to_process, missing_method)
            missing.fit(X_train)
            X_train = missing.transform(X_train)
            self._missing_values += [missing]

        for convert_method in [PreprocessingCategorical.CONVERT_INTEGER]:
            cols_to_process = list(
                filter(
                    lambda k: convert_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            convert = PreprocessingCategorical(cols_to_process, convert_method)
            convert.fit(X_train)
            X_train = convert.transform(X_train)
            self._categorical += [convert]

        # SCALE
        for scale_method in [Scale.SCALE_NORMAL]:
            cols_to_process = list(
                filter(
                    lambda k: scale_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            if len(cols_to_process):
                scale = Scale(cols_to_process)
                scale.fit(X_train)
                X_train = scale.transform(X_train)
                self._scale += [scale]

        return X_train, y_train
class Preprocessing(object):
    def __init__(
        self,
        preprocessing_params={"target_preprocessing": [], "columns_preprocessing": {}},
    ):
        self._params = preprocessing_params

        if "target_preprocessing" not in preprocessing_params:
            self._params["target_preprocessing"] = []
        if "columns_preprocessing" not in preprocessing_params:
            self._params["columns_preprocessing"] = {}

        # preprocssing step attributes
        self._categorical_y = None
        self._scale_y = None
        self._missing_values = []
        self._categorical = []
        self._scale = []
        self._remove_columns = []

    def _exclude_missing_targets(self, X=None, y=None):
        # check if there are missing values in target column
        if y is None:
            return X, y
        y_missing = pd.isnull(y)
        if np.sum(np.array(y_missing)) == 0:
            return X, y
        y = y.drop(y.index[y_missing])
        y.index = range(y.shape[0])
        if X is not None:
            X = X.drop(X.index[y_missing])
            X.index = range(X.shape[0])
        return X, y

    # fit and transform
    def fit_and_transform(self, X_train, y_train):
        logger.debug("Preprocessing.fit_and_transform")

        if y_train is not None:
            # target preprocessing
            # this must be used first, maybe we will drop some rows because of missing target values
            target_preprocessing = self._params.get("target_preprocessing")
            logger.debug("target_preprocessing params: {}".format(target_preprocessing))

            X_train, y_train = ExcludeRowsMissingTarget.transform(X_train, y_train)

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                logger.debug("Convert target to integer")
                self._categorical_y = LabelEncoder()
                self._categorical_y.fit(y_train)
                y_train = pd.Series(self._categorical_y.transform(y_train))

            if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
                logger.debug("Convert target to one-hot coding")
                self._categorical_y = LabelBinarizer()
                self._categorical_y.fit(pd.DataFrame({"target": y_train}), "target")
                y_train = self._categorical_y.transform(
                    pd.DataFrame({"target": y_train}), "target"
                )

            if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                logger.debug("Scale log and normal")

                self._scale_y = Scale(
                    ["target"], scale_method=Scale.SCALE_LOG_AND_NORMAL
                )
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)
                y_train = y_train["target"]

            if Scale.SCALE_NORMAL in target_preprocessing:
                logger.error("not implemented SCALE_NORMAL")
                raise Exception("not implemented SCALE_NORMAL")

        # columns preprocessing
        columns_preprocessing = self._params.get("columns_preprocessing")
        for column in columns_preprocessing:
            transforms = columns_preprocessing[column]
            # logger.debug("Preprocess column {} with: {}".format(column, transforms))

        # remove empty or constant columns
        cols_to_remove = list(
            filter(
                lambda k: "remove_column" in columns_preprocessing[k],
                columns_preprocessing,
            )
        )

        if X_train is not None:
            X_train.drop(cols_to_remove, axis=1, inplace=True)
        self._remove_columns = cols_to_remove

        for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
            cols_to_process = list(
                filter(
                    lambda k: missing_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            missing = PreprocessingMissingValues(cols_to_process, missing_method)
            missing.fit(X_train)
            X_train = missing.transform(X_train)
            self._missing_values += [missing]

        for convert_method in [PreprocessingCategorical.CONVERT_INTEGER]:
            cols_to_process = list(
                filter(
                    lambda k: convert_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            convert = PreprocessingCategorical(cols_to_process, convert_method)
            convert.fit(X_train)
            X_train = convert.transform(X_train)
            self._categorical += [convert]

        # SCALE
        for scale_method in [Scale.SCALE_NORMAL]:
            cols_to_process = list(
                filter(
                    lambda k: scale_method in columns_preprocessing[k],
                    columns_preprocessing,
                )
            )
            if len(cols_to_process):
                scale = Scale(cols_to_process)
                scale.fit(X_train)
                X_train = scale.transform(X_train)
                self._scale += [scale]

        return X_train, y_train

    def transform(self, X_validation, y_validation):
        logger.debug("Preprocessing.transform")

        # doing copy to avoid SettingWithCopyWarning
        if X_validation is not None:
            X_validation = X_validation.copy(deep=False)
        if y_validation is not None:
            y_validation = y_validation.copy(deep=False)

        # target preprocessing
        # this must be used first, maybe we will drop some rows because of missing target values
        if y_validation is not None:
            target_preprocessing = self._params.get("target_preprocessing")
            logger.debug("target_preprocessing -> {}".format(target_preprocessing))

            X_validation, y_validation = ExcludeRowsMissingTarget.transform(
                X_validation, y_validation
            )

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                if y_validation is not None and self._categorical_y is not None:
                    y_validation = pd.Series(
                        self._categorical_y.transform(y_validation)
                    )

            if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
                if y_validation is not None and self._categorical_y is not None:
                    y_validation = self._categorical_y.transform(
                        pd.DataFrame({"target": y_validation}), "target"
                    )

            if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                if self._scale_y is not None and y_validation is not None:
                    logger.debug("Transform log and normalize")
                    y_validation = pd.DataFrame({"target": y_validation})
                    y_validation = self._scale_y.transform(y_validation)
                    y_validation = y_validation["target"]

            if Scale.SCALE_NORMAL in target_preprocessing:
                logger.error("not implemented SCALE_NORMAL")
                raise Exception("not implemented SCALE_NORMAL")

        # columns preprocessing
        if len(self._remove_columns) and X_validation is not None:
            cols_to_remove = [
                col for col in X_validation.columns if col in self._remove_columns
            ]
            X_validation.drop(cols_to_remove, axis=1, inplace=True)

        for missing in self._missing_values:
            if X_validation is not None and missing is not None:
                X_validation = missing.transform(X_validation)
        # to be sure that all missing are filled
        # in case new data there can be gaps!
        if X_validation is not None and np.sum(np.sum(pd.isnull(X_validation))) > 0:
            # there is something missing, fill it
            # we should notice user about it!
            warnings.warn(
                "There are columns {} with missing values which didnt have missing values in train dataset.".format(
                    list(
                        X_validation.columns[np.where(np.sum(pd.isnull(X_validation)))]
                    )
                )
            )
            missing = PreprocessingMissingValues(
                X_validation.columns, PreprocessingMissingValues.FILL_NA_MEDIAN
            )
            missing.fit(X_validation)
            X_validation = missing.transform(X_validation)
        for convert in self._categorical:
            if X_validation is not None and convert is not None:
                X_validation = convert.transform(X_validation)
        for scale in self._scale:
            if X_validation is not None and scale is not None:
                X_validation = scale.transform(X_validation)

        return X_validation, y_validation

    def inverse_scale_target(self, y):
        if self._scale_y is not None:
            y = pd.DataFrame({"target": y})
            y = self._scale_y.inverse_transform(y)
            y = y["target"]
        return y

    def inverse_categorical_target(self, y):
        if self._categorical_y is not None:
            y = self._categorical_y.inverse_transform(
                pd.DataFrame({"target": np.array(y)})
            )
            y = y.astype(str)
        return y

    def prepare_target_labels(self, y):
        pos_label, neg_label = "1", "0"
        if self._categorical_y is not None:
            if len(y.shape) == 1:
                # binary classification
                for label, value in self._categorical_y.to_json().items():
                    if value == 1:
                        pos_label = label
                    else:
                        neg_label = label
                # threshold is applied in AutoML class
                return pd.DataFrame(
                    {
                        "prediction_{}".format(neg_label): 1 - y,
                        "prediction_{}".format(pos_label): y,
                    }
                )
            else:
                # multiclass classification
                # logger.debug(self._categorical_y.to_json())
                if "unique_values" not in self._categorical_y.to_json():
                    labels = dict(
                        (v, k) for k, v in self._categorical_y.to_json().items()
                    )
                else:
                    labels = {
                        i: v
                        for i, v in enumerate(
                            self._categorical_y.to_json()["unique_values"]
                        )
                    }
                d = {}
                cols = []
                for i in range(y.shape[1]):
                    d["prediction_{}".format(labels[i])] = y[:, i]
                    cols += ["prediction_{}".format(labels[i])]
                df = pd.DataFrame(d)
                df["label"] = np.argmax(np.array(df[cols]), axis=1)

                df["label"] = df["label"].map(labels)
                return df
        else:  # self._categorical_y is None
            if "ml_task" in self._params:
                if self._params["ml_task"] == BINARY_CLASSIFICATION:
                    return pd.DataFrame({"prediction_0": 1 - y, "prediction_1": y})
                elif self._params["ml_task"] == MULTICLASS_CLASSIFICATION:
                    return pd.DataFrame(
                        data=y,
                        columns=["prediction_{}".format(i) for i in range(y.shape[1])],
                    )

        return pd.DataFrame({"prediction": y})

    def to_json(self):
        preprocessing_params = {}
        if self._remove_columns:
            preprocessing_params["remove_columns"] = self._remove_columns
        if self._missing_values is not None and len(self._missing_values):
            mvs = []  # refactor
            for mv in self._missing_values:
                if mv.to_json():
                    mvs += [mv.to_json()]
            if mvs:
                preprocessing_params["missing_values"] = mvs
        if self._categorical is not None and len(self._categorical):
            cats = []  # refactor
            for cat in self._categorical:
                if cat.to_json():
                    cats += [cat.to_json()]
            if cats:
                preprocessing_params["categorical"] = cats
        if self._scale is not None and len(self._scale):
            scs = [sc.to_json() for sc in self._scale if sc.to_json()]
            if scs:
                preprocessing_params["scale"] = scs
        if self._categorical_y is not None:
            cat_y = self._categorical_y.to_json()
            if cat_y:
                preprocessing_params["categorical_y"] = cat_y
        if self._scale_y is not None:
            preprocessing_params["scale_y"] = self._scale_y.to_json()

        if "ml_task" in self._params:
            preprocessing_params["ml_task"] = self._params["ml_task"]
        return preprocessing_params

    def from_json(self, data_json):
        if "remove_columns" in data_json:
            self._remove_columns = data_json.get("remove_columns", [])
        if "missing_values" in data_json:
            self._missing_values = []
            for mv_data in data_json["missing_values"]:
                mv = PreprocessingMissingValues()
                mv.from_json(mv_data)
                self._missing_values += [mv]
        if "categorical" in data_json:
            self._categorical = []
            for cat_data in data_json["categorical"]:
                cat = PreprocessingCategorical()
                cat.from_json(cat_data)
                self._categorical += [cat]
        if "scale" in data_json:
            self._scale = []
            for scale_data in data_json["scale"]:
                sc = Scale()
                sc.from_json(scale_data)
                self._scale += [sc]
        if "categorical_y" in data_json:
            self._categorical_y = LabelEncoder()
            self._categorical_y.from_json(data_json["categorical_y"])
        if "scale_y" in data_json:
            self._scale_y = Scale()
            self._scale_y.from_json(data_json["scale_y"])
        if "ml_task" in data_json:
            self._params["ml_task"] = data_json["ml_task"]
Exemple #7
0
    def run(self, train_data=None, validation_data=None):
        log.debug("PreprocessingStep.run")
        X_train, y_train = None, None
        if train_data is not None:
            if "X" in train_data:
                X_train = train_data.get("X").copy()
            if "y" in train_data:
                y_train = train_data.get("y").copy()
        X_validation, y_validation = None, None
        if validation_data is not None:
            if "X" in validation_data:
                X_validation = validation_data.get("X").copy()
            if "y" in validation_data:
                y_validation = validation_data.get("y").copy()

        if y_train is not None:
            # target preprocessing
            # this must be used first, maybe we will drop some rows because of missing target values
            target_preprocessing = self._params.get("target_preprocessing")
            log.debug(
                "target_preprocessing -> {}".format(target_preprocessing))

            # if PreprocessingMissingValues.NA_EXCLUDE in target_preprocessing:
            X_train, y_train = PreprocessingExcludeMissingValues.transform(
                X_train, y_train)
            if validation_data is not None:
                X_validation, y_validation = PreprocessingExcludeMissingValues.transform(
                    X_validation, y_validation)

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                self._categorical_y = LabelEncoder()
                self._categorical_y.fit(y_train)
                y_train = pd.Series(self._categorical_y.transform(y_train))
                if y_validation is not None and self._categorical_y is not None:
                    y_validation = pd.Series(
                        self._categorical_y.transform(y_validation))

            if PreprocessingScale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                log.error("not implemented SCALE_LOG_AND_NORMAL")
                raise Exception("not implemented SCALE_LOG_AND_NORMAL")

            if PreprocessingScale.SCALE_NORMAL in target_preprocessing:
                log.error("not implemented SCALE_NORMAL")
                raise Exception("not implemented SCALE_NORMAL")

        # columns preprocessing
        columns_preprocessing = self._params.get("columns_preprocessing")
        for column in columns_preprocessing:
            transforms = columns_preprocessing[column]
            log.debug("Preprocess column -> {}, {}".format(column, transforms))

        # remove empty or constant columns
        cols_to_remove = list(
            filter(
                lambda k: "remove_column" in columns_preprocessing[k],
                columns_preprocessing,
            ))

        if X_train is not None:
            X_train.drop(cols_to_remove, axis=1, inplace=True)
        if X_validation is not None:
            X_validation.drop(cols_to_remove, axis=1, inplace=True)
        self._remove_columns = cols_to_remove

        for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
            cols_to_process = list(
                filter(
                    lambda k: missing_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            missing = PreprocessingMissingValues(cols_to_process,
                                                 missing_method)
            missing.fit(X_train)
            X_train = missing.transform(X_train)
            if X_validation is not None:
                X_validation = missing.transform(X_validation)
            self._missing_values += [missing]

        for convert_method in [PreprocessingCategorical.CONVERT_INTEGER]:
            cols_to_process = list(
                filter(
                    lambda k: convert_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            convert = PreprocessingCategorical(cols_to_process, convert_method)
            convert.fit(X_train)
            X_train = convert.transform(X_train)
            if X_validation is not None:
                X_validation = convert.transform(X_validation)
            self._categorical += [convert]

        # SCALE
        for scale_method in [PreprocessingScale.SCALE_NORMAL]:
            cols_to_process = list(
                filter(
                    lambda k: scale_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            if len(cols_to_process):
                scale = PreprocessingScale(cols_to_process)
                scale.fit(X_train)
                X_train = scale.transform(X_train)
                if X_validation is not None:
                    X_validation = scale.transform(X_validation)
                self._scale += [scale]

        return {
            "X": X_train,
            "y": y_train
        }, {
            "X": X_validation,
            "y": y_validation
        }
Exemple #8
0
class PreprocessingStep(object):
    def __init__(
        self,
        preprocessing_params={
            "target_preprocessing": [],
            "columns_preprocessing": {}
        },
    ):
        self._params = preprocessing_params

        if "target_preprocessing" not in preprocessing_params:
            self._params["target_preprocessing"] = []
        if "columns_preprocessing" not in preprocessing_params:
            self._params["columns_preprocessing"] = {}

        # preprocssing step attributes
        self._categorical_y = None
        self._missing_values = []
        self._categorical = []
        self._scale = []
        self._remove_columns = []

    def _exclude_missing_targets(self, X=None, y=None):
        # check if there are missing values in target column
        if y is None:
            return X, y
        y_missing = pd.isnull(y)
        if np.sum(np.array(y_missing)) == 0:
            return X, y
        y = y.drop(y.index[y_missing])
        y.index = range(y.shape[0])
        if X is not None:
            X = X.drop(X.index[y_missing])
            X.index = range(X.shape[0])
        return X, y

    def run(self, train_data=None, validation_data=None):
        log.debug("PreprocessingStep.run")
        X_train, y_train = None, None
        if train_data is not None:
            if "X" in train_data:
                X_train = train_data.get("X").copy()
            if "y" in train_data:
                y_train = train_data.get("y").copy()
        X_validation, y_validation = None, None
        if validation_data is not None:
            if "X" in validation_data:
                X_validation = validation_data.get("X").copy()
            if "y" in validation_data:
                y_validation = validation_data.get("y").copy()

        if y_train is not None:
            # target preprocessing
            # this must be used first, maybe we will drop some rows because of missing target values
            target_preprocessing = self._params.get("target_preprocessing")
            log.debug(
                "target_preprocessing -> {}".format(target_preprocessing))

            # if PreprocessingMissingValues.NA_EXCLUDE in target_preprocessing:
            X_train, y_train = PreprocessingExcludeMissingValues.transform(
                X_train, y_train)
            if validation_data is not None:
                X_validation, y_validation = PreprocessingExcludeMissingValues.transform(
                    X_validation, y_validation)

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                self._categorical_y = LabelEncoder()
                self._categorical_y.fit(y_train)
                y_train = pd.Series(self._categorical_y.transform(y_train))
                if y_validation is not None and self._categorical_y is not None:
                    y_validation = pd.Series(
                        self._categorical_y.transform(y_validation))

            if PreprocessingScale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                log.error("not implemented SCALE_LOG_AND_NORMAL")
                raise Exception("not implemented SCALE_LOG_AND_NORMAL")

            if PreprocessingScale.SCALE_NORMAL in target_preprocessing:
                log.error("not implemented SCALE_NORMAL")
                raise Exception("not implemented SCALE_NORMAL")

        # columns preprocessing
        columns_preprocessing = self._params.get("columns_preprocessing")
        for column in columns_preprocessing:
            transforms = columns_preprocessing[column]
            log.debug("Preprocess column -> {}, {}".format(column, transforms))

        # remove empty or constant columns
        cols_to_remove = list(
            filter(
                lambda k: "remove_column" in columns_preprocessing[k],
                columns_preprocessing,
            ))

        if X_train is not None:
            X_train.drop(cols_to_remove, axis=1, inplace=True)
        if X_validation is not None:
            X_validation.drop(cols_to_remove, axis=1, inplace=True)
        self._remove_columns = cols_to_remove

        for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
            cols_to_process = list(
                filter(
                    lambda k: missing_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            missing = PreprocessingMissingValues(cols_to_process,
                                                 missing_method)
            missing.fit(X_train)
            X_train = missing.transform(X_train)
            if X_validation is not None:
                X_validation = missing.transform(X_validation)
            self._missing_values += [missing]

        for convert_method in [PreprocessingCategorical.CONVERT_INTEGER]:
            cols_to_process = list(
                filter(
                    lambda k: convert_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            convert = PreprocessingCategorical(cols_to_process, convert_method)
            convert.fit(X_train)
            X_train = convert.transform(X_train)
            if X_validation is not None:
                X_validation = convert.transform(X_validation)
            self._categorical += [convert]

        # SCALE
        for scale_method in [PreprocessingScale.SCALE_NORMAL]:
            cols_to_process = list(
                filter(
                    lambda k: scale_method in columns_preprocessing[k],
                    columns_preprocessing,
                ))
            if len(cols_to_process):
                scale = PreprocessingScale(cols_to_process)
                scale.fit(X_train)
                X_train = scale.transform(X_train)
                if X_validation is not None:
                    X_validation = scale.transform(X_validation)
                self._scale += [scale]

        return {
            "X": X_train,
            "y": y_train
        }, {
            "X": X_validation,
            "y": y_validation
        }

    def transform(self, validation_data=None):
        log.debug("PreprocessingStep.transform")
        X_validation, y_validation = None, None
        if validation_data is not None:
            if "X" in validation_data:
                X_validation = validation_data.get("X").copy()
            if "y" in validation_data:
                y_validation = validation_data.get("y").copy()

        # target preprocessing
        # this must be used first, maybe we will drop some rows because of missing target values
        target_preprocessing = self._params.get("target_preprocessing")
        log.debug("target_preprocessing -> {}".format(target_preprocessing))

        if validation_data is not None:
            X_validation, y_validation = PreprocessingExcludeMissingValues.transform(
                X_validation, y_validation)

        if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
            if y_validation is not None and self._categorical_y is not None:
                y_validation = pd.Series(
                    self._categorical_y.transform(y_validation))

        if PreprocessingScale.SCALE_LOG_AND_NORMAL in target_preprocessing:
            log.error("not implemented SCALE_LOG_AND_NORMAL")
            raise Exception("not implemented SCALE_LOG_AND_NORMAL")

        if PreprocessingScale.SCALE_NORMAL in target_preprocessing:
            log.error("not implemented SCALE_NORMAL")
            raise Exception("not implemented SCALE_NORMAL")

        # columns preprocessing
        if len(self._remove_columns) and X_validation is not None:
            cols_to_remove = [
                col for col in X_validation.columns
                if col in self._remove_columns
            ]
            X_validation.drop(cols_to_remove, axis=1, inplace=True)

        for missing in self._missing_values:
            if X_validation is not None and missing is not None:
                X_validation = missing.transform(X_validation)
        for convert in self._categorical:
            if X_validation is not None and convert is not None:
                X_validation = convert.transform(X_validation)
        for scale in self._scale:
            if X_validation is not None and scale is not None:
                X_validation = scale.transform(X_validation)

        return {"X": X_validation, "y": y_validation}

    def to_json(self):
        preprocessing_params = {}
        if self._remove_columns:
            preprocessing_params["remove_columns"] = self._remove_columns
        if self._missing_values is not None and len(self._missing_values):
            mvs = []  # refactor
            for mv in self._missing_values:
                if mv.to_json():
                    mvs += [mv.to_json()]
            if mvs:
                preprocessing_params["missing_values"] = mvs
        if self._categorical is not None and len(self._categorical):
            cats = []  # refactor
            for cat in self._categorical:
                if cat.to_json():
                    cats += [cat.to_json()]
            if cats:
                preprocessing_params["categorical"] = cats
        if self._scale is not None and len(self._scale):
            scs = [sc.to_json() for sc in self._scale if sc.to_json()]
            if scs:
                preprocessing_params["scale"] = scs
        if self._categorical_y is not None:
            cat_y = self._categorical_y.to_json()
            if cat_y:
                preprocessing_params["categorical_y"] = cat_y
        return preprocessing_params

    def reverse_transform_target(self, y):

        #target_preprocessing = self._params.get("target_preprocessing")
        # assume for now that all tasks are binary classification
        # if there is no target preprocessing, assume that there is 0 and 1 target
        pos_label, neg_label = "1", "0"
        if self._categorical_y is not None:
            for label, value in self._categorical_y.to_json().items():
                if value == 1:
                    pos_label = label
                else:
                    neg_label = label

        return pd.DataFrame({
            "p_{}".format(neg_label): 1 - y,
            "p_{}".format(pos_label): y
        })

    def from_json(self, data_json):
        if "remove_columns" in data_json:
            self._remove_columns = data_json.get("remove_columns", [])
        if "missing_values" in data_json:
            self._missing_values = []
            for mv_data in data_json["missing_values"]:
                mv = PreprocessingMissingValues()
                mv.from_json(mv_data)
                self._missing_values += [mv]
        if "categorical" in data_json:
            self._categorical = []
            for cat_data in data_json["categorical"]:
                cat = PreprocessingCategorical()
                cat.from_json(cat_data)
                self._categorical += [cat]
        if "scale" in data_json:
            self._scale = []
            for scale_data in data_json["scale"]:
                sc = PreprocessingScale()
                sc.from_json(scale_data)
                self._scale += [sc]
        if "categorical_y" in data_json:
            self._categorical_y = LabelEncoder()
            self._categorical_y.from_json(data_json["categorical_y"])