Esempio n. 1
0
    def test_fit(self):
        """Scaling fitted on ``col1`` centres that column and keeps ``col2``'s mean."""
        frame = pd.DataFrame(
            data={
                "col1": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10.0],
                "col2": [21, 22, 23, 24, 25, 26, 27, 28, 29, 30.0],
            }
        )

        # Only "col1" is registered with the scaler.
        scaler = PreprocessingScale(["col1"])
        scaler.fit(frame)
        scaled = scaler.transform(frame)

        expected_means = (("col1", 0), ("col2", 25.5))
        for column, expected in expected_means:
            assert_almost_equal(np.mean(scaled[column]), expected)
Esempio n. 2
0
    def test_to_and_from_json(self):
        """A scaler rebuilt via ``from_json`` centres ``col1`` and keeps ``col2``'s mean."""
        frame = pd.DataFrame(
            data={
                "col1": [1, 2, 3, 4, 5, 6, 7, 8.0, 9, 10],
                "col2": [21, 22.0, 23, 24, 25, 26, 27, 28, 29, 30],
            }
        )

        fitted = PreprocessingScale(["col1"])
        fitted.fit(frame)
        # Fitting alone (no transform yet) must leave the column means as they were.
        for column, expected in (("col1", 5.5), ("col2", 25.5)):
            assert_almost_equal(np.mean(frame[column]), expected)

        # Round-trip the fitted state into a freshly constructed scaler.
        dumped = fitted.to_json()
        print(dumped)
        restored = PreprocessingScale()
        restored.from_json(dumped)

        # The transform is done by the restored scaler, not the fitted one.
        transformed = restored.transform(frame)
        for column, expected in (("col1", 0), ("col2", 25.5)):
            assert_almost_equal(np.mean(transformed[column]), expected)
Esempio n. 3
0
    def run(self, train_data=None, validation_data=None):
        """Fit the preprocessing on the training data and apply it to both sets.

        The ``"X"`` / ``"y"`` entries of the inputs are copied before any
        processing. Transformers created along the way are stored on ``self``
        (``_categorical_y``, ``_remove_columns``, ``_missing_values``,
        ``_categorical``, ``_scale``).

        Args:
            train_data: Optional mapping that may hold ``"X"`` and ``"y"``.
            validation_data: Optional mapping with the same layout.

        Returns:
            A pair of dicts, each with ``"X"`` and ``"y"`` keys: the first for
            the training data, the second for the validation data.

        Raises:
            NotImplementedError: If ``SCALE_NORMAL`` or ``SCALE_LOG_AND_NORMAL``
                is requested for the target.
        """
        log.debug("PreprocessingStep.run")

        def copy_of(data, key):
            # Work on copies so later in-place edits never reach the caller's data.
            if data is not None and key in data:
                return data.get(key).copy()
            return None

        X_train = copy_of(train_data, "X")
        y_train = copy_of(train_data, "y")
        X_validation = copy_of(validation_data, "X")
        y_validation = copy_of(validation_data, "y")

        if y_train is not None:
            # Target preprocessing must come first: rows may be dropped here
            # because of missing target values.
            # A missing "target_preprocessing" entry means "no optional steps".
            target_preprocessing = self._params.get("target_preprocessing") or []
            log.debug(
                "target_preprocessing -> {}".format(target_preprocessing))

            # Missing-value exclusion is applied unconditionally.
            X_train, y_train = PreprocessingExcludeMissingValues.transform(
                X_train, y_train)
            if validation_data is not None:
                X_validation, y_validation = PreprocessingExcludeMissingValues.transform(
                    X_validation, y_validation)

            if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
                # The encoder is fitted on the training target only and then
                # reused for the validation target.
                self._categorical_y = LabelEncoder()
                self._categorical_y.fit(y_train)
                y_train = pd.Series(self._categorical_y.transform(y_train))
                if y_validation is not None:
                    y_validation = pd.Series(
                        self._categorical_y.transform(y_validation))

            if PreprocessingScale.SCALE_LOG_AND_NORMAL in target_preprocessing:
                log.error("not implemented SCALE_LOG_AND_NORMAL")
                raise NotImplementedError(
                    "not implemented SCALE_LOG_AND_NORMAL")

            if PreprocessingScale.SCALE_NORMAL in target_preprocessing:
                log.error("not implemented SCALE_NORMAL")
                raise NotImplementedError("not implemented SCALE_NORMAL")

        # Columns preprocessing; a missing entry means there is nothing to do.
        columns_preprocessing = self._params.get("columns_preprocessing") or {}
        for column in columns_preprocessing:
            log.debug("Preprocess column -> {}, {}".format(
                column, columns_preprocessing[column]))

        def columns_with(step):
            # Names of the columns whose configured transforms include `step`.
            return [
                name for name in columns_preprocessing
                if step in columns_preprocessing[name]
            ]

        # Remove empty or constant columns.
        cols_to_remove = columns_with("remove_column")
        if X_train is not None:
            X_train.drop(cols_to_remove, axis=1, inplace=True)
        if X_validation is not None:
            X_validation.drop(cols_to_remove, axis=1, inplace=True)
        self._remove_columns = cols_to_remove

        # Missing values: fitted on train, then applied to train and validation.
        for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
            missing = PreprocessingMissingValues(columns_with(missing_method),
                                                 missing_method)
            missing.fit(X_train)
            X_train = missing.transform(X_train)
            if X_validation is not None:
                X_validation = missing.transform(X_validation)
            self._missing_values += [missing]

        # Categorical conversion: fitted on train, applied to both sets.
        for convert_method in [PreprocessingCategorical.CONVERT_INTEGER]:
            convert = PreprocessingCategorical(columns_with(convert_method),
                                               convert_method)
            convert.fit(X_train)
            X_train = convert.transform(X_train)
            if X_validation is not None:
                X_validation = convert.transform(X_validation)
            self._categorical += [convert]

        # Scaling: a scaler is only created when some column asks for it.
        for scale_method in [PreprocessingScale.SCALE_NORMAL]:
            cols_to_process = columns_with(scale_method)
            if cols_to_process:
                scale = PreprocessingScale(cols_to_process)
                scale.fit(X_train)
                X_train = scale.transform(X_train)
                if X_validation is not None:
                    X_validation = scale.transform(X_validation)
                self._scale += [scale]

        return {
            "X": X_train,
            "y": y_train
        }, {
            "X": X_validation,
            "y": y_validation
        }