Esempio n. 1
0
 def test_transform_float(self):
     """Transform with preset per-column fill values and float output.

     The table returned by ``Imputer.transform`` is compared with the rows built by
     the ``fit_test_data_float`` helper from the same fill values and missing markers.
     """
     missing_markers = ['', 'none', 'na', 'null', "10000", "-10000"]
     fill_values = [
         0.963102, 1.467675, 0.829202, 0.772457, 1.000835,
         0.962702, 1.077099, 1.053586, 2.996525, 0.961696,
     ]

     imputer = Imputer(missing_value_list=missing_markers)
     transformed = imputer.transform(self.table_instance, fill_values, output_format="float")

     expected_rows = self.fit_test_data_float(self.test_data, fill_values, missing_markers)
     self.assertListEqual(self.table_to_list(transformed), expected_rows)
Esempio n. 2
0
    def test_fit_none_replace_method(self):
        """Fit without naming a replace method and expect '0' reported for all ten columns."""
        missing_markers = ['NA', 'naaa']
        expected_fill_values = ['0'] * 10

        imputer = Imputer(missing_markers)
        fitted_table, fitted_fill_values = imputer.fit(self.table_instance, output_format='str')

        expected_rows = self.fit_test_data(self.test_data, expected_fill_values, missing_markers)
        self.assertListEqual(self.table_to_list(fitted_table), expected_rows)
        self.assertListEqual(fitted_fill_values, expected_fill_values)
Esempio n. 3
0
    def test_fit_min(self):
        """Fit with the "min" replace method and check both the table and the fill values."""
        missing_markers = ['', 'none', 'na', 'null', "10000", "-10000"]
        expected_fill_values = [
            -1.451067, -2.147457, -1.166747, -1.092337, -1.037534,
            -1.302401, -1.305831, -1.745063, -1.591501, -1.230554,
        ]

        imputer = Imputer(missing_value_list=missing_markers)
        fitted_table, fitted_fill_values = imputer.fit(self.table_instance, "min", output_format='str')

        expected_rows = self.fit_test_data(self.test_data, expected_fill_values, missing_markers)
        self.assertListEqual(self.table_to_list(fitted_table), expected_rows)
        self.assertListEqual(fitted_fill_values, expected_fill_values)
Esempio n. 4
0
    def test_fit_median(self):
        """Fit with the "median" replace method and check both the table and the fill values."""
        missing_markers = ['', 'none', 'na', 'null', "10000", "-10000"]
        expected_fill_values = [
            -0.606584, -0.193332, -0.620475, -0.591332, -0.327392,
            -0.519504, -0.610669, -0.768581, -0.28757, -0.247477,
        ]

        imputer = Imputer(missing_value_list=missing_markers)
        fitted_table, fitted_fill_values = imputer.fit(self.table_instance, "median", output_format='str')

        expected_rows = self.fit_test_data(self.test_data, expected_fill_values, missing_markers)
        self.assertListEqual(self.table_to_list(fitted_table), expected_rows)
        self.assertListEqual(fitted_fill_values, expected_fill_values)
Esempio n. 5
0
    def test_fit_mean(self):
        """Fit with the "mean" replace method and check both the table and the fill values."""
        imputer = Imputer(missing_value_list=['', 'none', 'na', 'null', "10000", "-10000"])
        fitted_table, fitted_fill_values = imputer.fit(self.table_instance, "mean", output_format='str')

        expected_fill_values = [
            -0.413542, -0.330818, -0.343831, -0.444957, -0.107726,
            -0.569688, -0.548734, -0.670353, 0.002498, -0.275518,
        ]
        # The expected rows are built from a list object separate from the one given to Imputer.
        missing_markers = ['', 'none', 'na', 'null', "10000", "-10000"]
        expected_rows = self.fit_test_data(self.test_data, expected_fill_values, missing_markers)

        self.assertListEqual(self.table_to_list(fitted_table), expected_rows)
        self.assertListEqual(fitted_fill_values, expected_fill_values)
Esempio n. 6
0
    def transform(self, data):
        """Impute ``data`` using the values already stored on this component.

        An ``Imputer`` is built from ``self.missing_impute`` and its ``transform`` is
        called with ``self.default_value`` and ``self.skip_cols``. Afterwards
        ``self.missing_impute`` is back-filled from the imputer if it was ``None``,
        and ``self.missing_impute_rate`` is set to the imputer's "transform" rate.

        Returns whatever ``Imputer.transform`` returns.
        """
        LOGGER.info("Enter Feature Imputation transform")

        processor = Imputer(self.missing_impute)
        result = processor.transform(data,
                                     transform_value=self.default_value,
                                     skip_cols=self.skip_cols)

        if self.missing_impute is None:
            self.missing_impute = processor.get_missing_value_list()
        self.missing_impute_rate = processor.get_impute_rate("transform")

        return result
Esempio n. 7
0
    def test_fit_max_float(self):
        """Fit a default-constructed Imputer with the "max" method and float output."""
        expected_fill_values = [
            0.963102, 1.467675, 0.829202, 0.772457, 1.000835,
            0.962702, 1.077099, 1.053586, 2.996525, 0.961696,
        ]

        fitted_table, fitted_fill_values = Imputer().fit(
            self.table_instance, "max", output_format='float')

        # Markers handed to the expected-data helper; the Imputer itself was built without any.
        missing_markers = ['', 'none', 'na', 'null']
        expected_rows = self.fit_test_data_float(
            self.test_data, expected_fill_values, missing_markers)

        self.assertListEqual(self.table_to_list(fitted_table), expected_rows)
        self.assertListEqual(fitted_fill_values, expected_fill_values)
Esempio n. 8
0
    def fill_missing_value(self, input_data_features, mode="fit"):
        """Impute missing values in ``input_data_features`` when ``self.missing_fill`` is set.

        Args:
            input_data_features: data handed to ``Imputer.fit`` / ``Imputer.transform``.
            mode: ``"fit"`` fits the imputer and stores the resulting fill values in
                ``self.default_value``; any other value applies the stored
                ``self.default_value`` through ``Imputer.transform``. The same value
                is forwarded to ``Imputer.get_impute_rate``.

        Returns:
            The imputed data, or ``input_data_features`` unchanged when
            ``self.missing_fill`` is falsy.
        """
        if not self.missing_fill:
            return input_data_features

        # Imported lazily, as in the original, so the dependency is only needed when filling.
        from federatedml.feature.imputer import Imputer

        imputer_processor = Imputer(self.missing_impute)
        if mode == "fit":
            input_data_features, self.default_value = imputer_processor.fit(
                input_data_features,
                replace_method=self.missing_fill_method,
                replace_value=self.default_value)
        else:
            input_data_features = imputer_processor.transform(
                input_data_features, transform_value=self.default_value)

        # Single back-fill for both modes (the fit branch used to repeat this check).
        if self.missing_impute is None:
            self.missing_impute = imputer_processor.get_missing_value_list()

        self.missing_impute_rate = imputer_processor.get_impute_rate(mode)

        return input_data_features
Esempio n. 9
0
    def replace_outlier_value(self, input_data_features, mode="fit"):
        """Replace outlier values in ``input_data_features`` when ``self.outlier_replace`` is set.

        In ``"fit"`` mode the imputer is fitted and its replacement values are stored in
        ``self.outlier_replace_value``; in any other mode the stored values are applied
        through ``Imputer.transform``. When ``self.outlier_replace`` is falsy the input
        is returned unchanged.
        """
        if not self.outlier_replace:
            return input_data_features

        from federatedml.feature.imputer import Imputer

        outlier_imputer = Imputer(self.outlier_impute)
        if mode != "fit":
            return outlier_imputer.transform(input_data_features,
                                             replace_method=self.outlier_replace_method,
                                             transform_value=self.outlier_replace_value)

        fitted = outlier_imputer.fit(input_data_features,
                                     replace_method=self.outlier_replace_method,
                                     replace_value=self.outlier_replace_value)
        input_data_features, self.outlier_replace_value = fitted

        # NOTE(review): relies on Imputer exposing get_imputer_value_list -- confirm
        # against the Imputer version this module is paired with.
        if self.outlier_impute is None:
            self.outlier_impute = outlier_imputer.get_imputer_value_list()

        return input_data_features
Esempio n. 10
0
    def test_get_impute_rate(self):
        """Check the per-column impute rate reported after ``fit`` and after ``transform``.

        The same table and missing markers are used for both steps, so both rates are
        compared against the same expected list.
        """
        imputer_value = ['', 'none', 'na', 'null', "10000", "-10000"]
        imputer = Imputer(missing_value_list=imputer_value)
        cols_impute_rate_ground_true = [0, 0.3, 0.1, 0.1, 0.1, 0.1, 0, 0.1, 0, 0]

        imputer.fit(self.table_instance, "median", output_format='str')
        cols_fit_impute_rate = imputer.get_impute_rate(mode="fit")
        self.assertListEqual(cols_fit_impute_rate, cols_impute_rate_ground_true)

        cols_transform_value_ground_true = [-0.606584, -0.193332, -0.620475, -0.591332, -0.327392, -0.519504, -0.610669,
                                            -0.768581, -0.28757, -0.247477]
        imputer.transform(self.table_instance, cols_transform_value_ground_true)
        # Read the rate recorded by transform(); asking for "fit" again would only
        # re-check the value already asserted above.
        cols_transform_impute_rate = imputer.get_impute_rate(mode="transform")
        self.assertListEqual(cols_transform_impute_rate, cols_impute_rate_ground_true)
Esempio n. 11
0
    def fill_missing_value(self, input_data, tags_dict, mode="fit"):
        """Impute ``input_data`` by round-tripping its values through ``Imputer``.

        Steps, in order:
          1. map every value with ``self.change_tag_to_str`` and attach a schema built
             from ``self.header``, ``self.sid_name`` and ``self.label_name``;
          2. run ``Imputer.fit`` (``mode == "fit"``, which also updates
             ``self.default_value``) or ``Imputer.transform`` (any other mode);
          3. record ``self.missing_impute`` (only if it was ``None``) and
             ``self.missing_impute_rate``;
          4. map every value back with ``self.change_str_to_tag``.

        Returns the table produced by the final ``mapValues`` call.
        """
        to_str = functools.partial(self.change_tag_to_str,
                                   tags_dict=tags_dict,
                                   delimitor=self.delimitor,
                                   with_label=self.with_label,
                                   tag_value_delimitor=self.tag_value_delimitor)
        input_data = input_data.mapValues(to_str)
        set_schema(input_data, make_schema(self.header, self.sid_name, self.label_name))

        from federatedml.feature.imputer import Imputer

        imputer_processor = Imputer()
        if mode != "fit":
            imputed = imputer_processor.transform(input_data,
                                                  transform_value=self.default_value)
        else:
            imputed, self.default_value = imputer_processor.fit(
                input_data,
                replace_method=self.missing_fill_method,
                replace_value=self.default_value)
            LOGGER.debug("self.default_value is {}".format(self.default_value))

        if self.missing_impute is None:
            self.missing_impute = imputer_processor.get_missing_value_list()
        LOGGER.debug("self.missing_impute is {}".format(self.missing_impute))

        self.missing_impute_rate = imputer_processor.get_impute_rate(mode)

        to_tag = functools.partial(self.change_str_to_tag,
                                   tags_dict=tags_dict,
                                   delimitor=self.delimitor,
                                   tag_value_delimitor=self.tag_value_delimitor)
        return imputed.mapValues(to_tag)
Esempio n. 12
0
    def fit(self, data):
        """Fit the imputer on ``data`` and store the fitted state on this component.

        Every key of ``self.col_missing_fill_method`` (when set) must appear in the
        header read from ``data``. After fitting, ``default_value``, ``missing_impute``
        (only if it was ``None``), ``missing_impute_rate``, ``cols_replace_method`` and
        ``skip_cols`` are updated from the imputer, and the summary is refreshed.

        Returns:
            The imputed data returned by ``Imputer.fit``.

        Raises:
            ValueError: if a key of ``self.col_missing_fill_method`` is not in the header.
        """
        LOGGER.info("Enter Feature Imputation fit")
        processor = Imputer(self.missing_impute)
        self.header = get_header(data)

        per_col_methods = self.col_missing_fill_method
        if per_col_methods:
            for col_name in per_col_methods:
                if col_name in self.header:
                    continue
                raise ValueError(
                    f"{col_name} not found in data header. Please check col_missing_fill_method keys."
                )

        fit_kwargs = dict(replace_method=self.missing_fill_method,
                          replace_value=self.default_value,
                          col_replace_method=per_col_methods)
        imputed_data, self.default_value = processor.fit(data, **fit_kwargs)

        if self.missing_impute is None:
            self.missing_impute = processor.get_missing_value_list()
        self.missing_impute_rate = processor.get_impute_rate("fit")
        self.cols_replace_method = processor.cols_replace_method
        self.skip_cols = processor.get_skip_cols()
        self.set_summary(self.get_summary())

        return imputed_data