コード例 #1
0
 def setUp(self):
     """Load the feature-binning JSON config used by the test cases."""
     self.param = FeatureBinningParam()
     config_file = home_dir + '/param_feature_binning.json'
     self.config_path = config_file
     with open(config_file, 'r', encoding='utf-8') as fin:
         self.config_json = json.load(fin)
コード例 #2
0
ファイル: homo_split_points.py プロジェクト: zpskt/FATE
    def average_run(self, data_instances, bin_num=10, abnormal_list=None):
        """Fit quantile split points locally, then average them across parties.

        Parameters
        ----------
        data_instances : table of Instance
            Input data whose features are to be binned.
        bin_num : int, default 10
            Bin count used only when no ``bin_param`` has been set yet.
        abnormal_list : list, optional
            Values to be treated as abnormal and excluded from binning.

        Returns
        -------
        dict
            Mapping of column name -> list of aggregated split points.
        """
        # Lazily create and cache the binning parameter on first use.
        if self.bin_param is None:
            bin_param = FeatureBinningParam(bin_num=bin_num)
            self.bin_param = bin_param
        else:
            bin_param = self.bin_param

        if self.bin_method == consts.QUANTILE:
            bin_obj = QuantileBinning(params=bin_param,
                                      abnormal_list=abnormal_list,
                                      allow_duplicate=True)
        else:
            # FIX: the error message text was garbled ("H**o"); restored.
            raise ValueError(
                "Homo Split Point do not accept bin_method: {}".format(
                    self.bin_method))

        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)

        # Local fit, then federated averaging: send local points, receive
        # the aggregated model, and unbox it back to plain lists.
        split_points = bin_obj.fit_split_points(data_instances)
        split_points = {k: np.array(v) for k, v in split_points.items()}
        split_points_weights = DictWeights(d=split_points)

        self.aggregator.send_model(split_points_weights, self.suffix)
        dict_split_points = self.aggregator.get_aggregated_model(self.suffix)
        split_points = {
            k: list(v)
            for k, v in dict_split_points.unboxed.items()
        }
        self.bin_obj = bin_obj
        return split_points
コード例 #3
0
 def convert_feature_to_bin(self, data_instance):
     """Quantile-bin every feature of *data_instance* and cache the results."""
     LOGGER.info("convert feature to bins")
     binning = QuantileBinning(FeatureBinningParam(bin_num=self.bin_num))
     binning.fit_split_points(data_instance)
     bin_rs = binning.convert_feature_to_bin(data_instance)
     self.data_bin, self.bin_split_points, self.bin_sparse_points = bin_rs
コード例 #4
0
 def test_directly_extract(self):
     """Config values should parse straight into a FeatureBinningParam."""
     extractor = ParamExtract()
     parsed = extractor.parse_param_from_config(FeatureBinningParam(),
                                                self.config_json)
     self.assertTrue("quantile" == parsed.method)
     self.assertTrue('bin_num' == parsed.transform_param.transform_type)
コード例 #5
0
ファイル: quantile_test.py プロジェクト: pangzx1/FATE1.1
 def test_new_sparse_quantile(self):
     """Binning a sparse table must preserve each row's sparse-entry count."""
     binning_obj = QuantileBinning(FeatureBinningParam(bin_num=4))
     binning_obj.fit_split_points(self.sparse_table)
     data_bin, _, _ = binning_obj.convert_feature_to_bin(self.sparse_table)
     bin_result = {key: inst.features for key, inst in data_bin.collect()}
     for idx in range(20):
         expected_len = len(self.sparse_inst[idx][1].features.sparse_vec)
         self.assertTrue(expected_len == len(bin_result[idx].sparse_vec))
コード例 #6
0
 def _get_quantile_median(self):
     """Return {column: median} via 2-bin quantile binning.

     With bin_num=2 the single split point of each column is its median.
     """
     bin_param = FeatureBinningParam(bin_num=2, cols=self._get_cols_index())
     binning_obj = QuantileBinning(bin_param, abnormal_list=self.abnormal_list)
     split_points = binning_obj.fit_split_points(self.data_instances)
     return {name: points[0] for name, points in split_points.items()}
コード例 #7
0
    def _bin_obj_generator(self, abnormal_list: list = None, this_bin_num=bin_num):
        """Build a QuantileBinning object using the project default thresholds."""
        params = FeatureBinningParam(method='quantile',
                                     compress_thres=consts.DEFAULT_COMPRESS_THRESHOLD,
                                     head_size=consts.DEFAULT_HEAD_SIZE,
                                     error=consts.DEFAULT_RELATIVE_ERROR,
                                     bin_indexes=-1,
                                     bin_num=this_bin_num)
        return QuantileBinning(params, abnormal_list=abnormal_list)
コード例 #8
0
    def convert_feature_to_bin(self, data_instance):
        """Quantile-bin all features; when missing values are enabled, treat
        None as an abnormal value so it gets its own handling."""
        LOGGER.info("convert feature to bins")
        param_obj = FeatureBinningParam(bin_num=self.bin_num)
        if self.use_missing:
            binning_obj = QuantileBinning(param_obj, abnormal_list=[NoneType()])
        else:
            binning_obj = QuantileBinning(param_obj)
        binning_obj.fit_split_points(data_instance)
        bin_rs = binning_obj.convert_feature_to_bin(data_instance)
        self.data_bin, self.bin_split_points, self.bin_sparse_points = bin_rs
コード例 #9
0
    def _bin_obj_generator(self):
        """Build a QuantileBinning object from the module-level binning settings."""
        config = FeatureBinningParam(method='quantile',
                                     compress_thres=compress_thres,
                                     head_size=head_size,
                                     error=error,
                                     cols=-1,
                                     bin_num=bin_num)
        return QuantileBinning(config)
コード例 #10
0
 def __init__(self,
              bin_nums=consts.G_BIN_NUM,
              param_obj: FeatureBinningParam = None,
              abnormal_list=None,
              allow_duplicate=False):
     """Initialize the binning object; when no param object is supplied,
     create one with *bin_nums* bins."""
     if param_obj is None:
         param_obj = FeatureBinningParam(bin_num=bin_nums)
     super().__init__(params=param_obj, abnormal_list=abnormal_list,
                      allow_duplicate=allow_duplicate)
コード例 #11
0
    def __init__(self):
        """Initialize shared state for hetero feature binning."""
        super(BaseHeteroFeatureBinning, self).__init__()
        self.transfer_variable = HeteroFeatureBinningTransferVariable()
        # Fitted binning object; None until fit is run.
        self.binning_obj = None
        self.header = None
        self.schema = None
        # Binning results received from host parties.
        self.host_results = []
        self.transform_type = None

        self.model_param = FeatureBinningParam()
        self.bin_inner_param = BinInnerParam()
コード例 #12
0
 def test_bucket_binning(self):
     """Bucket binning must produce evenly spaced split points on self.table."""
     bin_param = FeatureBinningParam(bin_num=self.bin_num, cols=self.cols)
     bucket_bin = BucketBinning(bin_param)
     split_points = bucket_bin.fit_split_points(self.table)
     first_col_points = list(split_points.values())[0]
     for kth, s_p in enumerate(first_col_points):
         self.assertEqual(s_p, (self.data_num - 1) / self.bin_num * (kth + 1))
     iv_attrs = bucket_bin.cal_local_iv(self.table)
     for col_name, iv_attr in iv_attrs.items():
         print('col_name: {}, iv: {}, woe_array: {}'.format(
             col_name, iv_attr.iv, iv_attr.woe_array))
コード例 #13
0
ファイル: statics.py プロジェクト: sf-fl/federatedML
    def _static_quantile_summaries(self):
        """
        Build (once) and cache a fitted QuantileBinning so that specific
        quantile points can be queried later.
        """
        if self.binning_obj is not None:
            return self.binning_obj
        bin_param = FeatureBinningParam(bin_num=2, bin_indexes=self.cols_index,
                                        error=self.error)
        quantile_obj = QuantileBinning(bin_param,
                                       abnormal_list=self.abnormal_list)
        self.binning_obj = quantile_obj
        self.binning_obj.fit_split_points(self.data_instances)

        return self.binning_obj
コード例 #14
0
ファイル: homo_split_points.py プロジェクト: zpskt/FATE
    def fit(self, data_instances):
        """Fit quantile split points once; repeated calls are no-ops."""
        if self.bin_obj is None:
            if self.bin_param is None:
                self.bin_param = FeatureBinningParam()
            self.bin_obj = QuantileBinning(params=self.bin_param,
                                           abnormal_list=self.abnormal_list,
                                           allow_duplicate=True)
            self.bin_obj.fit_split_points(data_instances)
        return self
コード例 #15
0
 def test_bucket_binning(self):
     """Bucket binning yields uniform split points and a known IV value."""
     bin_param = FeatureBinningParam(bin_num=self.bin_num,
                                     bin_indexes=self.cols)
     bucket_bin = BucketBinning(bin_param)
     split_points = bucket_bin.fit_split_points(self.table)
     first_col_points = list(split_points.values())[0]
     for kth, s_p in enumerate(first_col_points):
         self.assertEqual(s_p, (self.data_num - 1) / self.bin_num * (kth + 1))
     bucket_bin.cal_local_iv(self.table)
     for col_name, iv_attr in bucket_bin.bin_results.all_cols_results.items():
         # Expected IV for the fixture data.
         assert abs(iv_attr.iv - 0.00364386529386804) < 1e-6
コード例 #16
0
ファイル: base_feature_binning.py プロジェクト: pangzx1/FL1.0
 def __init__(self):
     """Initialize shared state for hetero feature binning (older API)."""
     super(BaseHeteroFeatureBinning, self).__init__()
     self.transfer_variable = HeteroFeatureBinningTransferVariable()
     self.cols = None
     self.cols_dict = {}
     # Fitted binning object; None until fit is run.
     self.binning_obj = None
     self.header = []
     self.schema = {}
     self.has_synchronized = False
     self.flowid = ''
     self.binning_result = {}  # dict of iv_attr
     self.host_results = {}  # dict of host results
     self.party_name = 'Base'
     self.model_param = FeatureBinningParam()
コード例 #17
0
ファイル: base_feature_binning.py プロジェクト: yubo1993/FATE
    def __init__(self):
        """Initialize shared state for feature binning (multi-class capable)."""
        super(BaseFeatureBinning, self).__init__()
        self.transfer_variable = HeteroFeatureBinningTransferVariable()
        # Fitted binning object; None until fit is run.
        self.binning_obj: BaseBinning = None
        self.header = None
        self.header_anonymous = None
        self.schema = None
        # Binning results received from host parties.
        self.host_results = []
        self.transform_type = None

        self.model_param = FeatureBinningParam()
        self.bin_inner_param = BinInnerParam()
        # Default to binary labels; replaced when real labels are known.
        self.bin_result = MultiClassBinResult(labels=[0, 1])
        self.has_missing_value = False
        self.labels = []
コード例 #18
0
ファイル: boosting.py プロジェクト: zeta1999/FATE
    def convert_feature_to_bin(self, data_instance, handle_missing_value=False):
        """
        Bin each feature of *data_instance* with the configured binning class;
        when *handle_missing_value* is set, None is treated as an abnormal
        value. Returns the binned data, split points and sparse points.
        """
        LOGGER.info("convert feature to bins")
        param_obj = FeatureBinningParam(bin_num=self.bin_num,
                                        error=self.binning_error)
        if handle_missing_value:
            self.binning_obj = self.binning_class(param_obj,
                                                  abnormal_list=[NoneType()])
        else:
            self.binning_obj = self.binning_class(param_obj)
        self.binning_obj.fit_split_points(data_instance)
        LOGGER.info("convert feature to bins over")
        return self.binning_obj.convert_feature_to_bin(data_instance)
コード例 #19
0
    def federated_binning(self, data_instance):
        """Run federated (averaged) quantile binning, then bin the input."""
        self.binning_obj.bin_param = FeatureBinningParam(
            bin_num=self.bin_num, error=self.binning_error)

        if self.use_missing:
            binning_result = self.binning_obj.average_run(
                data_instances=data_instance, abnormal_list=[NoneType()])
        else:
            binning_result = self.binning_obj.average_run(
                data_instances=data_instance)

        return self.binning_obj.convert_feature_to_bin(data_instance,
                                                       binning_result)
コード例 #20
0
ファイル: quantile_test.py プロジェクト: pangzx1/FATE1.1
    def test_new_dense_quantile(self):
        """Dense-table quantile binning: verify bin indices and split points."""
        binning_obj = QuantileBinning(FeatureBinningParam(bin_num=4))
        binning_obj.fit_split_points(self.dense_table)
        data_bin, bin_splitpoints, bin_sparse = \
            binning_obj.convert_feature_to_bin(self.dense_table)
        bin_result = {key: inst.features for key, inst in data_bin.collect()}
        expected_points = np.asarray([3, 7, 11, 15], dtype='int')
        for i in range(100):
            # Rows cycle in groups of 4 through bins 0..3.
            expected_bin = np.ones(20, dtype='int') * ((i % 16) // 4)
            self.assertTrue((bin_result[i] == expected_bin).all())
            if i < 20:
                split_point = np.array(bin_splitpoints[i])
                self.assertTrue((split_point == expected_points).all())

        for split_points in bin_splitpoints:
            self.assertTrue(len(split_points) <= 4)
コード例 #21
0
ファイル: bucket_binning_test.py プロジェクト: pangzx1/FL1.0
 def test_bucket_binning(self):
     """Smoke test: fit bucket binning and print the resulting split points."""
     params = FeatureBinningParam(bin_num=self.bin_num, cols=self.cols)
     bucket_bin = BucketBinning(params)
     print(bucket_bin.fit_split_points(self.table))
コード例 #22
0
    def fit(self, expect_table, actual_table):
        """Compute PSI (population stability index) between two tables.

        *expect_table* is the baseline; *actual_table* is the table being
        compared against it. Both must have identical headers. The baseline
        is quantile-binned (with one extra bin reserved for missing values),
        the same bin edges are applied to the actual table, per-bin counts
        are gathered from each partition, and the count differences are
        folded into per-feature PSI scores.

        Side effects: populates self.data_bin1/2, self.bin_split_points,
        self.psi_rs, self.total_scores, self.str_intervals,
        self.interval_perc1/2 and the model summary.
        """

        LOGGER.info('start psi computing')

        header1 = expect_table.schema['header']
        header2 = actual_table.schema['header']

        # PSI is only meaningful over an identical feature set.
        if not set(header1) == set(header2):
            raise ValueError(
                'table header must be the same while computing psi values')

        # baseline table should not contain empty columns
        abnormal_detection.empty_column_detection(expect_table)

        self.all_feature_list = header1

        # make sure no duplicate features
        self.all_feature_list = self.check_duplicates(self.all_feature_list)

        # kv bi-directional mapping
        self.tag_id_mapping = {
            v: k
            for k, v in enumerate(self.all_feature_list)
        }
        self.id_tag_mapping = {
            k: v
            for k, v in enumerate(self.all_feature_list)
        }

        if not self.is_sparse(
                expect_table):  # convert missing value: nan to NoneType
            expect_table = self.convert_missing_val(expect_table)

        if not self.is_sparse(
                actual_table):  # convert missing value: nan to NoneType
            actual_table = self.convert_missing_val(actual_table)

        if not (self.check_table_content(expect_table)
                and self.check_table_content(actual_table)):
            raise ValueError(
                'contents of input table must be instances of class "Instance"'
            )

        # Fit quantile bins on the baseline only; local_only avoids any
        # federated synchronization, and NoneType is treated as abnormal.
        param = FeatureBinningParam(method=consts.QUANTILE,
                                    bin_num=self.max_bin_num,
                                    local_only=True,
                                    error=self.binning_error)
        binning_obj = QuantileBinning(params=param,
                                      abnormal_list=[NoneType()],
                                      allow_duplicate=False)
        binning_obj.fit_split_points(expect_table)

        data_bin, bin_split_points, bin_sparse_points = binning_obj.convert_feature_to_bin(
            expect_table)
        LOGGER.debug('bin split points is {}, shape is {}'.format(
            bin_split_points, bin_split_points.shape))
        self.binning_obj = binning_obj

        self.data_bin1 = data_bin
        self.bin_split_points = bin_split_points
        self.bin_sparse_points = bin_sparse_points
        LOGGER.debug('expect table binning done')

        # Count bin occupancy per partition for the baseline table.
        count_func1 = functools.partial(
            map_partition_handle,
            feat_num=len(self.all_feature_list),
            max_bin_num=self.max_bin_num +
            1,  # an additional bin for missing value
            missing_val=self.dense_missing_val,
            is_sparse=self.is_sparse(self.data_bin1))

        map_rs1 = self.data_bin1.applyPartitions(count_func1)
        count1 = count_rs_to_dict(map_rs1.reduce(map_partition_reduce))

        # Apply the SAME (baseline-fitted) bin edges to the actual table.
        data_bin2, bin_split_points2, bin_sparse_points2 = binning_obj.convert_feature_to_bin(
            actual_table)
        self.data_bin2 = data_bin2
        LOGGER.debug('actual table binning done')

        # Count bin occupancy per partition for the actual table.
        count_func2 = functools.partial(
            map_partition_handle,
            feat_num=len(self.all_feature_list),
            max_bin_num=self.max_bin_num +
            1,  # an additional bin for missing value
            missing_val=self.dense_missing_val,
            is_sparse=self.is_sparse(self.data_bin2))

        map_rs2 = self.data_bin2.applyPartitions(count_func2)
        count2 = count_rs_to_dict(map_rs2.reduce(map_partition_reduce))

        self.count1, self.count2 = count1, count2

        LOGGER.info('psi counting done')

        # compute psi from counting result
        psi_result = psi_computer(count1, count2, expect_table.count(),
                                  actual_table.count())
        self.psi_rs = psi_result

        # get total psi score of features
        total_scores = {}
        for idx, rs in enumerate(self.psi_rs):
            feat_name = self.id_tag_mapping[idx]
            total_scores[feat_name] = rs['total_psi']
        self.total_scores = total_scores

        # id-feature mapping convert, str interval computation
        self.str_intervals = self.get_string_interval(
            bin_split_points,
            self.id_tag_mapping,
            missing_bin_idx=self.max_bin_num)

        # Percentage views of the raw counts, for reporting.
        self.interval_perc1 = self.count_dict_to_percentage(
            copy.deepcopy(count1), expect_table.count())
        self.interval_perc2 = self.count_dict_to_percentage(
            copy.deepcopy(count2), actual_table.count())

        self.set_summary(self.generate_summary())
        LOGGER.info('psi computation done')