def setUp(self):
    """Prepare the test fixture: a default param object and the JSON config."""
    self.param = FeatureBinningParam()
    config_file = home_dir + '/param_feature_binning.json'
    self.config_path = config_file
    # Load the role configuration used by the extraction tests.
    with open(config_file, 'r', encoding='utf-8') as fin:
        self.config_json = json.load(fin)
def average_run(self, data_instances, bin_num=10, abnormal_list=None):
    """Compute federated split points for *data_instances*.

    Local quantile split points are sent to the aggregator and the
    aggregated (averaged) result is returned as {col_name: [points]}.
    """
    if self.bin_param is None:
        self.bin_param = FeatureBinningParam(bin_num=bin_num)
    params = self.bin_param

    if self.bin_method != consts.QUANTILE:
        raise ValueError(
            "H**o Split Point do not accept bin_method: {}".format(
                self.bin_method))
    binning = QuantileBinning(params=params,
                              abnormal_list=abnormal_list,
                              allow_duplicate=True)

    # Reject empty tables / empty feature sets before fitting.
    abnormal_detection.empty_table_detection(data_instances)
    abnormal_detection.empty_feature_detection(data_instances)

    local_points = binning.fit_split_points(data_instances)
    weights = DictWeights(
        d={name: np.array(points) for name, points in local_points.items()})
    self.aggregator.send_model(weights, self.suffix)
    aggregated = self.aggregator.get_aggregated_model(self.suffix)

    self.bin_obj = binning
    return {name: list(points)
            for name, points in aggregated.unboxed.items()}
def convert_feature_to_bin(self, data_instance):
    """Quantile-bin every feature of *data_instance* and store the results."""
    LOGGER.info("convert feature to bins")
    binning = QuantileBinning(FeatureBinningParam(bin_num=self.bin_num))
    binning.fit_split_points(data_instance)
    (self.data_bin,
     self.bin_split_points,
     self.bin_sparse_points) = binning.convert_feature_to_bin(data_instance)
def test_directly_extract(self):
    """Parsing the JSON config should populate the binning param object."""
    extractor = ParamExtract()
    parsed = extractor.parse_param_from_config(FeatureBinningParam(),
                                               self.config_json)
    self.assertTrue(parsed.method == "quantile")
    self.assertTrue(parsed.transform_param.transform_type == 'bin_num')
def test_new_sparse_quantile(self):
    """Binning a sparse table must preserve each row's sparsity pattern."""
    binning = QuantileBinning(FeatureBinningParam(bin_num=4))
    binning.fit_split_points(self.sparse_table)
    data_bin, _splits, _sparse = binning.convert_feature_to_bin(self.sparse_table)
    binned = {key: inst.features for key, inst in data_bin.collect()}
    for idx in range(20):
        original_len = len(self.sparse_inst[idx][1].features.sparse_vec)
        self.assertTrue(original_len == len(binned[idx].sparse_vec))
def _get_quantile_median(self):
    """Return {col_name: median}.

    With bin_num=2, the single quantile split point of each column is
    exactly its median.
    """
    cols_index = self._get_cols_index()
    binning = QuantileBinning(
        FeatureBinningParam(bin_num=2, cols=cols_index),
        abnormal_list=self.abnormal_list)
    split_points = binning.fit_split_points(self.data_instances)
    return {name: points[0] for name, points in split_points.items()}
def _bin_obj_generator(self, abnormal_list: list = None, this_bin_num=bin_num):
    """Build a QuantileBinning object using the project's default thresholds."""
    params = FeatureBinningParam(
        method='quantile',
        compress_thres=consts.DEFAULT_COMPRESS_THRESHOLD,
        head_size=consts.DEFAULT_HEAD_SIZE,
        error=consts.DEFAULT_RELATIVE_ERROR,
        bin_indexes=-1,
        bin_num=this_bin_num)
    return QuantileBinning(params, abnormal_list=abnormal_list)
def convert_feature_to_bin(self, data_instance):
    """Quantile-bin all features; NoneType marks missing values when enabled."""
    LOGGER.info("convert feature to bins")
    param_obj = FeatureBinningParam(bin_num=self.bin_num)
    binning = (QuantileBinning(param_obj, abnormal_list=[NoneType()])
               if self.use_missing
               else QuantileBinning(param_obj))
    binning.fit_split_points(data_instance)
    (self.data_bin,
     self.bin_split_points,
     self.bin_sparse_points) = binning.convert_feature_to_bin(data_instance)
def _bin_obj_generator(self):
    """Build a QuantileBinning object from the module-level test settings."""
    params = FeatureBinningParam(
        method='quantile',
        compress_thres=compress_thres,
        head_size=head_size,
        error=error,
        cols=-1,
        bin_num=bin_num)
    return QuantileBinning(params)
def __init__(self, bin_nums=consts.G_BIN_NUM, param_obj: FeatureBinningParam = None,
             abnormal_list=None, allow_duplicate=False):
    """Construct the binning object; a default param is built when none is given."""
    effective_param = (FeatureBinningParam(bin_num=bin_nums)
                       if param_obj is None else param_obj)
    super().__init__(params=effective_param,
                     abnormal_list=abnormal_list,
                     allow_duplicate=allow_duplicate)
def __init__(self):
    """Initialize shared hetero-binning state with empty/default values."""
    super(BaseHeteroFeatureBinning, self).__init__()
    self.transfer_variable = HeteroFeatureBinningTransferVariable()
    # Populated during fit/transform.
    self.binning_obj = None
    self.header = None
    self.schema = None
    self.host_results = []
    self.transform_type = None
    # Default parameters; overwritten when the component is configured.
    self.model_param = FeatureBinningParam()
    self.bin_inner_param = BinInnerParam()
def test_bucket_binning(self):
    """Bucket binning should yield evenly spaced split points."""
    bucket = BucketBinning(
        FeatureBinningParam(bin_num=self.bin_num, cols=self.cols))
    split_points = bucket.fit_split_points(self.table)
    first_col_points = list(split_points.values())[0]
    width = (self.data_num - 1) / self.bin_num
    for kth, point in enumerate(first_col_points):
        self.assertEqual(point, width * (kth + 1))
    iv_attrs = bucket.cal_local_iv(self.table)
    for col_name, iv_attr in iv_attrs.items():
        print('col_name: {}, iv: {}, woe_array: {}'.format(
            col_name, iv_attr.iv, iv_attr.woe_array))
def _static_quantile_summaries(self):
    """Build (or reuse) a fitted QuantileBinning so quantile points can be queried."""
    if self.binning_obj is None:
        params = FeatureBinningParam(bin_num=2,
                                     bin_indexes=self.cols_index,
                                     error=self.error)
        self.binning_obj = QuantileBinning(params,
                                           abnormal_list=self.abnormal_list)
        self.binning_obj.fit_split_points(self.data_instances)
    return self.binning_obj
def fit(self, data_instances):
    """Fit quantile split points once; repeated calls are no-ops."""
    if self.bin_obj is not None:
        return self
    if self.bin_param is None:
        self.bin_param = FeatureBinningParam()
    self.bin_obj = QuantileBinning(params=self.bin_param,
                                   abnormal_list=self.abnormal_list,
                                   allow_duplicate=True)
    self.bin_obj.fit_split_points(data_instances)
    return self
def test_bucket_binning(self):
    """Split points are uniform and the iv matches the known fixture value."""
    bucket = BucketBinning(
        FeatureBinningParam(bin_num=self.bin_num, bin_indexes=self.cols))
    split_points = bucket.fit_split_points(self.table)
    first_col_points = list(split_points.values())[0]
    width = (self.data_num - 1) / self.bin_num
    for kth, point in enumerate(first_col_points):
        self.assertEqual(point, width * (kth + 1))
    bucket.cal_local_iv(self.table)
    for _col_name, iv_attr in bucket.bin_results.all_cols_results.items():
        # Regression value computed for this fixed test fixture.
        assert abs(iv_attr.iv - 0.00364386529386804) < 1e-6
def __init__(self):
    """Initialize hetero-binning base state with empty defaults."""
    super(BaseHeteroFeatureBinning, self).__init__()
    self.transfer_variable = HeteroFeatureBinningTransferVariable()
    # Column selection state.
    self.cols = None
    self.cols_dict = {}
    # Binning state, populated during fit.
    self.binning_obj = None
    self.header = []
    self.schema = {}
    self.has_synchronized = False
    self.flowid = ''
    self.binning_result = {}  # dict of iv_attr
    self.host_results = {}  # dict of host results
    self.party_name = 'Base'
    self.model_param = FeatureBinningParam()
def __init__(self):
    """Initialize shared feature-binning state with empty/default values."""
    super(BaseFeatureBinning, self).__init__()
    self.transfer_variable = HeteroFeatureBinningTransferVariable()
    # Fitted binning object; assigned during fit.
    self.binning_obj: BaseBinning = None
    self.header = None
    self.header_anonymous = None
    self.schema = None
    self.host_results = []
    self.transform_type = None
    self.model_param = FeatureBinningParam()
    self.bin_inner_param = BinInnerParam()
    # Binary labels by default; replaced for multi-class problems.
    self.bin_result = MultiClassBinResult(labels=[0, 1])
    self.has_missing_value = False
    self.labels = []
def convert_feature_to_bin(self, data_instance, handle_missing_value=False):
    """Fit split points and map every feature value to its bin index."""
    LOGGER.info("convert feature to bins")
    param_obj = FeatureBinningParam(bin_num=self.bin_num,
                                    error=self.binning_error)
    if handle_missing_value:
        # NoneType entries are treated as an abnormal (missing) category.
        self.binning_obj = self.binning_class(param_obj,
                                              abnormal_list=[NoneType()],)
    else:
        self.binning_obj = self.binning_class(param_obj)
    self.binning_obj.fit_split_points(data_instance)
    LOGGER.info("convert feature to bins over")
    return self.binning_obj.convert_feature_to_bin(data_instance)
def federated_binning(self, data_instance):
    """Run federated quantile binning, then bin the local data with the result."""
    self.binning_obj.bin_param = FeatureBinningParam(
        bin_num=self.bin_num, error=self.binning_error)
    if self.use_missing:
        # NoneType entries are treated as missing values.
        binning_result = self.binning_obj.average_run(
            data_instances=data_instance, abnormal_list=[NoneType()])
    else:
        binning_result = self.binning_obj.average_run(
            data_instances=data_instance, )
    return self.binning_obj.convert_feature_to_bin(data_instance,
                                                   binning_result)
def test_new_dense_quantile(self):
    """Dense data: bin indices and split points match the known pattern."""
    binning = QuantileBinning(FeatureBinningParam(bin_num=4))
    binning.fit_split_points(self.dense_table)
    data_bin, bin_splitpoints, _sparse = binning.convert_feature_to_bin(
        self.dense_table)
    binned = {key: inst.features for key, inst in data_bin.collect()}
    expected_splits = np.asarray([3, 7, 11, 15], dtype='int')
    for i in range(100):
        # Every feature of row i falls in bin (i % 16) // 4.
        self.assertTrue(
            (binned[i] == np.ones(20, dtype='int') * ((i % 16) // 4)).all())
        if i < 20:
            actual_splits = np.array(bin_splitpoints[i])
            self.assertTrue((actual_splits == expected_splits).all())
    for split_points in bin_splitpoints:
        self.assertTrue(len(split_points) <= 4)
def test_bucket_binning(self):
    """Smoke test: bucket binning runs and its split points are printed."""
    bucket = BucketBinning(
        FeatureBinningParam(bin_num=self.bin_num, cols=self.cols))
    print(bucket.fit_split_points(self.table))
def fit(self, expect_table, actual_table):
    """Compute PSI between a baseline table and an actual table.

    Pipeline: validate headers, bin the baseline with quantile binning,
    bin the actual table with the SAME split points, count per-bin
    occupancy for both, then derive PSI scores, string intervals, and
    percentage distributions as instance attributes / summary.
    """
    LOGGER.info('start psi computing')
    # Both tables must describe the same feature set.
    header1 = expect_table.schema['header']
    header2 = actual_table.schema['header']
    if not set(header1) == set(header2):
        raise ValueError(
            'table header must be the same while computing psi values')

    # baseline table should not contain empty columns
    abnormal_detection.empty_column_detection(expect_table)

    self.all_feature_list = header1

    # make sure no duplicate features
    self.all_feature_list = self.check_duplicates(self.all_feature_list)

    # kv bi-directional mapping
    self.tag_id_mapping = {
        v: k for k, v in enumerate(self.all_feature_list)
    }
    self.id_tag_mapping = {
        k: v for k, v in enumerate(self.all_feature_list)
    }

    if not self.is_sparse(
            expect_table):  # convert missing value: nan to NoneType
        expect_table = self.convert_missing_val(expect_table)

    if not self.is_sparse(
            actual_table):  # convert missing value: nan to NoneType
        actual_table = self.convert_missing_val(actual_table)

    if not (self.check_table_content(expect_table)
            and self.check_table_content(actual_table)):
        raise ValueError(
            'contents of input table must be instances of class "Instance"'
        )

    # Fit quantile split points on the BASELINE only; local_only keeps
    # the binning from involving other parties.
    param = FeatureBinningParam(method=consts.QUANTILE,
                                bin_num=self.max_bin_num,
                                local_only=True,
                                error=self.binning_error)
    binning_obj = QuantileBinning(params=param,
                                  abnormal_list=[NoneType()],
                                  allow_duplicate=False)
    binning_obj.fit_split_points(expect_table)

    data_bin, bin_split_points, bin_sparse_points = binning_obj.convert_feature_to_bin(
        expect_table)
    LOGGER.debug('bin split points is {}, shape is {}'.format(
        bin_split_points, bin_split_points.shape))
    self.binning_obj = binning_obj
    self.data_bin1 = data_bin
    self.bin_split_points = bin_split_points
    self.bin_sparse_points = bin_sparse_points
    LOGGER.debug('expect table binning done')

    # Count per-bin occupancy of the baseline, partition by partition.
    count_func1 = functools.partial(
        map_partition_handle,
        feat_num=len(self.all_feature_list),
        max_bin_num=self.max_bin_num + 1,  # an additional bin for missing value
        missing_val=self.dense_missing_val,
        is_sparse=self.is_sparse(self.data_bin1))

    map_rs1 = self.data_bin1.applyPartitions(count_func1)
    count1 = count_rs_to_dict(map_rs1.reduce(map_partition_reduce))

    # Bin the actual table with the baseline's split points so both
    # counts share the same bin boundaries.
    data_bin2, bin_split_points2, bin_sparse_points2 = binning_obj.convert_feature_to_bin(
        actual_table)
    self.data_bin2 = data_bin2
    LOGGER.debug('actual table binning done')

    count_func2 = functools.partial(
        map_partition_handle,
        feat_num=len(self.all_feature_list),
        max_bin_num=self.max_bin_num + 1,  # an additional bin for missing value
        missing_val=self.dense_missing_val,
        is_sparse=self.is_sparse(self.data_bin2))

    map_rs2 = self.data_bin2.applyPartitions(count_func2)
    count2 = count_rs_to_dict(map_rs2.reduce(map_partition_reduce))

    self.count1, self.count2 = count1, count2
    LOGGER.info('psi counting done')

    # compute psi from counting result
    psi_result = psi_computer(count1, count2, expect_table.count(),
                              actual_table.count())
    self.psi_rs = psi_result

    # get total psi score of features
    total_scores = {}
    for idx, rs in enumerate(self.psi_rs):
        feat_name = self.id_tag_mapping[idx]
        total_scores[feat_name] = rs['total_psi']
    self.total_scores = total_scores

    # id-feature mapping convert, str interval computation
    self.str_intervals = self.get_string_interval(
        bin_split_points,
        self.id_tag_mapping,
        missing_bin_idx=self.max_bin_num)

    self.interval_perc1 = self.count_dict_to_percentage(
        copy.deepcopy(count1), expect_table.count())
    self.interval_perc2 = self.count_dict_to_percentage(
        copy.deepcopy(count2), actual_table.count())

    self.set_summary(self.generate_summary())
    LOGGER.info('psi computation done')