def average_run(self, data_instances, bin_num=10, abnormal_list=None):
    """Fit local quantile split points, then average them across parties.

    Sends the local split points to the secure aggregator and returns the
    aggregated (averaged) ones. Caches the binning param and object on self.
    """
    # Reuse a previously supplied param object, otherwise build a default one.
    bin_param = self.bin_param
    if bin_param is None:
        bin_param = FeatureBinningParam(bin_num=bin_num)
        self.bin_param = bin_param

    if self.bin_method != consts.QUANTILE:
        raise ValueError(
            "H**o Split Point do not accept bin_method: {}".format(
                self.bin_method))
    bin_obj = QuantileBinning(params=bin_param,
                              abnormal_list=abnormal_list,
                              allow_duplicate=True)

    # Reject empty tables / empty feature columns before fitting.
    abnormal_detection.empty_table_detection(data_instances)
    abnormal_detection.empty_feature_detection(data_instances)

    local_points = bin_obj.fit_split_points(data_instances)
    weights = DictWeights(d={name: np.array(points)
                             for name, points in local_points.items()})
    self.aggregator.send_model(weights, self.suffix)

    aggregated = self.aggregator.get_aggregated_model(self.suffix)
    averaged_points = {name: list(points)
                       for name, points in aggregated.unboxed.items()}

    self.bin_obj = bin_obj
    return averaged_points
def _init_model(self, params: FeatureBinningParam):
    """Select and configure the binning backend from the given parameters."""
    self.model_param = params
    self.transform_type = params.transform_param.transform_type

    # WOE requires labels, which host parties do not hold.
    if self.role == consts.HOST and self.transform_type == "woe":
        raise ValueError("Host party do not support woe transform now.")

    method = params.method
    if method == consts.QUANTILE:
        self.binning_obj = QuantileBinning(params)
    elif method == consts.BUCKET:
        self.binning_obj = BucketBinning(params)
    elif method == consts.OPTIMAL:
        if self.role == consts.HOST:
            # Hosts approximate optimal binning with a quantile pre-split.
            params.bin_num = params.optimal_binning_param.init_bin_nums
            self.binning_obj = QuantileBinning(params)
        else:
            self.binning_obj = OptimalBinning(params)
    else:
        raise ValueError("Binning method: {} is not supported yet".format(
            self.model_param.method))

    LOGGER.debug("in _init_model, role: {}, local_partyid: {}".format(
        self.role, self.component_properties))
    self.binning_obj.set_role_party(
        self.role, self.component_properties.local_partyid)
def test_quantile_binning(self):
    """Benchmark QuantileBinning against a collect-then-numpy percentile pass."""
    error = 0.01
    # NOTE(review): data_num cancels out here, so this is just int(1 / error);
    # presumably int(self.data_num * error) (or similar) was intended — confirm.
    compress_thres = int(self.data_num / (self.data_num * error))
    head_size = 5000
    bin_num = 10
    # Integer percentiles at the interior bin boundaries (10, 20, ..., 90).
    bin_percent = [int(i * (100.0 / bin_num)) for i in range(1, bin_num)]
    bin_param = FeatureBinningParam(method='quantile',
                                    compress_thres=compress_thres,
                                    head_size=head_size,
                                    error=error,
                                    cols=self.cols,
                                    bin_num=bin_num)
    quan_bin = QuantileBinning(bin_param)
    t0 = time.time()
    split_points = quan_bin.fit_split_points(self.table)
    t1 = time.time()
    print('Spend time: {}'.format(t1 - t0))

    # collect and test numpy quantile speed
    local_table = self.table.collect()
    total_data = []
    for _, data_inst in local_table:
        total_data.append(data_inst.features)
    total_data = np.array(total_data)
    for col in self.cols:
        col_idx = self.col_dict.get(col)
        x = total_data[:, col_idx]
        # Result intentionally unused: this loop only measures numpy's speed.
        sk = np.percentile(x, bin_percent, interpolation="midpoint")
    t2 = time.time()
    print('collect and use numpy time: {}'.format(t2 - t1))
def _init_model(self, params: FeatureBinningParam):
    """Choose the binning backend from *params* and build the IV calculator."""
    self.model_param = params
    self.transform_type = params.transform_param.transform_type

    # WOE requires labels, which host parties do not hold.
    if self.role == consts.HOST and self.transform_type == "woe":
        raise ValueError("Host party do not support woe transform now.")

    method = params.method
    if method == consts.QUANTILE:
        self.binning_obj = QuantileBinning(params)
    elif method == consts.BUCKET:
        self.binning_obj = BucketBinning(params)
    elif method == consts.OPTIMAL:
        if self.role == consts.HOST:
            # Hosts approximate optimal binning with a quantile pre-split.
            params.bin_num = params.optimal_binning_param.init_bin_nums
            self.binning_obj = QuantileBinning(params)
        else:
            self.binning_obj = OptimalBinning(params)
    else:
        raise ValueError("Binning method: {} is not supported yet".format(
            self.model_param.method))

    self.iv_calculator = IvCalculator(
        params.adjustment_factor,
        role=self.role,
        party_id=self.component_properties.local_partyid)
def convert_feature_to_bin(self, data_instance):
    """Quantile-bin every feature of *data_instance* into self.bin_num buckets."""
    LOGGER.info("convert feature to bins")
    quantile_binner = QuantileBinning(FeatureBinningParam(bin_num=self.bin_num))
    quantile_binner.fit_split_points(data_instance)
    # Cache binned data, per-feature split points and sparse-point info.
    binned = quantile_binner.convert_feature_to_bin(data_instance)
    self.data_bin, self.bin_split_points, self.bin_sparse_points = binned
def test_new_sparse_quantile(self):
    """Binning a sparse table must preserve each instance's sparsity-pattern size."""
    binner = QuantileBinning(FeatureBinningParam(bin_num=4))
    binner.fit_split_points(self.sparse_table)
    data_bin, _split_points, _sparse = binner.convert_feature_to_bin(self.sparse_table)
    binned_features = {key: inst.features for key, inst in data_bin.collect()}
    for idx in range(20):
        original_len = len(self.sparse_inst[idx][1].features.sparse_vec)
        self.assertTrue(original_len == len(binned_features[idx].sparse_vec))
def _get_quantile_median(self):
    """Return {column name: median} using a two-bin quantile split."""
    binner = QuantileBinning(FeatureBinningParam(bin_num=2, cols=self.cols))
    split_points = binner.fit_split_points(self.data_instances)
    # With bin_num=2 the single split point of each column is its median.
    return {name: points[0] for name, points in split_points.items()}
def _get_quantile_median(self):
    """Return {column name: median}, skipping values in self.abnormal_list."""
    cols_index = self._get_cols_index()
    binner = QuantileBinning(FeatureBinningParam(bin_num=2, cols=cols_index),
                             abnormal_list=self.abnormal_list)
    split_points = binner.fit_split_points(self.data_instances)
    # With bin_num=2 the single split point of each column is its median.
    return {name: points[0] for name, points in split_points.items()}
def _static_quantile_summaries(self):
    """Build (once) and return a 2-bin quantile binning object for quantile queries."""
    if self.binning_obj is None:
        bin_param = FeatureBinningParam(bin_num=2,
                                        bin_indexes=self.cols_index,
                                        error=self.error)
        self.binning_obj = QuantileBinning(bin_param,
                                           abnormal_list=self.abnormal_list)
        self.binning_obj.fit_split_points(self.data_instances)
    return self.binning_obj
def fit(self, data_instances):
    """Lazily fit quantile split points on *data_instances*.

    Idempotent: once a bin object exists, later calls are no-ops.
    """
    if self.bin_obj is None:
        params = self.bin_param if self.bin_param is not None else FeatureBinningParam()
        self.bin_param = params
        binner = QuantileBinning(params=params,
                                 abnormal_list=self.abnormal_list,
                                 allow_duplicate=True)
        binner.fit_split_points(data_instances)
        self.bin_obj = binner
    return self
def fit(self, data_instances, bin_param=None):
    """Keep columns whose queried percentile value is below the upper threshold."""
    params = FeatureBinningParam() if bin_param is None else bin_param
    quantile_values = QuantileBinning(params).query_quantile_point(
        data_instances, self.cols, self.percentile)
    for col_name, value in quantile_values.items():
        self.feature_values[col_name] = value
        self.left_cols[col_name] = value < self.upper_threshold
    # Guarantee at least one surviving feature.
    self.left_cols = self._keep_one_feature()
    return self.left_cols
def _load_model(self, model_dict): model_param = list( model_dict.get('model').values())[0].get(MODEL_PARAM_NAME) # self._parse_need_run(model_dict, MODEL_META_NAME) model_meta = list( model_dict.get('model').values())[0].get(MODEL_META_NAME) # model_meta.cols = list(model_meta.cols) # model_meta.transform_param.transform_cols = list(model_meta.transform_param.transform_cols) self.cols = list(map(int, model_meta.cols)) bin_method = str(model_meta.method) if bin_method == consts.QUANTILE: self.binning_obj = QuantileBinning(model_meta, self.party_name) else: self.binning_obj = BucketBinning(model_meta, self.party_name) binning_result_obj = dict(model_param.binning_result.binning_result) host_params = dict(model_param.host_results) self.binning_result = {} self.host_results = {} for col_name, iv_attr_obj in binning_result_obj.items(): iv_attr = IVAttributes([], [], [], [], [], []) iv_attr.reconstruct(iv_attr_obj) self.binning_obj.reconstruct_by_iv_obj(col_name, iv_attr) self.binning_result[col_name] = iv_attr # self.cols.append(col_name) for host_name, host_result_obj in host_params.items(): host_result_obj = dict(host_result_obj.binning_result) for col_name, iv_attr_obj in host_result_obj.items(): iv_attr = IVAttributes([], [], [], [], [], []) iv_attr.reconstruct(iv_attr_obj) host_result_obj[col_name] = iv_attr self.host_results[host_name] = host_result_obj
def load_model(self, model_dict):
    """Rebuild binning objects and host results from serialized protobufs."""
    model_param = list(model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
    model_meta = list(model_dict.get('model').values())[0].get(MODEL_META_NAME)

    self.bin_inner_param = BinInnerParam()

    assert isinstance(model_meta, feature_binning_meta_pb2.FeatureBinningMeta)
    assert isinstance(model_param, feature_binning_param_pb2.FeatureBinningParam)

    self.header = list(model_param.header)
    self.bin_inner_param.set_header(self.header)

    self.bin_inner_param.add_transform_bin_indexes(
        list(model_meta.transform_param.transform_cols))
    self.bin_inner_param.add_bin_names(list(model_meta.cols))
    self.transform_type = model_meta.transform_param.transform_type

    bin_method = str(model_meta.method)
    # Anything that is not quantile falls back to bucket binning.
    if bin_method == consts.QUANTILE:
        self.binning_obj = QuantileBinning(params=model_meta)
    else:
        self.binning_obj = BucketBinning(params=model_meta)

    self.binning_obj.set_role_party(
        self.role, self.component_properties.local_partyid)
    self.binning_obj.set_bin_inner_param(self.bin_inner_param)
    # Restore this party's split points / results from the param pb.
    self.binning_obj.bin_results.reconstruct(model_param.binning_result)

    # Each host's results come back as a bare BaseBinning holder.
    self.host_results = []
    for host_pb in model_param.host_results:
        host_bin_obj = BaseBinning()
        host_bin_obj.bin_results.reconstruct(host_pb)
        self.host_results.append(host_bin_obj)
def _init_binning_obj(self):
    """Instantiate the binning backend matching self.bin_param.method."""
    method = self.bin_param.method
    if method == consts.QUANTILE:
        self.binning_obj = QuantileBinning(self.bin_param, self.party_name)
    elif method == consts.BUCKET:
        self.binning_obj = BucketBinning(self.bin_param, self.party_name)
    else:
        raise ValueError("Binning method: {} is not supported yet".format(self.bin_param.method))
def test_new_dense_quantile(self):
    """Dense-table binning should yield the expected bin ids and split points."""
    binner = QuantileBinning(FeatureBinningParam(bin_num=4))
    binner.fit_split_points(self.dense_table)
    data_bin, split_point_table, _sparse = binner.convert_feature_to_bin(self.dense_table)
    binned_features = {key: inst.features for key, inst in data_bin.collect()}
    expected_splits = np.asarray([3, 7, 11, 15], dtype='int')
    for row in range(100):
        # Every feature of row i is expected to land in bin (i % 16) // 4.
        expected_bins = np.ones(20, dtype='int') * ((row % 16) // 4)
        self.assertTrue((binned_features[row] == expected_bins).all())
        if row < 20:
            column_splits = np.array(split_point_table[row])
            self.assertTrue((column_splits == expected_splits).all())
    for column_splits in split_point_table:
        self.assertTrue(len(column_splits) <= 4)
def _bin_obj_generator(self, abnormal_list: list = None, this_bin_num=bin_num):
    """Build a QuantileBinning object with project-default sketch settings.

    NOTE(review): the default ``this_bin_num=bin_num`` is evaluated once at
    definition time and refers to a ``bin_num`` name outside this block —
    presumably a module/class-level constant; confirm it is defined there.
    """
    bin_param = FeatureBinningParam(method='quantile',
                                    compress_thres=consts.DEFAULT_COMPRESS_THRESHOLD,
                                    head_size=consts.DEFAULT_HEAD_SIZE,
                                    error=consts.DEFAULT_RELATIVE_ERROR,
                                    bin_indexes=-1,
                                    bin_num=this_bin_num)
    bin_obj = QuantileBinning(bin_param, abnormal_list=abnormal_list)
    return bin_obj
def _bin_obj_generator(self):
    """Build a QuantileBinning object over all columns (cols=-1).

    NOTE(review): ``compress_thres``, ``head_size``, ``error`` and ``bin_num``
    are neither locals nor attributes — presumably module-level constants in
    this test/helper module; confirm they are defined there.
    """
    bin_param = FeatureBinningParam(method='quantile',
                                    compress_thres=compress_thres,
                                    head_size=head_size,
                                    error=error,
                                    cols=-1,
                                    bin_num=bin_num)
    bin_obj = QuantileBinning(bin_param)
    return bin_obj
def test_quantile_binning(self):
    """Check each quantile split point falls within its allowed rank window.

    For every column and interior percentile p, the split point must lie
    between the values at ranks p*N - N*error and p*N + N*error (clamped to
    the data range), matching the sketch's relative-error guarantee.

    NOTE: this test is currently disabled via the early ``return`` below
    (kept deliberately to preserve behavior); remove it to re-enable.
    """
    return
    compress_thres = 10000
    head_size = 5000
    error = 0.01
    bin_num = 10
    bin_param = FeatureBinningParam(method='quantile',
                                    compress_thres=compress_thres,
                                    head_size=head_size,
                                    error=error,
                                    cols=self.cols,
                                    bin_num=bin_num)
    quan_bin = QuantileBinning(bin_param)
    split_points = quan_bin.fit_split_points(self.table)
    for col_idx, col in enumerate(self.cols):
        bin_percent = [i * (1.0 / bin_num) for i in range(1, bin_num)]
        feature_idx = self.col_dict.get(col)
        x = sorted(self.numpy_table[:, feature_idx])
        for bin_idx, percent in enumerate(bin_percent):
            # Allowed rank window: percent * N +/- N * error, clamped to bounds.
            min_rank = max(
                int(math.floor(percent * self.data_num - self.data_num * error)), 0)
            max_rank = min(
                int(math.ceil(percent * self.data_num + self.data_num * error)),
                len(x) - 1)
            try:
                self.assertTrue(
                    x[min_rank] <= split_points[col_idx][bin_idx] <= x[max_rank])
            except AssertionError:
                # Was a bare ``except:`` which also swallowed SystemExit /
                # KeyboardInterrupt; only the assertion failure is interesting.
                print(x[min_rank], x[max_rank], split_points[col_idx][bin_idx])
                found_index = x.index(split_points[col_idx][bin_idx])
                print("min_rank: {}, found_rank: {}, max_rank: {}".format(
                    min_rank, found_index, max_rank))
                self.assertTrue(
                    x[min_rank] <= split_points[col_idx][bin_idx] <= x[max_rank])
def convert_feature_to_bin(self, data_instance):
    """Quantile-bin features; missing values get their own bin when enabled."""
    LOGGER.info("convert feature to bins")
    params = FeatureBinningParam(bin_num=self.bin_num)
    if self.use_missing:
        # Treat the NoneType sentinel as abnormal so missing values are set aside.
        binner = QuantileBinning(params, abnormal_list=[NoneType()])
    else:
        binner = QuantileBinning(params)
    binner.fit_split_points(data_instance)
    self.data_bin, self.bin_split_points, self.bin_sparse_points = \
        binner.convert_feature_to_bin(data_instance)
def _init_model(self, params):
    """Store params and pick the binning backend for this party."""
    self.model_param = params
    self.cols_index = params.cols
    method = self.model_param.method
    if method == consts.QUANTILE:
        self.binning_obj = QuantileBinning(self.model_param, self.party_name)
    elif method == consts.BUCKET:
        self.binning_obj = BucketBinning(self.model_param, self.party_name)
    else:
        raise ValueError("Binning method: {} is not supported yet".format(
            self.model_param.method))
def _init_model(self, params: FeatureBinningParam):
    """Pick the binning backend and register role/party on it."""
    self.model_param = params
    self.transform_type = params.transform_param.transform_type
    method = params.method
    if method == consts.QUANTILE:
        self.binning_obj = QuantileBinning(params)
    elif method == consts.BUCKET:
        self.binning_obj = BucketBinning(params)
    else:
        raise ValueError("Binning method: {} is not supported yet".format(
            self.model_param.method))
    LOGGER.debug("in _init_model, role: {}, local_partyid: {}".format(
        self.role, self.component_properties))
    self.binning_obj.set_role_party(
        self.role, self.component_properties.local_partyid)
def fit(self, expect_table, actual_table):
    """Compute per-feature PSI between a baseline and an actual table.

    Bins the baseline (expect) table with quantile binning, applies the same
    split points to the actual table, counts per-bin populations on both
    sides (plus one extra bin for missing values), and derives PSI scores,
    interval strings and percentage breakdowns. All results are cached on
    ``self``; the per-feature totals are also folded into the summary.
    """
    LOGGER.info('start psi computing')

    header1 = expect_table.schema['header']
    header2 = actual_table.schema['header']

    if not set(header1) == set(header2):
        raise ValueError(
            'table header must be the same while computing psi values')

    # baseline table should not contain empty columns
    abnormal_detection.empty_column_detection(expect_table)

    self.all_feature_list = header1

    # make sure no duplicate features
    self.all_feature_list = self.check_duplicates(self.all_feature_list)

    # kv bi-directional mapping
    self.tag_id_mapping = {
        v: k for k, v in enumerate(self.all_feature_list)
    }
    self.id_tag_mapping = {
        k: v for k, v in enumerate(self.all_feature_list)
    }

    if not self.is_sparse(
            expect_table):  # convert missing value: nan to NoneType
        expect_table = self.convert_missing_val(expect_table)

    if not self.is_sparse(
            actual_table):  # convert missing value: nan to NoneType
        actual_table = self.convert_missing_val(actual_table)

    if not (self.check_table_content(expect_table)
            and self.check_table_content(actual_table)):
        raise ValueError(
            'contents of input table must be instances of class "Instance"')

    param = FeatureBinningParam(method=consts.QUANTILE,
                                bin_num=self.max_bin_num,
                                local_only=True,
                                error=self.binning_error)
    binning_obj = QuantileBinning(params=param,
                                  abnormal_list=[NoneType()],
                                  allow_duplicate=False)
    binning_obj.fit_split_points(expect_table)

    data_bin, bin_split_points, bin_sparse_points = binning_obj.convert_feature_to_bin(
        expect_table)
    LOGGER.debug('bin split points is {}, shape is {}'.format(
        bin_split_points, bin_split_points.shape))
    self.binning_obj = binning_obj
    self.data_bin1 = data_bin
    self.bin_split_points = bin_split_points
    self.bin_sparse_points = bin_sparse_points
    LOGGER.debug('expect table binning done')

    count_func1 = functools.partial(
        map_partition_handle,
        feat_num=len(self.all_feature_list),
        max_bin_num=self.max_bin_num + 1,  # an additional bin for missing value
        missing_val=self.dense_missing_val,
        is_sparse=self.is_sparse(self.data_bin1))

    map_rs1 = self.data_bin1.applyPartitions(count_func1)
    count1 = count_rs_to_dict(map_rs1.reduce(map_partition_reduce))

    # Re-use the baseline split points on the actual table.
    data_bin2, bin_split_points2, bin_sparse_points2 = binning_obj.convert_feature_to_bin(
        actual_table)
    self.data_bin2 = data_bin2
    LOGGER.debug('actual table binning done')

    count_func2 = functools.partial(
        map_partition_handle,
        feat_num=len(self.all_feature_list),
        max_bin_num=self.max_bin_num + 1,  # an additional bin for missing value
        missing_val=self.dense_missing_val,
        is_sparse=self.is_sparse(self.data_bin2))

    map_rs2 = self.data_bin2.applyPartitions(count_func2)
    count2 = count_rs_to_dict(map_rs2.reduce(map_partition_reduce))

    self.count1, self.count2 = count1, count2
    LOGGER.info('psi counting done')

    # compute psi from counting result
    psi_result = psi_computer(count1, count2, expect_table.count(),
                              actual_table.count())
    self.psi_rs = psi_result

    # get total psi score of features
    total_scores = {}
    for idx, rs in enumerate(self.psi_rs):
        feat_name = self.id_tag_mapping[idx]
        total_scores[feat_name] = rs['total_psi']
    self.total_scores = total_scores

    # id-feature mapping convert, str interval computation
    self.str_intervals = self.get_string_interval(
        bin_split_points,
        self.id_tag_mapping,
        missing_bin_idx=self.max_bin_num)

    self.interval_perc1 = self.count_dict_to_percentage(
        copy.deepcopy(count1), expect_table.count())
    self.interval_perc2 = self.count_dict_to_percentage(
        copy.deepcopy(count2), actual_table.count())

    self.set_summary(self.generate_summary())
    LOGGER.info('psi computation done')
class MultivariateStatisticalSummary(object):
    """Compute and cache column-wise statistics over a distributed table.

    Provides moment statistics (mean/variance/stddev/max/min), quantile
    queries backed by a lazily-built 2-bin QuantileBinning object,
    missing-value ratios/counts and label histograms.
    """

    def __init__(self, data_instances, cols_index=-1, abnormal_list=None,
                 error=consts.DEFAULT_RELATIVE_ERROR, stat_order=2, bias=True):
        self.finish_fit_statics = False  # Use for static data
        # self.finish_fit_summaries = False   # Use for quantile data
        self.binning_obj: QuantileBinning = None
        self.summary_statistics = None
        self.header = None
        # self.quantile_summary_dict = {}
        self.cols_dict = {}
        # self.medians = None
        self.data_instances = data_instances
        self.cols_index = None
        # Normalize abnormal_list to a list so downstream code can iterate it.
        if not isinstance(abnormal_list, list):
            abnormal_list = [abnormal_list]
        self.abnormal_list = abnormal_list
        self.__init_cols(data_instances, cols_index, stat_order, bias)
        self.label_summary = None
        self.error = error

    def __init_cols(self, data_instances, cols_index, stat_order, bias):
        # Resolve the column selection (-1 means "all columns") and set up
        # the SummaryStatistics accumulator for the selected columns.
        header = data_overview.get_header(data_instances)
        self.header = header
        if cols_index == -1:
            self.cols_index = [i for i in range(len(header))]
        else:
            self.cols_index = cols_index
        LOGGER.debug(
            f"col_index: {cols_index}, self.col_index: {self.cols_index}")
        self.cols_dict = {
            header[indices]: indices for indices in self.cols_index
        }
        self.summary_statistics = SummaryStatistics(
            length=len(self.cols_index),
            abnormal_list=self.abnormal_list,
            stat_order=stat_order,
            bias=bias)

    def _static_sums(self):
        """
        Statics sum, sum_square, max_value, min_value,
        so that variance is available.
        """
        is_sparse = data_overview.is_sparse_data(self.data_instances)
        partition_cal = functools.partial(self.static_in_partition,
                                          cols_index=self.cols_index,
                                          summary_statistics=copy.deepcopy(
                                              self.summary_statistics),
                                          is_sparse=is_sparse)
        # Per-partition accumulation followed by a pairwise merge-reduce.
        self.summary_statistics = self.data_instances.applyPartitions(partition_cal). \
            reduce(lambda x, y: self.copy_merge(x, y))
        # self.summary_statistics = summary_statistic_dict.reduce(self.aggregate_statics)
        self.finish_fit_statics = True

    def _static_quantile_summaries(self):
        """
        Static summaries so that can query a specific quantile point
        """
        if self.binning_obj is not None:
            return self.binning_obj
        bin_param = FeatureBinningParam(bin_num=2,
                                        bin_indexes=self.cols_index,
                                        error=self.error)
        self.binning_obj = QuantileBinning(bin_param,
                                           abnormal_list=self.abnormal_list)
        self.binning_obj.fit_split_points(self.data_instances)
        return self.binning_obj

    @staticmethod
    def copy_merge(s1, s2):
        # Merge without mutating either reduce operand.
        new_s1 = copy.deepcopy(s1)
        return new_s1.merge(s2)

    @staticmethod
    def static_in_partition(data_instances, cols_index, summary_statistics,
                            is_sparse):
        """
        Statics sums, sum_square, max and min value through one traversal

        Parameters
        ----------
        data_instances : DTable
            The input data

        cols_index : indices
            Specify which column(s) need to apply statistic.

        summary_statistics: SummaryStatistics

        Returns
        -------
        Dict of SummaryStatistics object
        """
        for k, instances in data_instances:
            if not is_sparse:
                if isinstance(instances, Instance):
                    features = instances.features
                else:
                    features = instances
                # try:
                #     features = np.array(instances, dtype=float)
                # except ValueError as e:
                #     raise ValueError(f"Static Module accept numeric input only. Error info: {e}")
                # LOGGER.debug(f"In statics, features: {features}")
                row_values = [
                    x for idx, x in enumerate(features) if idx in cols_index
                ]
                # row_values = features[cols_index]
            else:
                # Sparse rows: absent keys default to 0.
                sparse_data = instances.features.get_sparse_vector()
                row_values = np.array(
                    [sparse_data.get(x, 0) for x in cols_index])
            summary_statistics.add_rows(row_values)
        return summary_statistics

    @staticmethod
    def static_summaries_in_partition(data_instances, cols_dict, abnormal_list,
                                      error):
        """
        Statics sums, sum_square, max and min value through one traversal

        Parameters
        ----------
        data_instances : DTable
            The input data

        cols_dict : dict
            Specify which column(s) need to apply statistic.

        abnormal_list: list
            Specify which values are not permitted.

        Returns
        -------
        Dict of QuantileSummaries objects, keyed by column name.
        """
        summary_dict = {}
        for col_name in cols_dict:
            summary_dict[col_name] = QuantileSummaries(
                abnormal_list=abnormal_list, error=error)
        for k, instances in data_instances:
            if isinstance(instances, Instance):
                features = instances.features
            else:
                features = instances
            for col_name, col_index in cols_dict.items():
                value = features[col_index]
                summary_obj = summary_dict[col_name]
                summary_obj.insert(value)
        return summary_dict

    @staticmethod
    def aggregate_statics(s_dict1, s_dict2):
        # Pairwise merge of per-column summaries; tolerates None operands.
        if s_dict1 is None and s_dict2 is None:
            return None
        if s_dict1 is None:
            return s_dict2
        if s_dict2 is None:
            return s_dict1
        new_dict = {}
        for col_name, static_1 in s_dict1.items():
            static_1.merge(s_dict2[col_name])
            new_dict[col_name] = static_1
        return new_dict

    def get_median(self):
        # Median is the 0.5 quantile point of each selected column.
        if self.binning_obj is None:
            self._static_quantile_summaries()
        medians = self.binning_obj.query_quantile_point(query_points=0.5)
        return medians

    @property
    def median(self):
        median_dict = self.get_median()
        return np.array(
            [median_dict[self.header[idx]] for idx in self.cols_index])

    def get_quantile_point(self, quantile):
        """
        Return the specific quantile point value

        Parameters
        ----------
        quantile : float, 0 <= quantile <= 1
            Specify which quantile to query.

        Returns
        -------
        return a dict of result quantile points.
        eg.
        quantile_point = {"x1": 3, "x2": 5... }
        """
        if self.binning_obj is None:
            self._static_quantile_summaries()
        quantile_points = self.binning_obj.query_quantile_point(quantile)
        return quantile_points

    def get_mean(self):
        """
        Return the mean value(s) of the given column

        Returns
        -------
        return a dict of result mean.
        """
        return self.get_statics("mean")

    def get_variance(self):
        return self.get_statics("variance")

    def get_std_variance(self):
        return self.get_statics("stddev")

    def get_max(self):
        return self.get_statics("max_value")

    def get_min(self):
        return self.get_statics("min_value")

    def get_statics(self, data_type):
        """
        Return the specific static value(s) of the given column

        Parameters
        ----------
        data_type : str, "mean", "variance", "stddev", "max_value" or "min_value"
            Specify which type to show.

        Returns
        -------
        return a dict of results keyed by column name. The order is the same as cols.
        """
        if not self.finish_fit_statics:
            self._static_sums()
        # Look up the statistic first on the accumulator, then on self
        # (properties such as `median` / `missing_ratio` live on self).
        if hasattr(self.summary_statistics, data_type):
            result_row = getattr(self.summary_statistics, data_type)
        elif hasattr(self, data_type):
            result_row = getattr(self, data_type)
        else:
            raise ValueError(
                f"Statistic data type: {data_type} cannot be recognized")
        # LOGGER.debug(f"col_index: {self.cols_index}, result_row: {result_row},"
        #              f"header: {self.header}, data_type: {data_type}")
        result = {}
        result_row = result_row.tolist()
        for col_idx, header_idx in enumerate(self.cols_index):
            result[self.header[header_idx]] = result_row[col_idx]
        return result

    def get_missing_ratio(self):
        return self.get_statics("missing_ratio")

    @property
    def missing_ratio(self):
        missing_static_obj = MissingStatistic()
        all_missing_ratio = missing_static_obj.fit(self.data_instances)
        return np.array(
            [all_missing_ratio[self.header[idx]] for idx in self.cols_index])

    @property
    def missing_count(self):
        missing_ratio = self.missing_ratio
        missing_count = missing_ratio * self.data_instances.count()
        return missing_count.astype(int)

    @staticmethod
    def get_label_static_dict(data_instances):
        # Histogram of labels within one partition.
        result_dict = {}
        for instance in data_instances:
            label_key = instance[1].label
            if label_key not in result_dict:
                result_dict[label_key] = 1
            else:
                result_dict[label_key] += 1
        return result_dict

    @staticmethod
    def merge_result_dict(dict_a, dict_b):
        # Element-wise sum of two label histograms (mutates dict_a).
        for k, v in dict_b.items():
            if k in dict_a:
                dict_a[k] += v
            else:
                dict_a[k] = v
        return dict_a

    def get_label_histogram(self):
        label_histogram = self.data_instances.applyPartitions(
            self.get_label_static_dict).reduce(self.merge_result_dict)
        return label_histogram
def load_model(self, model_dict):
    """Restore multi-class binning state (labels, split points, host results)
    from serialized model protobufs.
    """
    model_param = list(
        model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
    model_meta = list(
        model_dict.get('model').values())[0].get(MODEL_META_NAME)

    self.bin_inner_param = BinInnerParam()
    multi_class_result = model_param.multi_class_result
    self.labels = list(multi_class_result.labels)
    # if not self.labels:
    #     self.labels = [0, 1]
    if self.labels:
        self.bin_result = MultiClassBinResult.reconstruct(
            list(multi_class_result.results), self.labels)

    assert isinstance(model_meta, feature_binning_meta_pb2.FeatureBinningMeta)
    assert isinstance(model_param, feature_binning_param_pb2.FeatureBinningParam)

    self.header = list(model_param.header)
    self.bin_inner_param.set_header(self.header)
    self.bin_inner_param.add_transform_bin_indexes(
        list(model_meta.transform_param.transform_cols))
    self.bin_inner_param.add_bin_names(list(model_meta.cols))
    self.transform_type = model_meta.transform_param.transform_type

    bin_method = str(model_meta.method)
    if bin_method == consts.QUANTILE:
        self.binning_obj = QuantileBinning(params=model_meta)
    elif bin_method == consts.OPTIMAL:
        self.binning_obj = OptimalBinning(params=model_meta)
    else:
        self.binning_obj = BucketBinning(params=model_meta)
    # self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)
    self.binning_obj.set_bin_inner_param(self.bin_inner_param)

    # Restore per-column split points from the param pb.
    split_results = dict(model_param.binning_result.binning_result)
    for col_name, sr_pb in split_results.items():
        split_points = list(sr_pb.split_points)
        self.binning_obj.bin_results.put_col_split_points(
            col_name, split_points)

    # self.binning_obj.bin_results.reconstruct(model_param.binning_result)

    self.host_results = []
    host_pbs = list(model_param.multi_class_result.host_results)
    if len(host_pbs):
        if len(self.labels) == 2:
            # Binary case: one result pb per host.
            for host_pb in host_pbs:
                self.host_results.append(
                    MultiClassBinResult.reconstruct(host_pb, self.labels))
        else:
            # Multi-class: each host contributes len(labels) consecutive pbs.
            assert len(host_pbs) % len(self.labels) == 0
            i = 0
            while i < len(host_pbs):
                this_pbs = host_pbs[i:i + len(self.labels)]
                self.host_results.append(
                    MultiClassBinResult.reconstruct(this_pbs, self.labels))
                i += len(self.labels)

    if list(model_param.header_anonymous):
        self.header_anonymous = list(model_param.header_anonymous)
class HomoFeatureBinningClient(object):
    """Client side of homogeneous (cross-party averaged) feature binning.

    Local split points are fitted with QuantileBinning and averaged across
    parties through a secure mean aggregator.
    """

    def __init__(self, bin_method=consts.QUANTILE):
        self.aggregator = secure_mean_aggregator.Client(
            enable_secure_aggregate=True)
        self.suffix = tuple()  # transfer-variable suffix for aggregation rounds
        self.bin_method = bin_method
        self.bin_obj: QuantileBinning = None
        self.bin_param = None
        self.abnormal_list = None

    def set_suffix(self, suffix):
        self.suffix = suffix

    def average_run(self, data_instances, bin_num=10, abnormal_list=None):
        """Fit local quantile split points and return the cross-party average."""
        if self.bin_param is None:
            bin_param = FeatureBinningParam(bin_num=bin_num)
            self.bin_param = bin_param
        else:
            bin_param = self.bin_param

        if self.bin_method == consts.QUANTILE:
            bin_obj = QuantileBinning(params=bin_param,
                                      abnormal_list=abnormal_list,
                                      allow_duplicate=True)
        else:
            raise ValueError(
                "H**o Split Point do not accept bin_method: {}".format(
                    self.bin_method))

        # Reject empty tables / empty feature columns before fitting.
        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)

        split_points = bin_obj.fit_split_points(data_instances)
        split_points = {k: np.array(v) for k, v in split_points.items()}
        split_points_weights = DictWeights(d=split_points)

        self.aggregator.send_model(split_points_weights, self.suffix)
        dict_split_points = self.aggregator.get_aggregated_model(self.suffix)
        split_points = {
            k: list(v) for k, v in dict_split_points.unboxed.items()
        }
        self.bin_obj = bin_obj
        return split_points

    def convert_feature_to_bin(self, data_instances, split_points=None):
        # No-op until a bin object has been fitted.
        if self.bin_obj is None:
            return None, None, None
        return self.bin_obj.convert_feature_to_bin(data_instances,
                                                   split_points)

    def set_bin_param(self, bin_param: FeatureBinningParam):
        if self.bin_param is not None:
            raise RuntimeError("Bin param has been set and it's immutable")
        self.bin_param = bin_param
        return self

    def set_abnormal_list(self, abnormal_list):
        self.abnormal_list = abnormal_list
        return self

    def fit(self, data_instances):
        """Fit local split points once; subsequent calls are no-ops."""
        if self.bin_obj is not None:
            return self
        if self.bin_param is None:
            self.bin_param = FeatureBinningParam()
        self.bin_obj = QuantileBinning(params=self.bin_param,
                                       abnormal_list=self.abnormal_list,
                                       allow_duplicate=True)
        self.bin_obj.fit_split_points(data_instances)
        return self

    def query_quantile_points(self, data_instances, quantile_points):
        """Query local quantile points and return the aggregated (averaged) ones."""
        if self.bin_obj is None:
            self.fit(data_instances)

        # bin_col_names = self.bin_obj.bin_inner_param.bin_names
        query_result = self.bin_obj.query_quantile_point(quantile_points)
        query_points = DictWeights(d=query_result)
        suffix = tuple(list(self.suffix) + [str(quantile_points)])
        self.aggregator.send_model(query_points, suffix)
        query_points = self.aggregator.get_aggregated_model(suffix)
        query_points = {k: v for k, v in query_points.unboxed.items()}
        return query_points