def transform_regression_label(self, data_inst):
    """Map every instance's label to a bin number and return them as a list.

    The split points used are ``self.split_points`` followed by one extra
    edge equal to the last split point plus one.

    Parameters
    ----------
    data_inst :
        Object exposing ``mapValues`` and ``collect``; each value must
        carry a ``label`` attribute.

    Returns
    -------
    list
        One bin number per instance, in the order yielded by ``collect()``.
    """
    upper_edge = self.split_points[-1] + 1
    # NOTE(review): assumes self.split_points is a plain list, so ``+``
    # concatenates into a new list instead of mutating the attribute.
    extended_points = self.split_points + [upper_edge]

    def locate_bin(instance):
        # Captures only the local list, never ``self``.
        return BaseBinning.get_bin_num(instance.label, extended_points)

    label_bins = data_inst.mapValues(locate_bin)
    return [bin_num for _, bin_num in label_bins.collect()]
def load_model(self, model_dict):
    """Rebuild the binning state of this component from a saved model.

    The first entry stored under ``model_dict['model']`` supplies the
    protobuf meta (``MODEL_META_NAME``) and param (``MODEL_PARAM_NAME``)
    objects. From them this method restores the header, the inner binning
    parameters, the transform type, the local binning object and one
    ``BaseBinning`` per saved host result.

    Parameters
    ----------
    model_dict : dict
        Must contain a ``'model'`` mapping whose first value holds the
        meta and param objects.

    Raises
    ------
    AssertionError
        If the meta or param object is not of the expected protobuf type.
    """
    saved_model = list(model_dict.get('model').values())[0]
    model_param = saved_model.get(MODEL_PARAM_NAME)
    model_meta = saved_model.get(MODEL_META_NAME)

    # The inner param is reset before the type checks, as in the original flow.
    self.bin_inner_param = BinInnerParam()
    assert isinstance(model_meta, feature_binning_meta_pb2.FeatureBinningMeta)
    assert isinstance(model_param, feature_binning_param_pb2.FeatureBinningParam)

    self.header = list(model_param.header)
    self.bin_inner_param.set_header(self.header)
    self.bin_inner_param.add_transform_bin_indexes(
        list(model_meta.transform_param.transform_cols))
    self.bin_inner_param.add_bin_names(list(model_meta.cols))
    self.transform_type = model_meta.transform_param.transform_type

    # Every method other than quantile falls back to bucket binning.
    is_quantile = str(model_meta.method) == consts.QUANTILE
    binning_cls = QuantileBinning if is_quantile else BucketBinning
    self.binning_obj = binning_cls(params=model_meta)

    self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)
    self.binning_obj.set_bin_inner_param(self.bin_inner_param)
    self.binning_obj.bin_results.reconstruct(model_param.binning_result)

    self.host_results = []
    for saved_host_result in model_param.host_results:
        restored_host = BaseBinning()
        restored_host.bin_results.reconstruct(saved_host_result)
        self.host_results.append(restored_host)
def convert(instances):
    """Replace the selected feature values of one instance with WOE values.

    ``is_sparse``, ``transform_cols_idx``, ``abnormal_list``,
    ``bin_inner_param``, ``split_points_dict`` and ``bin_res`` are not
    parameters; they are resolved from the enclosing scope.

    A value is converted only when its column index is in
    ``transform_cols_idx`` and the value is not in ``abnormal_list``;
    every other value is kept unchanged.

    Parameters
    ----------
    instances :
        Object with a ``features`` attribute. When ``is_sparse`` is true the
        features must expose ``get_all_data()`` and ``get_shape()``;
        otherwise they must be a ``numpy.ndarray``.

    Returns
    -------
    The same ``instances`` object, with ``features`` replaced (sparse case)
    or updated in place (dense case).

    Raises
    ------
    AssertionError
        In the dense case, if ``features`` is not a ``numpy.ndarray``.
    """
    # One set serves both branches, so membership tests are O(1) even when
    # ``transform_cols_idx`` is a list.
    # NOTE(review): this is rebuilt on every call; hoisting it into the
    # enclosing scope would avoid the repeated work.
    target_cols = set(transform_cols_idx)

    def lookup_woe(col_idx, col_value):
        """Return the WOE value of the bin that ``col_value`` falls into."""
        col_name = bin_inner_param.header[col_idx]
        bin_num = BaseBinning.get_bin_num(col_value, split_points_dict[col_name])
        return bin_res.all_cols_results.get(col_name).woe_array[bin_num]

    if is_sparse:
        all_data = instances.features.get_all_data()
        data_shape = instances.features.get_shape()
        indice = []
        sparse_value = []
        for col_idx, col_value in all_data:
            # Abnormal values and untouched columns pass through unchanged.
            if col_idx in target_cols and col_value not in abnormal_list:
                col_value = lookup_woe(col_idx, col_value)
            indice.append(col_idx)
            sparse_value.append(col_value)
        instances.features = SparseVector(indice, sparse_value, data_shape)
    else:
        features = instances.features
        assert isinstance(features, np.ndarray)
        for col_idx, col_value in enumerate(features):
            if col_idx in target_cols and col_value not in abnormal_list:
                # NOTE(review): the array keeps its dtype on assignment;
                # confirm features are floating point so WOE is not truncated.
                features[col_idx] = lookup_woe(col_idx, col_value)
        instances.features = features
    return instances
def cal_local_iv(self, data_instances, split_points, labels=None, label_counts=None,
                 bin_cols_map=None, label_table=None):
    """
    Compute per-bin label counts for the local features and derive the IV
    result from them.

    Parameters
    ----------
    data_instances : Table
        Input data; its ``schema`` must provide a ``"header"`` entry.
    split_points : dict
        Column name -> split points. Every entry is also recorded on the
        returned result object.
    labels : optional
        Passed to ``convert_label`` and ``cal_iv_from_counts``. It is
        overwritten with the observed labels when ``label_counts`` is None.
    label_counts : optional
        When None, it is derived from ``data_overview.get_label_count`` and
        turned into a list ordered like ``labels``. When supplied, it is
        used as given and ``labels`` is not touched.
    bin_cols_map : dict, optional
        Column name -> column index. Defaults to every column of the header.
    label_table : Table, optional
        When None, it is built with ``self.convert_label``.

    Returns
    -------
    The object produced by ``self.cal_iv_from_counts`` (described by the
    original documentation as a MultiClassBinResult object).
    """
    header = data_instances.schema.get("header")

    if bin_cols_map is not None:
        # Keep header order; columns missing from the map are skipped.
        bin_indexes = [bin_cols_map[name] for name in header if name in bin_cols_map]
    else:
        bin_cols_map = {name: idx for idx, name in enumerate(header)}
        bin_indexes = list(range(len(header)))

    if label_counts is None:
        count_by_label = data_overview.get_label_count(data_instances)
        labels = list(count_by_label.keys())
        label_counts = [count_by_label[label] for label in labels]

    data_bin_table = BaseBinning.get_data_bin(data_instances, split_points, bin_cols_map)
    sparse_bins_by_index = BaseBinning.get_sparse_bin(bin_indexes, split_points, header)
    # Re-key the sparse bins by column name instead of column index.
    sparse_bin_points = {
        header[col_idx]: bin_value
        for col_idx, bin_value in sparse_bins_by_index.items()
    }

    if label_table is None:
        label_table = self.convert_label(data_instances, labels)

    result_counts = self.cal_bin_label(
        data_bin_table, sparse_bin_points, label_table, label_counts)
    multi_bin_res = self.cal_iv_from_counts(
        result_counts, labels, role=self.role, party_id=self.party_id)

    for col_name, col_split_points in split_points.items():
        multi_bin_res.put_col_split_points(col_name, col_split_points)
    return multi_bin_res
def fit(self, data_instances):
    """
    Apply the binning method to the local data instances as well as to the
    features of the other parties, then calculate the metric values for the
    selected columns.

    Currently, IV is supported for binary labeled data only.

    Parameters
    ----------
    data_instances : Table
        Local data. Labels are read from it and sent, encrypted, to
        ``consts.HOST`` (presumably this is the guest side; confirm against
        the enclosing class).

    Returns
    -------
    ``self.data_output``, returned after ``self.transform`` has been called.

    Raises
    ------
    ValueError
        If more than two distinct labels are found.
    NotImplementedError
        If the configured encrypt method is not ``consts.PAILLIER``.
    """
    LOGGER.info("Start feature binning fit and transform")
    self._abnormal_detection(data_instances)
    self._setup_bin_inner_param(data_instances, self.model_param)
    self.binning_obj.fit_split_points(data_instances)

    if self.model_param.skip_static:
        # Statistics are skipped: transform with the fitted split points only.
        self.transform(data_instances)
        return self.data_output

    label_counts = data_overview.get_label_count(data_instances)
    if len(label_counts) > 2:
        raise ValueError("Iv calculation support binary-data only in this version.")

    data_instances = data_instances.mapValues(self.load_data)
    self.set_schema(data_instances)
    label_table = data_instances.mapValues(lambda x: x.label)

    if self.model_param.local_only:
        LOGGER.info("This is a local only binning fit")
        self.binning_obj.cal_local_iv(data_instances, label_table=label_table,
                                      label_counts=label_counts)
        self.transform(data_instances)
        self.set_summary(self.binning_obj.bin_results.summary())
        return self.data_output

    if self.model_param.encrypt_param.method == consts.PAILLIER:
        cipher = PaillierEncrypt()
        cipher.generate_key(self.model_param.encrypt_param.key_length)
    else:
        raise NotImplementedError("encrypt method not supported yet")

    # Labels leave this party only in encrypted form.
    f = functools.partial(self.encrypt, cipher=cipher)
    encrypted_label_table = label_table.mapValues(f)
    self.transfer_variable.encrypted_label.remote(encrypted_label_table,
                                                  role=consts.HOST,
                                                  idx=-1)
    LOGGER.info("Sent encrypted_label_table to host")

    # Local statistics are computed after the labels have been sent.
    self.binning_obj.cal_local_iv(data_instances, label_table=label_table,
                                  label_counts=label_counts)

    encrypted_bin_sum_infos = self.transfer_variable.encrypted_bin_sum.get(idx=-1)
    host_bin_infos = self.transfer_variable.optimal_info.get(idx=-1)
    total_summary = self.binning_obj.bin_results.summary()
    LOGGER.info("Get encrypted_bin_sum from host")

    # Both received lists are indexed by the same host position.
    for host_idx, host_bin_info in enumerate(host_bin_infos):
        host_party_id = self.component_properties.host_party_idlist[host_idx]
        encrypted_bin_sum = encrypted_bin_sum_infos[host_idx]
        result_counts = self.cipher_decompress(encrypted_bin_sum, cipher)

        host_bin_methods = host_bin_info['bin_method']
        category_names = host_bin_info['category_names']

        if host_bin_methods == consts.OPTIMAL:
            # Run optimal binning for this host with the host's own settings,
            # applied to a private copy of the local parameters.
            optimal_binning_params = host_bin_info['optimal_params']

            host_model_params = copy.deepcopy(self.model_param)
            host_model_params.bin_num = optimal_binning_params.get('bin_num')
            host_model_params.optimal_binning_param.metric_method = \
                optimal_binning_params.get('metric_method')
            host_model_params.optimal_binning_param.mixture = \
                optimal_binning_params.get('mixture')
            host_model_params.optimal_binning_param.max_bin_pct = \
                optimal_binning_params.get('max_bin_pct')
            host_model_params.optimal_binning_param.min_bin_pct = \
                optimal_binning_params.get('min_bin_pct')

            self.binning_obj.event_total, self.binning_obj.non_event_total = \
                self.get_histogram(data_instances)
            result_counts = dict(result_counts.collect())

            # Category columns bypass optimal binning and are scored directly.
            optimal_binning_cols = {x: y for x, y in result_counts.items()
                                    if x not in category_names}
            host_binning_obj = self.optimal_binning_sync(optimal_binning_cols,
                                                         data_instances.count(),
                                                         data_instances.partitions,
                                                         host_idx,
                                                         host_model_params)
            category_bins = {x: y for x, y in result_counts.items()
                             if x in category_names}
            host_binning_obj.cal_iv_woe(category_bins,
                                        self.model_param.adjustment_factor)
        else:
            host_binning_obj = BaseBinning()
            host_binning_obj.cal_iv_woe(result_counts,
                                        self.model_param.adjustment_factor)

        host_binning_obj.set_role_party(role=consts.HOST, party_id=host_party_id)
        total_summary = self._merge_summary(total_summary,
                                            host_binning_obj.bin_results.summary())
        self.host_results.append(host_binning_obj)

    self.set_schema(data_instances)
    self.transform(data_instances)
    LOGGER.info("Finish feature binning fit and transform")
    # Only real binning results are published in the summary.
    self.set_summary(total_summary)
    return self.data_output
def fit(self, data_instances):
    """
    Fit split points on the local data, compute local IV statistics,
    collect every host's encrypted bin sums to compute theirs, and finally
    transform the data.

    IV is only computed for binary labeled data.

    Parameters
    ----------
    data_instances : Table
        Local data. Labels are read from it and sent, encrypted, to
        ``consts.HOST``.

    Returns
    -------
    ``self.data_output``, returned after ``self.transform`` has been called.

    Raises
    ------
    ValueError
        If ``data_overview.count_labels`` reports more than two labels.
    """
    LOGGER.info("Start feature binning fit and transform")
    self._abnormal_detection(data_instances)
    self._setup_bin_inner_param(data_instances, self.model_param)
    self.binning_obj.fit_split_points(data_instances)

    label_counts = data_overview.count_labels(data_instances)
    if label_counts > 2:
        raise ValueError("Iv calculation support binary-data only in this version.")

    data_instances = data_instances.mapValues(self.load_data)
    self.set_schema(data_instances)
    label_table = data_instances.mapValues(lambda inst: inst.label)

    if self.model_param.local_only:
        LOGGER.info("This is a local only binning fit")
        self.binning_obj.cal_local_iv(data_instances, label_table=label_table)
        self.transform(data_instances)
        return self.data_output

    cipher = PaillierEncrypt()
    cipher.generate_key()
    encrypt_label = functools.partial(self.encrypt, cipher=cipher)
    encrypted_label_table = label_table.mapValues(encrypt_label)
    self.transfer_variable.encrypted_label.remote(
        encrypted_label_table, role=consts.HOST, idx=-1)
    LOGGER.info("Sent encrypted_label_table to host")

    # Local statistics are computed after the labels have been sent.
    self.binning_obj.cal_local_iv(data_instances, label_table=label_table)

    host_payloads = self.transfer_variable.encrypted_bin_sum.get(idx=-1)
    LOGGER.info("Get encrypted_bin_sum from host")

    # Fields of the optimal-binning parameters copied over from each host.
    optimal_fields = ('metric_method', 'mixture', 'max_bin_pct', 'min_bin_pct')

    for host_idx, payload in enumerate(host_payloads):
        host_party_id = self.component_properties.host_party_idlist[host_idx]
        encrypted_bin_sum = payload['encrypted_bin_sum']
        host_bin_methods = payload['bin_method']
        category_names = payload['category_names']

        result_counts = self.__decrypt_bin_sum(encrypted_bin_sum, cipher)
        LOGGER.debug("Received host {} result, length of buckets: {}".format(
            host_idx, len(result_counts)))
        LOGGER.debug("category_name: {}, host_bin_methods: {}".format(
            category_names, host_bin_methods))

        if host_bin_methods == consts.OPTIMAL:
            host_settings = payload['optimal_params']

            # Work on a private copy so the local parameters stay untouched.
            host_model_params = copy.deepcopy(self.model_param)
            host_model_params.bin_num = host_settings.get('bin_num')
            for field_name in optimal_fields:
                field_value = host_settings.get(field_name)
                setattr(host_model_params.optimal_binning_param,
                        field_name, field_value)

            event_total, non_event_total = self.get_histogram(data_instances)
            self.binning_obj.event_total = event_total
            self.binning_obj.non_event_total = non_event_total

            # Category columns bypass optimal binning and are scored directly.
            optimal_binning_cols = {
                col: counts
                for col, counts in result_counts.items()
                if col not in category_names
            }
            host_binning_obj = self.optimal_binning_sync(
                optimal_binning_cols,
                data_instances.count(),
                data_instances._partitions,
                host_idx,
                host_model_params)
            category_bins = {
                col: counts
                for col, counts in result_counts.items()
                if col in category_names
            }
            host_binning_obj.cal_iv_woe(category_bins,
                                        self.model_param.adjustment_factor)
        else:
            host_binning_obj = BaseBinning()
            host_binning_obj.cal_iv_woe(result_counts,
                                        self.model_param.adjustment_factor)

        host_binning_obj.set_role_party(role=consts.HOST, party_id=host_party_id)
        self.host_results.append(host_binning_obj)

    self.set_schema(data_instances)
    self.transform(data_instances)
    LOGGER.info("Finish feature binning fit and transform")
    return self.data_output