Example #1
0
 def transform_regression_label(self, data_inst):
     """Bin every instance's regression label and return the binned labels.

     An extra upper edge (last split point + 1) is appended to the split
     points so the maximum label still falls inside the final bin.
     """
     upper_edge = self.split_points[-1] + 1
     boundaries = self.split_points + [upper_edge]

     def _to_bin_num(inst):
         return BaseBinning.get_bin_num(inst.label, boundaries)

     binned_table = data_inst.mapValues(_to_bin_num)
     return [bin_num for _, bin_num in binned_table.collect()]
    def load_model(self, model_dict):
        """Restore binning state (meta, param, inner param, host results) from a model dict."""
        model_content = list(model_dict.get('model').values())[0]
        model_param = model_content.get(MODEL_PARAM_NAME)
        model_meta = model_content.get(MODEL_META_NAME)

        self.bin_inner_param = BinInnerParam()

        # Guard against malformed model content before reading fields.
        assert isinstance(model_meta, feature_binning_meta_pb2.FeatureBinningMeta)
        assert isinstance(model_param, feature_binning_param_pb2.FeatureBinningParam)

        self.header = list(model_param.header)
        self.bin_inner_param.set_header(self.header)
        self.bin_inner_param.add_transform_bin_indexes(
            list(model_meta.transform_param.transform_cols))
        self.bin_inner_param.add_bin_names(list(model_meta.cols))
        self.transform_type = model_meta.transform_param.transform_type

        # Recreate the binning object that produced this model.
        if str(model_meta.method) == consts.QUANTILE:
            self.binning_obj = QuantileBinning(params=model_meta)
        else:
            self.binning_obj = BucketBinning(params=model_meta)

        self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)
        self.binning_obj.set_bin_inner_param(self.bin_inner_param)
        self.binning_obj.bin_results.reconstruct(model_param.binning_result)

        # Rebuild one result holder per host party.
        self.host_results = []
        for host_pb in model_param.host_results:
            holder = BaseBinning()
            holder.bin_results.reconstruct(host_pb)
            self.host_results.append(holder)
Example #3
0
        def convert(instances):
            """Replace each transformable feature value with its bin's WOE value.

            Mutates ``instances.features`` in place for both sparse and dense
            rows and returns the instance. Values present in ``abnormal_list``
            (and columns outside the transform set) are passed through
            unchanged. Closure variables (``is_sparse``, ``transform_cols_idx``,
            ``abnormal_list``, ``bin_inner_param``, ``split_points_dict``,
            ``bin_res``) come from the enclosing scope.
            """
            # Build the membership set once per row: the original sparse branch
            # did an O(n) list lookup per feature while the dense branch
            # already used a set.
            transform_cols_idx_set = set(transform_cols_idx)

            def _woe_of(col_idx, col_value):
                # Map a raw value to the WOE of the bin it falls into.
                col_name = bin_inner_param.header[col_idx]
                split_points = split_points_dict[col_name]
                bin_num = BaseBinning.get_bin_num(col_value, split_points)
                col_results = bin_res.all_cols_results.get(col_name)
                return col_results.woe_array[bin_num]

            if is_sparse:
                all_data = instances.features.get_all_data()
                indice = []
                sparse_value = []
                data_shape = instances.features.get_shape()
                for col_idx, col_value in all_data:
                    indice.append(col_idx)
                    if (col_idx in transform_cols_idx_set
                            and col_value not in abnormal_list):
                        sparse_value.append(_woe_of(col_idx, col_value))
                    else:
                        # Non-transform column or abnormal value: keep as-is.
                        sparse_value.append(col_value)
                instances.features = SparseVector(indice, sparse_value, data_shape)
            else:
                features = instances.features
                assert isinstance(features, np.ndarray)
                for col_idx, col_value in enumerate(features):
                    if (col_idx in transform_cols_idx_set
                            and col_value not in abnormal_list):
                        features[col_idx] = _woe_of(col_idx, col_value)
                instances.features = features
            return instances
Example #4
0
    def cal_local_iv(self, data_instances, split_points,
                     labels=None, label_counts=None, bin_cols_map=None,
                     label_table=None):
        """Compute IV statistics for the local party's features.

        Bins every instance with ``split_points``, counts label occurrences
        per bin, and derives IV/WOE per column.

        Any of ``labels``/``label_counts``/``bin_cols_map``/``label_table``
        left as None is derived from ``data_instances``.

        Returns:
            MultiClassBinResult object, with the split points attached.
        """
        header = data_instances.schema.get("header")
        if bin_cols_map is None:
            # Default: bin every column present in the header.
            bin_cols_map = {}
            bin_indexes = []
            for idx, name in enumerate(header):
                bin_cols_map[name] = idx
                bin_indexes.append(idx)
        else:
            bin_indexes = [bin_cols_map[name] for name in header if name in bin_cols_map]

        if label_counts is None:
            counts_by_label = data_overview.get_label_count(data_instances)
            labels = list(counts_by_label.keys())
            label_counts = [counts_by_label[lbl] for lbl in labels]

        data_bin_table = BaseBinning.get_data_bin(data_instances, split_points, bin_cols_map)
        raw_sparse_points = BaseBinning.get_sparse_bin(bin_indexes, split_points, header)
        # Re-key sparse bin points by column name instead of column index.
        sparse_bin_points = {header[idx]: bin_num
                             for idx, bin_num in raw_sparse_points.items()}

        if label_table is None:
            label_table = self.convert_label(data_instances, labels)

        bin_label_counts = self.cal_bin_label(data_bin_table, sparse_bin_points,
                                              label_table, label_counts)
        result = self.cal_iv_from_counts(bin_label_counts, labels,
                                         role=self.role,
                                         party_id=self.party_id)
        for col_name, col_split_points in split_points.items():
            result.put_col_split_points(col_name, col_split_points)
        return result
Example #5
0
    def fit(self, data_instances):
        """
        Fit feature binning for the local (guest) party and federated hosts,
        then transform the data.

        Computes split points locally and calculates local IV (binary labels
        only). Unless ``local_only``, Paillier-encrypted labels are sent to
        the hosts, whose encrypted bin statistics are fetched, decrypted and
        turned into per-host IV/WOE results. Returns ``self.data_output``
        (the transformed data).

        Raises:
            ValueError: if the data has more than two distinct labels.
            NotImplementedError: if the configured encrypt method is not Paillier.
        """
        LOGGER.info("Start feature binning fit and transform")
        self._abnormal_detection(data_instances)

        self._setup_bin_inner_param(data_instances, self.model_param)

        self.binning_obj.fit_split_points(data_instances)
        if self.model_param.skip_static:
            # Statistics (IV/WOE) are skipped entirely: just transform.
            self.transform(data_instances)
            return self.data_output

        label_counts = data_overview.get_label_count(data_instances)
        if len(label_counts) > 2:
            raise ValueError("Iv calculation support binary-data only in this version.")

        data_instances = data_instances.mapValues(self.load_data)
        self.set_schema(data_instances)
        label_table = data_instances.mapValues(lambda x: x.label)

        if self.model_param.local_only:
            LOGGER.info("This is a local only binning fit")
            self.binning_obj.cal_local_iv(data_instances, label_table=label_table,
                                          label_counts=label_counts)
            self.transform(data_instances)
            self.set_summary(self.binning_obj.bin_results.summary())
            return self.data_output

        if self.model_param.encrypt_param.method == consts.PAILLIER:
            cipher = PaillierEncrypt()
            cipher.generate_key(self.model_param.encrypt_param.key_length)
        else:
            raise NotImplementedError("encrypt method not supported yet")

        f = functools.partial(self.encrypt, cipher=cipher)
        encrypted_label_table = label_table.mapValues(f)

        self.transfer_variable.encrypted_label.remote(encrypted_label_table,
                                                      role=consts.HOST,
                                                      idx=-1)
        LOGGER.info("Sent encrypted_label_table to host")

        self.binning_obj.cal_local_iv(data_instances, label_table=label_table,
                                      label_counts=label_counts)

        encrypted_bin_sum_infos = self.transfer_variable.encrypted_bin_sum.get(idx=-1)
        encrypted_bin_infos = self.transfer_variable.optimal_info.get(idx=-1)
        total_summary = self.binning_obj.bin_results.summary()

        LOGGER.info("Get encrypted_bin_sum from host")
        for host_idx, encrypted_bin_info in enumerate(encrypted_bin_infos):
            host_party_id = self.component_properties.host_party_idlist[host_idx]
            encrypted_bin_sum = encrypted_bin_sum_infos[host_idx]
            result_counts = self.cipher_decompress(encrypted_bin_sum, cipher)

            host_bin_methods = encrypted_bin_info['bin_method']
            category_names = encrypted_bin_info['category_names']
            if host_bin_methods == consts.OPTIMAL:
                # Optimal binning: rebuild the host's binning parameters and
                # run the optimal-binning sync for its non-category columns.
                optimal_binning_params = encrypted_bin_info['optimal_params']

                host_model_params = copy.deepcopy(self.model_param)
                host_model_params.bin_num = optimal_binning_params.get('bin_num')
                host_model_params.optimal_binning_param.metric_method = optimal_binning_params.get('metric_method')
                host_model_params.optimal_binning_param.mixture = optimal_binning_params.get('mixture')
                host_model_params.optimal_binning_param.max_bin_pct = optimal_binning_params.get('max_bin_pct')
                host_model_params.optimal_binning_param.min_bin_pct = optimal_binning_params.get('min_bin_pct')

                self.binning_obj.event_total, self.binning_obj.non_event_total = self.get_histogram(data_instances)
                result_counts = dict(result_counts.collect())
                optimal_binning_cols = {x: y for x, y in result_counts.items() if x not in category_names}
                host_binning_obj = self.optimal_binning_sync(optimal_binning_cols, data_instances.count(),
                                                             data_instances.partitions,
                                                             host_idx, host_model_params)
                # Category columns bypass optimal binning and go straight to IV/WOE.
                category_bins = {x: y for x, y in result_counts.items() if x in category_names}
                host_binning_obj.cal_iv_woe(category_bins, self.model_param.adjustment_factor)
            else:
                host_binning_obj = BaseBinning()
                host_binning_obj.cal_iv_woe(result_counts, self.model_param.adjustment_factor)
            host_binning_obj.set_role_party(role=consts.HOST, party_id=host_party_id)
            total_summary = self._merge_summary(total_summary,
                                                host_binning_obj.bin_results.summary())
            self.host_results.append(host_binning_obj)

        self.set_schema(data_instances)
        self.transform(data_instances)
        LOGGER.info("Finish feature binning fit and transform")
        # Fixed: removed leftover debug entry (total_summary['test'] = 'test')
        # that polluted the reported summary.
        self.set_summary(total_summary)
        return self.data_output
    def fit(self, data_instances):
        """
        Apply binning method for both data instances in local party as well as the other one. Afterwards, calculate
        the specific metric value for specific columns. Currently, iv is support for binary labeled data only.

        Flow: compute split points locally, calculate local IV, then (unless
        local_only) send Paillier-encrypted labels to every host, receive each
        host's encrypted bin sums, decrypt them, and build per-host IV/WOE
        results. Returns ``self.data_output`` (the transformed data).
        """
        LOGGER.info("Start feature binning fit and transform")
        # Sanity-check the input table before doing any work.
        self._abnormal_detection(data_instances)

        # self._parse_cols(data_instances)
        self._setup_bin_inner_param(data_instances, self.model_param)

        self.binning_obj.fit_split_points(data_instances)

        label_counts = data_overview.count_labels(data_instances)
        # NOTE(review): this compares the result of count_labels() directly
        # to 2, so it presumably returns the number of distinct labels (an
        # int) — confirm against data_overview; a sibling version of this
        # method uses get_label_count() and len(...) instead.
        if label_counts > 2:
            raise ValueError(
                "Iv calculation support binary-data only in this version.")

        data_instances = data_instances.mapValues(self.load_data)
        self.set_schema(data_instances)
        # One-column table of raw labels, keyed like data_instances.
        label_table = data_instances.mapValues(lambda x: x.label)

        if self.model_param.local_only:
            # No federation: compute local IV, transform, and stop here.
            LOGGER.info("This is a local only binning fit")
            self.binning_obj.cal_local_iv(data_instances,
                                          label_table=label_table)
            self.transform(data_instances)
            return self.data_output

        # Encrypt labels so hosts can aggregate event counts without seeing them.
        cipher = PaillierEncrypt()
        cipher.generate_key()

        f = functools.partial(self.encrypt, cipher=cipher)
        encrypted_label_table = label_table.mapValues(f)

        # idx=-1 broadcasts to every host party.
        self.transfer_variable.encrypted_label.remote(encrypted_label_table,
                                                      role=consts.HOST,
                                                      idx=-1)
        LOGGER.info("Sent encrypted_label_table to host")

        self.binning_obj.cal_local_iv(data_instances, label_table=label_table)

        # idx=-1 gathers one entry per host, ordered like host_party_idlist.
        encrypted_bin_infos = self.transfer_variable.encrypted_bin_sum.get(
            idx=-1)
        # LOGGER.debug("encrypted_bin_sums: {}".format(encrypted_bin_sums))

        LOGGER.info("Get encrypted_bin_sum from host")
        for host_idx, encrypted_bin_info in enumerate(encrypted_bin_infos):
            host_party_id = self.component_properties.host_party_idlist[
                host_idx]
            encrypted_bin_sum = encrypted_bin_info['encrypted_bin_sum']
            host_bin_methods = encrypted_bin_info['bin_method']
            category_names = encrypted_bin_info['category_names']
            # Only the guest holds the private key, so decryption happens here.
            result_counts = self.__decrypt_bin_sum(encrypted_bin_sum, cipher)
            LOGGER.debug(
                "Received host {} result, length of buckets: {}".format(
                    host_idx, len(result_counts)))
            LOGGER.debug("category_name: {}, host_bin_methods: {}".format(
                category_names, host_bin_methods))
            # if self.model_param.method == consts.OPTIMAL:
            if host_bin_methods == consts.OPTIMAL:
                # Rebuild the host's optimal-binning parameters on a copy of
                # our own model params, then run the optimal-binning sync.
                optimal_binning_params = encrypted_bin_info['optimal_params']

                host_model_params = copy.deepcopy(self.model_param)
                host_model_params.bin_num = optimal_binning_params.get(
                    'bin_num')
                host_model_params.optimal_binning_param.metric_method = optimal_binning_params.get(
                    'metric_method')
                host_model_params.optimal_binning_param.mixture = optimal_binning_params.get(
                    'mixture')
                host_model_params.optimal_binning_param.max_bin_pct = optimal_binning_params.get(
                    'max_bin_pct')
                host_model_params.optimal_binning_param.min_bin_pct = optimal_binning_params.get(
                    'min_bin_pct')

                self.binning_obj.event_total, self.binning_obj.non_event_total = self.get_histogram(
                    data_instances)
                # Category columns are excluded from optimal binning ...
                optimal_binning_cols = {
                    x: y
                    for x, y in result_counts.items()
                    if x not in category_names
                }
                # NOTE(review): accesses the private attribute _partitions; a
                # sibling version of this method uses the public .partitions —
                # verify which one the table type exposes.
                host_binning_obj = self.optimal_binning_sync(
                    optimal_binning_cols, data_instances.count(),
                    data_instances._partitions, host_idx, host_model_params)
                # ... and get their IV/WOE computed directly instead.
                category_bins = {
                    x: y
                    for x, y in result_counts.items() if x in category_names
                }
                host_binning_obj.cal_iv_woe(category_bins,
                                            self.model_param.adjustment_factor)
            else:
                host_binning_obj = BaseBinning()
                host_binning_obj.cal_iv_woe(result_counts,
                                            self.model_param.adjustment_factor)
            host_binning_obj.set_role_party(role=consts.HOST,
                                            party_id=host_party_id)
            self.host_results.append(host_binning_obj)

        self.set_schema(data_instances)
        self.transform(data_instances)
        LOGGER.info("Finish feature binning fit and transform")
        return self.data_output