Example #1
    def _load_model(self, model_dict):
        model_param = list(
            model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
        model_meta = list(
            model_dict.get('model').values())[0].get(MODEL_META_NAME)
        self.cols = list(map(int, model_meta.cols))
        bin_method = str(model_meta.method)
        if bin_method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(model_meta, self.party_name)
        else:
            self.binning_obj = BucketBinning(model_meta, self.party_name)

        binning_result_obj = dict(model_param.binning_result.binning_result)
        host_params = dict(model_param.host_results)

        self.binning_result = {}
        self.host_results = {}
        for col_name, iv_attr_obj in binning_result_obj.items():
            iv_attr = IVAttributes([], [], [], [], [], [])
            iv_attr.reconstruct(iv_attr_obj)
            self.binning_obj.reconstruct_by_iv_obj(col_name, iv_attr)
            self.binning_result[col_name] = iv_attr

        for host_name, host_result_obj in host_params.items():
            host_result_obj = dict(host_result_obj.binning_result)
            for col_name, iv_attr_obj in host_result_obj.items():
                iv_attr = IVAttributes([], [], [], [], [], [])
                iv_attr.reconstruct(iv_attr_obj)
                host_result_obj[col_name] = iv_attr
            self.host_results[host_name] = host_result_obj
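The nested lookups at the top of _load_model assume a particular model_dict shape. A minimal sketch of that shape, with a hypothetical component key and placeholder payloads (the real values are protobuf messages):

MODEL_PARAM_NAME = "FeatureBinningParam"  # stand-ins for FATE's constants
MODEL_META_NAME = "FeatureBinningMeta"

model_dict = {
    'model': {
        'hetero_feature_binning_0': {   # hypothetical component key
            MODEL_PARAM_NAME: 'param_pb_placeholder',
            MODEL_META_NAME: 'meta_pb_placeholder',
        }
    }
}

# Same access pattern as above: take the single component entry's dict.
inner = list(model_dict.get('model').values())[0]
assert inner.get(MODEL_PARAM_NAME) == 'param_pb_placeholder'
assert inner.get(MODEL_META_NAME) == 'meta_pb_placeholder'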
Example #2
    def _init_model(self, params: FeatureBinningParam):
        self.model_param = params

        self.transform_type = self.model_param.transform_param.transform_type

        if self.role == consts.HOST:
            if self.transform_type == "woe":
                raise ValueError(
                    "Host party does not support woe transform for now.")

        if self.model_param.method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(self.model_param)
        elif self.model_param.method == consts.BUCKET:
            self.binning_obj = BucketBinning(self.model_param)
        elif self.model_param.method == consts.OPTIMAL:
            if self.role == consts.HOST:
                self.model_param.bin_num = self.model_param.optimal_binning_param.init_bin_nums
                self.binning_obj = QuantileBinning(self.model_param)
            else:
                self.binning_obj = OptimalBinning(self.model_param)
        else:
            raise ValueError("Binning method: {} is not supported yet".format(
                self.model_param.method))

        self.iv_calculator = IvCalculator(
            self.model_param.adjustment_factor,
            role=self.role,
            party_id=self.component_properties.local_partyid)
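One subtlety above: when the method is consts.OPTIMAL, a host party falls back to quantile binning seeded with optimal_binning_param.init_bin_nums, presumably because hosts hold no labels and so cannot optimize bins locally. A condensed sketch of that dispatch, with plain strings standing in for the consts.* values (not FATE's actual API):

def pick_binning(method, role, bin_num, init_bin_nums):
    # String literals stand in for consts.QUANTILE, consts.HOST, etc.
    if method == "quantile":
        return ("QuantileBinning", bin_num)
    if method == "bucket":
        return ("BucketBinning", bin_num)
    if method == "optimal":
        if role == "host":
            # Host fallback: quantile binning with the initial bin count.
            return ("QuantileBinning", init_bin_nums)
        return ("OptimalBinning", bin_num)
    raise ValueError("Binning method: {} is not supported yet".format(method))

assert pick_binning("optimal", "host", 10, 100) == ("QuantileBinning", 100)
assert pick_binning("optimal", "guest", 10, 100) == ("OptimalBinning", 10)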
Example #3
    def _init_model(self, params: FeatureBinningParam):
        self.model_param = params

        self.transform_type = self.model_param.transform_param.transform_type

        if self.role == consts.HOST:
            if self.transform_type == "woe":
                raise ValueError(
                    "Host party does not support woe transform for now.")

        if self.model_param.method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(self.model_param)
        elif self.model_param.method == consts.BUCKET:
            self.binning_obj = BucketBinning(self.model_param)
        elif self.model_param.method == consts.OPTIMAL:
            if self.role == consts.HOST:
                self.model_param.bin_num = self.model_param.optimal_binning_param.init_bin_nums
                self.binning_obj = QuantileBinning(self.model_param)
            else:
                self.binning_obj = OptimalBinning(self.model_param)
        else:
            raise ValueError("Binning method: {} is not supported yet".format(
                self.model_param.method))
        LOGGER.debug("in _init_model, role: {}, local_partyid: {}".format(
            self.role, self.component_properties.local_partyid))
        self.binning_obj.set_role_party(
            self.role, self.component_properties.local_partyid)
Example #4
    def load_model(self, model_dict):
        model_param = list(model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
        model_meta = list(model_dict.get('model').values())[0].get(MODEL_META_NAME)

        self.bin_inner_param = BinInnerParam()

        assert isinstance(model_meta, feature_binning_meta_pb2.FeatureBinningMeta)
        assert isinstance(model_param, feature_binning_param_pb2.FeatureBinningParam)

        self.header = list(model_param.header)
        self.bin_inner_param.set_header(self.header)

        self.bin_inner_param.add_transform_bin_indexes(list(model_meta.transform_param.transform_cols))
        self.bin_inner_param.add_bin_names(list(model_meta.cols))
        self.transform_type = model_meta.transform_param.transform_type

        bin_method = str(model_meta.method)
        if bin_method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(params=model_meta)
        else:
            self.binning_obj = BucketBinning(params=model_meta)

        self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)
        self.binning_obj.set_bin_inner_param(self.bin_inner_param)
        self.binning_obj.bin_results.reconstruct(model_param.binning_result)

        self.host_results = []
        for host_pb in model_param.host_results:
            host_bin_obj = BaseBinning()
            host_bin_obj.bin_results.reconstruct(host_pb)
            self.host_results.append(host_bin_obj)
Example #5
    def _init_model(self, params):
        self.model_param = params
        self.cols_index = params.cols
        if self.model_param.method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(self.model_param,
                                               self.party_name)
        elif self.model_param.method == consts.BUCKET:
            self.binning_obj = BucketBinning(self.model_param, self.party_name)
        else:
            raise ValueError("Binning method: {} is not supported yet".format(
                self.model_param.method))
Example #6
    def test_bucket_binning(self):
        bin_param = FeatureBinningParam(bin_num=self.bin_num, cols=self.cols)
        bucket_bin = BucketBinning(bin_param)
        split_points = bucket_bin.fit_split_points(self.table)
        split_point = list(split_points.values())[0]
        for kth, s_p in enumerate(split_point):
            expect_s_p = (self.data_num - 1) / self.bin_num * (kth + 1)
            self.assertEqual(s_p, expect_s_p)
        iv_attrs = bucket_bin.cal_local_iv(self.table)
        for col_name, iv_attr in iv_attrs.items():
            print('col_name: {}, iv: {}, woe_array: {}'.format(
                col_name, iv_attr.iv, iv_attr.woe_array))
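The expectation in this test encodes how bucket (equal-width) binning places split points: for a feature ranging over 0..data_num-1, the k-th split point sits at (data_num - 1) / bin_num * (k + 1). A standalone check of that arithmetic (the uniform 0..data_num-1 data layout is an assumption read off the asserted formula, not taken from the test fixture):

data_num, bin_num = 1000, 10
values = range(data_num)                       # min 0, max data_num - 1
width = (max(values) - min(values)) / bin_num  # equal-width bucket size
split_points = [min(values) + width * (k + 1) for k in range(bin_num)]
for kth, s_p in enumerate(split_points):
    assert s_p == (data_num - 1) / bin_num * (kth + 1)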
Example #7
    def test_bucket_binning(self):
        bin_param = FeatureBinningParam(bin_num=self.bin_num,
                                        bin_indexes=self.cols)
        bucket_bin = BucketBinning(bin_param)
        split_points = bucket_bin.fit_split_points(self.table)
        split_point = list(split_points.values())[0]
        for kth, s_p in enumerate(split_point):
            expect_s_p = (self.data_num - 1) / self.bin_num * (kth + 1)
            self.assertEqual(s_p, expect_s_p)
        bucket_bin.cal_local_iv(self.table)
        for col_name, iv_attr in bucket_bin.bin_results.all_cols_results.items():
            assert abs(iv_attr.iv - 0.00364386529386804) < 1e-6
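The asserted IV value comes from the usual WOE/IV definitions: per bin, WOE = ln(event_rate / non_event_rate), and IV sums (event_rate - non_event_rate) * WOE over bins. A sketch under that convention (FATE's exact sign convention and its adjustment_factor smoothing for empty bins are omitted here):

import math

def woe_iv(event_counts, non_event_counts):
    event_total = sum(event_counts)
    non_event_total = sum(non_event_counts)
    woe_array, iv = [], 0.0
    for e, ne in zip(event_counts, non_event_counts):
        event_rate = e / event_total
        non_event_rate = ne / non_event_total
        woe = math.log(event_rate / non_event_rate)
        woe_array.append(woe)
        iv += (event_rate - non_event_rate) * woe
    return woe_array, iv

# A nearly label-independent feature yields a tiny IV, consistent in
# spirit with the ~0.0036 value asserted above.
_, iv = woe_iv([50, 52, 48, 50], [50, 48, 52, 50])
assert 0 < iv < 0.01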
Example #8
    def _init_model(self, params: FeatureBinningParam):
        self.model_param = params

        self.transform_type = self.model_param.transform_param.transform_type

        if self.model_param.method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(self.model_param)
        elif self.model_param.method == consts.BUCKET:
            self.binning_obj = BucketBinning(self.model_param)
        else:
            raise ValueError("Binning method: {} is not supported yet".format(self.model_param.method))
        LOGGER.debug("in _init_model, role: {}, local_partyid: {}".format(self.role, self.component_properties.local_partyid))
        self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)
Example #9
    def _init_binning_obj(self):
        if self.bin_param.method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(self.bin_param, self.party_name)
        elif self.bin_param.method == consts.BUCKET:
            self.binning_obj = BucketBinning(self.bin_param, self.party_name)
        else:
            raise ValueError("Binning method: {} is not supported yet".format(self.bin_param.method))
Example #10
    def init_bucket(self, data_instances):
        header = data_overview.get_header(data_instances)
        self._default_setting(header)

        init_bucket_param = copy.deepcopy(self.params)
        init_bucket_param.bin_num = self.optimal_param.init_bin_nums
        if self.optimal_param.init_bucket_method == consts.QUANTILE:
            init_binning_obj = QuantileBinningTool(param_obj=init_bucket_param,
                                                   allow_duplicate=False)
        else:
            init_binning_obj = BucketBinning(params=init_bucket_param)
        init_binning_obj.set_bin_inner_param(self.bin_inner_param)
        init_split_points = init_binning_obj.fit_split_points(data_instances)
        is_sparse = data_overview.is_sparse_data(data_instances)

        bucket_dict = dict()
        for col_name, sps in init_split_points.items():

            bucket_list = []
            for idx, sp in enumerate(sps):
                bucket = bucket_info.Bucket(idx,
                                            self.adjustment_factor,
                                            right_bound=sp)
                if idx == 0:
                    bucket.left_bound = -math.inf
                    bucket.set_left_neighbor(None)
                else:
                    bucket.left_bound = sps[idx - 1]
                bucket.event_total = self.event_total
                bucket.non_event_total = self.non_event_total
                bucket_list.append(bucket)
            bucket_list[-1].set_right_neighbor(None)
            bucket_dict[col_name] = bucket_list
            # LOGGER.debug(f"col_name: {col_name}, length of sps: {len(sps)}, "
            #              f"length of list: {len(bucket_list)}")

        convert_func = functools.partial(
            self.convert_data_to_bucket,
            split_points=init_split_points,
            headers=self.header,
            bucket_dict=copy.deepcopy(bucket_dict),
            is_sparse=is_sparse,
            get_bin_num_func=self.get_bin_num)
        bucket_table = data_instances.mapReducePartitions(
            convert_func, self.merge_bucket_list)

        return bucket_table
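Each split point above becomes one bucket's right bound, with the previous split point as its left bound and -inf opening the first bucket. A plain-data sketch of that chain, with dicts standing in for bucket_info.Bucket:

import math

def build_buckets(split_points):
    buckets = []
    for idx, sp in enumerate(split_points):
        left = -math.inf if idx == 0 else split_points[idx - 1]
        buckets.append({'idx': idx, 'left_bound': left, 'right_bound': sp})
    return buckets

assert build_buckets([1.0, 2.5, 4.0]) == [
    {'idx': 0, 'left_bound': -math.inf, 'right_bound': 1.0},
    {'idx': 1, 'left_bound': 1.0, 'right_bound': 2.5},
    {'idx': 2, 'left_bound': 2.5, 'right_bound': 4.0},
]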
Example #11
class BaseFeatureBinning(ModelBase):
    """
    Perform binning across guest and host parties.

    """
    def __init__(self):
        super(BaseFeatureBinning, self).__init__()
        self.transfer_variable = HeteroFeatureBinningTransferVariable()
        self.binning_obj: BaseBinning = None
        self.header = None
        self.header_anonymous = None
        self.schema = None
        self.host_results = []
        self.transform_type = None

        self.model_param = FeatureBinningParam()
        self.bin_inner_param = BinInnerParam()

    def _init_model(self, params: FeatureBinningParam):
        self.model_param = params

        self.transform_type = self.model_param.transform_param.transform_type

        if self.model_param.method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(self.model_param)
        elif self.model_param.method == consts.BUCKET:
            self.binning_obj = BucketBinning(self.model_param)
        elif self.model_param.method == consts.OPTIMAL:
            if self.role == consts.HOST:
                self.model_param.bin_num = self.model_param.optimal_binning_param.init_bin_nums
                self.binning_obj = QuantileBinning(self.model_param)
            else:
                self.binning_obj = OptimalBinning(self.model_param)
        else:
            raise ValueError("Binning method: {} is not supported yet".format(
                self.model_param.method))
        LOGGER.debug("in _init_model, role: {}, local_partyid: {}".format(
            self.role, self.component_properties.local_partyid))
        self.binning_obj.set_role_party(
            self.role, self.component_properties.local_partyid)

    @staticmethod
    def data_format_transform(row):
        """
        Transform data into sparse format.
        """

        if type(row.features).__name__ != consts.SPARSE_VECTOR:
            feature_shape = row.features.shape[0]
            indices = []
            data = []

            for i in range(feature_shape):
                if np.isnan(row.features[i]):
                    indices.append(i)
                    data.append(NoneType())
                elif np.abs(row.features[i]) < consts.FLOAT_ZERO:
                    continue
                else:
                    indices.append(i)
                    data.append(row.features[i])

            new_row = copy.deepcopy(row)
            new_row.features = SparseVector(indices, data, feature_shape)
            return new_row
        else:
            sparse_vec = row.features.get_sparse_vector()
            replace_key = []
            for key in sparse_vec:
                if sparse_vec.get(key) == NoneType() or np.isnan(
                        sparse_vec.get(key)):
                    replace_key.append(key)

            if len(replace_key) == 0:
                return row
            else:
                new_row = copy.deepcopy(row)
                new_sparse_vec = new_row.features.get_sparse_vector()
                for key in replace_key:
                    new_sparse_vec[key] = NoneType()
                return new_row

    def _setup_bin_inner_param(self, data_instances, params):
        if self.schema is not None:
            return

        self.header = get_header(data_instances)
        LOGGER.debug("_setup_bin_inner_param, get header length: {}".format(
            len(self.header)))

        self.schema = data_instances.schema
        self.bin_inner_param.set_header(self.header)
        if params.bin_indexes == -1:
            self.bin_inner_param.set_bin_all()
        else:
            self.bin_inner_param.add_bin_indexes(params.bin_indexes)
            self.bin_inner_param.add_bin_names(params.bin_names)

        self.bin_inner_param.add_category_indexes(params.category_indexes)
        self.bin_inner_param.add_category_names(params.category_names)

        if params.transform_param.transform_cols == -1:
            self.bin_inner_param.set_transform_all()
        else:
            self.bin_inner_param.add_transform_bin_indexes(
                params.transform_param.transform_cols)
            self.bin_inner_param.add_transform_bin_names(
                params.transform_param.transform_names)
        self.binning_obj.set_bin_inner_param(self.bin_inner_param)

    @assert_io_num_rows_equal
    @assert_schema_consistent
    def transform(self, data_instances):
        self._setup_bin_inner_param(data_instances, self.model_param)
        data_instances = self.binning_obj.transform(data_instances,
                                                    self.transform_type)
        self.set_schema(data_instances)
        self.data_output = data_instances
        return data_instances

    def _get_meta(self):
        transform_param = feature_binning_meta_pb2.TransformMeta(
            transform_cols=self.bin_inner_param.transform_bin_indexes,
            transform_type=self.model_param.transform_param.transform_type)

        meta_protobuf_obj = feature_binning_meta_pb2.FeatureBinningMeta(
            method=self.model_param.method,
            compress_thres=self.model_param.compress_thres,
            head_size=self.model_param.head_size,
            error=self.model_param.error,
            bin_num=self.model_param.bin_num,
            cols=self.bin_inner_param.bin_names,
            adjustment_factor=self.model_param.adjustment_factor,
            local_only=self.model_param.local_only,
            need_run=self.need_run,
            transform_param=transform_param,
            skip_static=self.model_param.skip_static)
        return meta_protobuf_obj

    def _get_param(self):
        binning_result_obj = self.binning_obj.bin_results.generated_pb()
        host_results = [
            x.bin_results.generated_pb() for x in self.host_results
        ]
        result_obj = feature_binning_param_pb2. \
            FeatureBinningParam(binning_result=binning_result_obj,
                                host_results=host_results,
                                header=self.header,
                                header_anonymous=self.header_anonymous,
                                model_name=consts.BINNING_MODEL)

        return result_obj

    def load_model(self, model_dict):
        model_param = list(
            model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
        model_meta = list(
            model_dict.get('model').values())[0].get(MODEL_META_NAME)

        self.bin_inner_param = BinInnerParam()

        assert isinstance(model_meta,
                          feature_binning_meta_pb2.FeatureBinningMeta)
        assert isinstance(model_param,
                          feature_binning_param_pb2.FeatureBinningParam)

        self.header = list(model_param.header)
        self.bin_inner_param.set_header(self.header)

        self.bin_inner_param.add_transform_bin_indexes(
            list(model_meta.transform_param.transform_cols))
        self.bin_inner_param.add_bin_names(list(model_meta.cols))
        self.transform_type = model_meta.transform_param.transform_type

        bin_method = str(model_meta.method)
        if bin_method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(params=model_meta)
        else:
            self.binning_obj = BucketBinning(params=model_meta)

        self.binning_obj.set_role_party(
            self.role, self.component_properties.local_partyid)
        self.binning_obj.set_bin_inner_param(self.bin_inner_param)
        self.binning_obj.bin_results.reconstruct(model_param.binning_result)

        self.host_results = []
        for host_pb in model_param.host_results:
            host_bin_obj = BaseBinning()
            host_bin_obj.bin_results.reconstruct(host_pb)
            self.host_results.append(host_bin_obj)

    def export_model(self):
        if self.model_output is not None:
            return self.model_output

        meta_obj = self._get_meta()
        param_obj = self._get_param()
        result = {MODEL_META_NAME: meta_obj, MODEL_PARAM_NAME: param_obj}
        self.model_output = result
        return result

    def save_data(self):
        return self.data_output

    def set_schema(self, data_instance):
        self.schema['header'] = self.header
        data_instance.schema = self.schema
        LOGGER.debug(
            "After Binning, when setting schema, schema is : {}".format(
                data_instance.schema))

    def _abnormal_detection(self, data_instances):
        """
        Make sure input data_instances is valid.
        """
        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)
        self.check_schema_content(data_instances.schema)
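data_format_transform above converts dense rows by dropping near-zero entries and keeping NaNs as explicit missing-value entries. A self-contained sketch of that dense branch, where MISSING and FLOAT_ZERO stand in for FATE's NoneType() sentinel and consts.FLOAT_ZERO:

import math

MISSING = object()      # stand-in for the NoneType() missing sentinel
FLOAT_ZERO = 1e-8       # stand-in for consts.FLOAT_ZERO

def to_sparse(dense_features):
    indices, data = [], []
    for i, v in enumerate(dense_features):
        if math.isnan(v):
            indices.append(i)
            data.append(MISSING)   # keep NaN as an explicit missing entry
        elif abs(v) < FLOAT_ZERO:
            continue               # zeros stay implicit in sparse format
        else:
            indices.append(i)
            data.append(v)
    return indices, data

indices, data = to_sparse([0.0, 3.5, float('nan'), 0.0, -1.0])
assert indices == [1, 2, 4]
assert data[0] == 3.5 and data[1] is MISSING and data[2] == -1.0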
Example #12
class BaseHeteroFeatureBinning(ModelBase):
    """
    Perform binning across guest and host parties.

    Attributes
    ----------
    header : list
        record headers of input table

    has_synchronized : bool
        Record whether the encryption information has been synchronized or not.

    flowid : str
        Use in cross validation

    binning_result: dict
        Records the binning result of the guest party, in the format {'col_name': iv_attr, ... }

    host_results: dict
        Records host results. To support multiple hosts in future versions,
        the format is a dict of dicts, e.g.
        host_results = {'host1': {'x1': iv1, 'x2': iv2},
                        'host2': ...
                        }

    """
    def __init__(self):
        super(BaseHeteroFeatureBinning, self).__init__()
        self.transfer_variable = HeteroFeatureBinningTransferVariable()
        self.cols = None
        self.cols_dict = {}
        self.binning_obj = None
        self.header = []
        self.schema = {}
        self.has_synchronized = False
        self.flowid = ''
        self.binning_result = {}  # dict of iv_attr
        self.host_results = {}  # dict of host results
        self.party_name = 'Base'
        self.model_param = FeatureBinningParam()

    def _init_model(self, params):
        self.model_param = params
        self.cols_index = params.cols
        if self.model_param.method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(self.model_param,
                                               self.party_name)
        elif self.model_param.method == consts.BUCKET:
            self.binning_obj = BucketBinning(self.model_param, self.party_name)
        else:
            raise ValueError("Binning method: {} is not supported yet".format(
                self.model_param.method))

    def transform(self, data_instances):
        self._parse_cols(data_instances)
        transform_cols_idx = self.model_param.transform_param.transform_cols
        transform_type = self.model_param.transform_param.transform_type
        data_instances = self.binning_obj.transform(data_instances,
                                                    transform_cols_idx,
                                                    transform_type)

        self.set_schema(data_instances)
        self.data_output = data_instances

        return data_instances

    def _get_meta(self):
        col_list = [str(x) for x in self.cols]

        meta_protobuf_obj = feature_binning_meta_pb2.FeatureBinningMeta(
            method=self.model_param.method,
            compress_thres=self.model_param.compress_thres,
            head_size=self.model_param.head_size,
            error=self.model_param.error,
            bin_num=self.model_param.bin_num,
            cols=col_list,
            adjustment_factor=self.model_param.adjustment_factor,
            local_only=self.model_param.local_only,
            need_run=self.need_run)
        return meta_protobuf_obj

    def _get_param(self):

        binning_result = self.binning_result

        host_results = self.host_results

        iv_attrs = {}
        for col_name, iv_attr in binning_result.items():
            iv_result = iv_attr.result_dict()
            iv_object = feature_binning_param_pb2.IVParam(**iv_result)
            iv_attrs[col_name] = iv_object
        binning_result_obj = feature_binning_param_pb2.FeatureBinningResult(
            binning_result=iv_attrs)

        final_host_results = {}
        for host_id, this_host_results in host_results.items():
            host_result = {}
            for host_col_idx, iv_attr in this_host_results.items():
                iv_result = iv_attr.result_dict()
                iv_object = feature_binning_param_pb2.IVParam(**iv_result)
                host_result[str(host_col_idx)] = iv_object
            final_host_results[
                host_id] = feature_binning_param_pb2.FeatureBinningResult(
                    binning_result=host_result)

        result_obj = feature_binning_param_pb2.FeatureBinningParam(
            binning_result=binning_result_obj, host_results=final_host_results)
        return result_obj

    def _load_model(self, model_dict):
        model_param = list(
            model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
        model_meta = list(
            model_dict.get('model').values())[0].get(MODEL_META_NAME)
        self.cols = list(map(int, model_meta.cols))
        bin_method = str(model_meta.method)
        if bin_method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(model_meta, self.party_name)
        else:
            self.binning_obj = BucketBinning(model_meta, self.party_name)

        binning_result_obj = dict(model_param.binning_result.binning_result)
        host_params = dict(model_param.host_results)

        self.binning_result = {}
        self.host_results = {}
        for col_name, iv_attr_obj in binning_result_obj.items():
            iv_attr = IVAttributes([], [], [], [], [], [])
            iv_attr.reconstruct(iv_attr_obj)
            self.binning_obj.reconstruct_by_iv_obj(col_name, iv_attr)
            self.binning_result[col_name] = iv_attr

        for host_name, host_result_obj in host_params.items():
            host_result_obj = dict(host_result_obj.binning_result)
            for col_name, iv_attr_obj in host_result_obj.items():
                iv_attr = IVAttributes([], [], [], [], [], [])
                iv_attr.reconstruct(iv_attr_obj)
                host_result_obj[col_name] = iv_attr
            self.host_results[host_name] = host_result_obj
        # LOGGER.debug("In feature binning load model, self.binning_result: {}, cols: {}, host_results: {}".format(
        #     self.binning_result, self.cols, self.host_results
        # ))

    def export_model(self):
        if self.model_output is not None:
            return self.model_output

        meta_obj = self._get_meta()
        param_obj = self._get_param()
        result = {MODEL_META_NAME: meta_obj, MODEL_PARAM_NAME: param_obj}
        self.model_output = result
        return result

    def save_data(self):
        return self.data_output

    def _parse_cols(self, data_instances):
        if self.header is not None and len(self.header) != 0:
            return

        LOGGER.debug("Before Binning, schema is : {}".format(
            data_instances.schema))
        header = get_header(data_instances)
        self.schema = data_instances.schema
        self.header = header
        # LOGGER.debug("data_instance count: {}, header: {}".format(data_instances.count(), header))
        if self.cols_index == -1:
            if header is None:
                raise RuntimeError(
                    'Cannot get feature header, please check input data')
            self.cols = [i for i in range(len(header))]
        else:
            self.cols = self.cols_index

        self.cols_dict = {}
        for col in self.cols:
            col_name = header[col]
            self.cols_dict[col_name] = col

    def set_schema(self, data_instance):
        self.schema['header'] = self.header
        data_instance.schema = self.schema
        LOGGER.debug(
            "After Binning, when setting schema, schema is : {}".format(
                data_instance.schema))

    def _abnormal_detection(self, data_instances):
        """
        Make sure input data_instances is valid.
        """
        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)
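_get_param above serializes each IVAttributes into an IVParam protobuf by splatting result_dict() as keyword arguments. A minimal sketch of that pattern, with dataclasses standing in for both the attribute object and the protobuf message (field names here are illustrative only):

from dataclasses import dataclass, asdict

@dataclass
class IVAttrSketch:            # stands in for IVAttributes
    iv: float
    woe_array: list

    def result_dict(self):
        return asdict(self)

@dataclass
class IVParamSketch:           # stands in for feature_binning_param_pb2.IVParam
    iv: float
    woe_array: list

iv_attr = IVAttrSketch(iv=0.12, woe_array=[-0.3, 0.4])
iv_object = IVParamSketch(**iv_attr.result_dict())
assert iv_object.iv == 0.12 and iv_object.woe_array == [-0.3, 0.4]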
Example #13
class BaseFeatureBinning(ModelBase):
    """
    Perform binning across guest and host parties.

    """
    def __init__(self):
        super(BaseFeatureBinning, self).__init__()
        self.transfer_variable = HeteroFeatureBinningTransferVariable()
        self.binning_obj: BaseBinning = None
        self.header = None
        self.header_anonymous = None
        self.schema = None
        self.host_results = []
        self.transform_type = None

        self.model_param = FeatureBinningParam()
        self.bin_inner_param = BinInnerParam()
        self.bin_result = MultiClassBinResult(labels=[0, 1])
        self.has_missing_value = False
        self.labels = []

    def _init_model(self, params: FeatureBinningParam):
        self.model_param = params

        self.transform_type = self.model_param.transform_param.transform_type

        if self.role == consts.HOST:
            if self.transform_type == "woe":
                raise ValueError(
                    "Host party does not support woe transform for now.")

        if self.model_param.method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(self.model_param)
        elif self.model_param.method == consts.BUCKET:
            self.binning_obj = BucketBinning(self.model_param)
        elif self.model_param.method == consts.OPTIMAL:
            if self.role == consts.HOST:
                self.model_param.bin_num = self.model_param.optimal_binning_param.init_bin_nums
                self.binning_obj = QuantileBinning(self.model_param)
            else:
                self.binning_obj = OptimalBinning(self.model_param)
        else:
            raise ValueError("Binning method: {} is not supported yet".format(
                self.model_param.method))

        self.iv_calculator = IvCalculator(
            self.model_param.adjustment_factor,
            role=self.role,
            party_id=self.component_properties.local_partyid)

    @staticmethod
    def data_format_transform(row):
        """
        Transform data into sparse format.
        """

        if type(row.features).__name__ != consts.SPARSE_VECTOR:
            feature_shape = row.features.shape[0]
            indices = []
            data = []

            for i in range(feature_shape):
                if np.isnan(row.features[i]):
                    indices.append(i)
                    data.append(NoneType())
                elif np.abs(row.features[i]) < consts.FLOAT_ZERO:
                    continue
                else:
                    indices.append(i)
                    data.append(row.features[i])

            new_row = copy.deepcopy(row)
            new_row.features = SparseVector(indices, data, feature_shape)
            return new_row
        else:
            sparse_vec = row.features.get_sparse_vector()
            replace_key = []
            for key in sparse_vec:
                if sparse_vec.get(key) == NoneType() or np.isnan(
                        sparse_vec.get(key)):
                    replace_key.append(key)

            if len(replace_key) == 0:
                return row
            else:
                new_row = copy.deepcopy(row)
                new_sparse_vec = new_row.features.get_sparse_vector()
                for key in replace_key:
                    new_sparse_vec[key] = NoneType()
                return new_row

    def _setup_bin_inner_param(self, data_instances, params):
        if self.schema is not None:
            return

        self.header = get_header(data_instances)
        LOGGER.debug("_setup_bin_inner_param, get header length: {}".format(
            len(self.header)))

        self.schema = data_instances.schema
        self.bin_inner_param.set_header(self.header)
        if params.bin_indexes == -1:
            self.bin_inner_param.set_bin_all()
        else:
            self.bin_inner_param.add_bin_indexes(params.bin_indexes)
            self.bin_inner_param.add_bin_names(params.bin_names)

        self.bin_inner_param.add_category_indexes(params.category_indexes)
        self.bin_inner_param.add_category_names(params.category_names)

        if params.transform_param.transform_cols == -1:
            self.bin_inner_param.set_transform_all()
        else:
            self.bin_inner_param.add_transform_bin_indexes(
                params.transform_param.transform_cols)
            self.bin_inner_param.add_transform_bin_names(
                params.transform_param.transform_names)
        self.binning_obj.set_bin_inner_param(self.bin_inner_param)

    @assert_io_num_rows_equal
    @assert_schema_consistent
    def transform(self, data_instances):
        self._setup_bin_inner_param(data_instances, self.model_param)
        if self.transform_type != "woe":
            data_instances = self.binning_obj.transform(
                data_instances, self.transform_type)
        elif self.role == consts.HOST:
            raise ValueError(
                "Woe transform is not available for host parties.")
        else:
            data_instances = self.iv_calculator.woe_transformer(
                data_instances, self.bin_inner_param, self.bin_result)
        self.set_schema(data_instances)
        self.data_output = data_instances
        return data_instances

    def _get_meta(self):
        transform_param = feature_binning_meta_pb2.TransformMeta(
            transform_cols=self.bin_inner_param.transform_bin_indexes,
            transform_type=self.model_param.transform_param.transform_type)

        meta_protobuf_obj = feature_binning_meta_pb2.FeatureBinningMeta(
            method=self.model_param.method,
            compress_thres=self.model_param.compress_thres,
            head_size=self.model_param.head_size,
            error=self.model_param.error,
            bin_num=self.model_param.bin_num,
            cols=self.bin_inner_param.bin_names,
            adjustment_factor=self.model_param.adjustment_factor,
            local_only=self.model_param.local_only,
            need_run=self.need_run,
            transform_param=transform_param,
            skip_static=self.model_param.skip_static)
        return meta_protobuf_obj

    def _get_param(self):
        split_points_result = self.binning_obj.bin_results.split_results

        multi_class_result = self.bin_result.generated_pb_list(
            split_points_result)
        # LOGGER.debug(f"split_points_result: {split_points_result}")
        host_multi_class_result = []
        host_single_results = []
        for host_res in self.host_results:
            host_multi_class_result.extend(host_res.generated_pb_list())
            host_single_results.append(host_res.bin_results[0].generated_pb())

        has_host_result = len(host_multi_class_result) > 0
        multi_pb = feature_binning_param_pb2.MultiClassResult(
            results=multi_class_result,
            labels=[str(x) for x in self.labels],
            host_results=host_multi_class_result,
            host_party_ids=[
                str(x) for x in self.component_properties.host_party_idlist
            ],
            has_host_result=has_host_result)
        result_obj = feature_binning_param_pb2. \
            FeatureBinningParam(binning_result=multi_class_result[0],
                                host_results=host_single_results,
                                header=self.header,
                                header_anonymous=self.header_anonymous,
                                model_name=consts.BINNING_MODEL,
                                multi_class_result=multi_pb)
        return result_obj

    def load_model(self, model_dict):
        model_param = list(
            model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
        model_meta = list(
            model_dict.get('model').values())[0].get(MODEL_META_NAME)

        self.bin_inner_param = BinInnerParam()
        multi_class_result = model_param.multi_class_result
        self.labels = list(multi_class_result.labels)
        if self.labels:
            self.bin_result = MultiClassBinResult.reconstruct(
                list(multi_class_result.results), self.labels)

        assert isinstance(model_meta,
                          feature_binning_meta_pb2.FeatureBinningMeta)
        assert isinstance(model_param,
                          feature_binning_param_pb2.FeatureBinningParam)

        self.header = list(model_param.header)
        self.bin_inner_param.set_header(self.header)

        self.bin_inner_param.add_transform_bin_indexes(
            list(model_meta.transform_param.transform_cols))
        self.bin_inner_param.add_bin_names(list(model_meta.cols))
        self.transform_type = model_meta.transform_param.transform_type

        bin_method = str(model_meta.method)
        if bin_method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(params=model_meta)
        elif bin_method == consts.OPTIMAL:
            self.binning_obj = OptimalBinning(params=model_meta)
        else:
            self.binning_obj = BucketBinning(params=model_meta)

        self.binning_obj.set_bin_inner_param(self.bin_inner_param)

        split_results = dict(model_param.binning_result.binning_result)
        for col_name, sr_pb in split_results.items():
            split_points = list(sr_pb.split_points)
            self.binning_obj.bin_results.put_col_split_points(
                col_name, split_points)

        self.host_results = []
        host_pbs = list(model_param.multi_class_result.host_results)
        if len(host_pbs):
            if len(self.labels) == 2:
                for host_pb in host_pbs:
                    self.host_results.append(
                        MultiClassBinResult.reconstruct(host_pb, self.labels))
            else:
                assert len(host_pbs) % len(self.labels) == 0
                i = 0
                while i < len(host_pbs):
                    this_pbs = host_pbs[i:i + len(self.labels)]
                    self.host_results.append(
                        MultiClassBinResult.reconstruct(this_pbs, self.labels))
                    i += len(self.labels)

        if list(model_param.header_anonymous):
            self.header_anonymous = list(model_param.header_anonymous)

    def export_model(self):
        if self.model_output is not None:
            return self.model_output

        meta_obj = self._get_meta()
        param_obj = self._get_param()
        result = {MODEL_META_NAME: meta_obj, MODEL_PARAM_NAME: param_obj}
        self.model_output = result
        return result

    def save_data(self):
        return self.data_output

    def set_schema(self, data_instance):
        self.schema['header'] = self.header
        data_instance.schema = self.schema
        # LOGGER.debug("After Binning, when setting schema, schema is : {}".format(data_instance.schema))

    def _abnormal_detection(self, data_instances):
        """
        Make sure input data_instances is valid.
        """
        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)
        self.check_schema_content(data_instances.schema)
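When transform_type is "woe", the guest replaces each feature value with the WOE of the bin it falls into (hosts cannot, since WOE derives from labels, as the raise in transform above enforces). A sketch of that per-value mapping; the bisect-based lookup is illustrative, not FATE's internal implementation:

import bisect

def woe_transform_value(value, split_points, woe_array):
    # split_points are bin upper bounds; find the first bound >= value.
    bin_idx = bisect.bisect_left(split_points, value)
    return woe_array[min(bin_idx, len(woe_array) - 1)]

split_points = [1.0, 2.5, 4.0]      # three bins
woe_array = [-0.2, 0.05, 0.4]
assert woe_transform_value(0.3, split_points, woe_array) == -0.2
assert woe_transform_value(3.0, split_points, woe_array) == 0.4
assert woe_transform_value(9.9, split_points, woe_array) == 0.4  # clamps to last bin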
Example #14
    def load_model(self, model_dict):
        model_param = list(
            model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
        model_meta = list(
            model_dict.get('model').values())[0].get(MODEL_META_NAME)

        self.bin_inner_param = BinInnerParam()
        multi_class_result = model_param.multi_class_result
        self.labels = list(multi_class_result.labels)
        if self.labels:
            self.bin_result = MultiClassBinResult.reconstruct(
                list(multi_class_result.results), self.labels)

        assert isinstance(model_meta,
                          feature_binning_meta_pb2.FeatureBinningMeta)
        assert isinstance(model_param,
                          feature_binning_param_pb2.FeatureBinningParam)

        self.header = list(model_param.header)
        self.bin_inner_param.set_header(self.header)

        self.bin_inner_param.add_transform_bin_indexes(
            list(model_meta.transform_param.transform_cols))
        self.bin_inner_param.add_bin_names(list(model_meta.cols))
        self.transform_type = model_meta.transform_param.transform_type

        bin_method = str(model_meta.method)
        if bin_method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(params=model_meta)
        elif bin_method == consts.OPTIMAL:
            self.binning_obj = OptimalBinning(params=model_meta)
        else:
            self.binning_obj = BucketBinning(params=model_meta)

        self.binning_obj.set_bin_inner_param(self.bin_inner_param)

        split_results = dict(model_param.binning_result.binning_result)
        for col_name, sr_pb in split_results.items():
            split_points = list(sr_pb.split_points)
            self.binning_obj.bin_results.put_col_split_points(
                col_name, split_points)

        self.host_results = []
        host_pbs = list(model_param.multi_class_result.host_results)
        if len(host_pbs):
            if len(self.labels) == 2:
                for host_pb in host_pbs:
                    self.host_results.append(
                        MultiClassBinResult.reconstruct(host_pb, self.labels))
            else:
                assert len(host_pbs) % len(self.labels) == 0
                i = 0
                while i < len(host_pbs):
                    this_pbs = host_pbs[i:i + len(self.labels)]
                    self.host_results.append(
                        MultiClassBinResult.reconstruct(this_pbs, self.labels))
                    i += len(self.labels)

        if list(model_param.header_anonymous):
            self.header_anonymous = list(model_param.header_anonymous)
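In the multi-label case above, host_pbs arrives flattened as one protobuf per (host, label) pair, and the while-loop regroups it into per-host chunks of len(labels). An equivalent slicing sketch (names are illustrative):

def chunk_host_pbs(host_pbs, n_labels):
    assert len(host_pbs) % n_labels == 0
    return [host_pbs[i:i + n_labels] for i in range(0, len(host_pbs), n_labels)]

flat = ['h1_l0', 'h1_l1', 'h1_l2', 'h2_l0', 'h2_l1', 'h2_l2']
assert chunk_host_pbs(flat, 3) == [
    ['h1_l0', 'h1_l1', 'h1_l2'],
    ['h2_l0', 'h2_l1', 'h2_l2'],
]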
Example #15
class BaseHeteroFeatureBinning(ModelBase):
    """
    Perform binning across guest and host parties.

    """

    def __init__(self):
        super(BaseHeteroFeatureBinning, self).__init__()
        self.transfer_variable = HeteroFeatureBinningTransferVariable()
        self.binning_obj = None
        self.header = None
        self.schema = None
        self.host_results = []
        self.transform_type = None

        self.model_param = FeatureBinningParam()
        self.bin_inner_param = BinInnerParam()

    def _init_model(self, params: FeatureBinningParam):
        self.model_param = params

        self.transform_type = self.model_param.transform_param.transform_type

        if self.model_param.method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(self.model_param)
        elif self.model_param.method == consts.BUCKET:
            self.binning_obj = BucketBinning(self.model_param)
        else:
            raise ValueError("Binning method: {} is not supported yet".format(self.model_param.method))
        LOGGER.debug("in _init_model, role: {}, local_partyid: {}".format(self.role, self.component_properties.local_partyid))
        self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)

    def _setup_bin_inner_param(self, data_instances, params: FeatureBinningParam):
        if self.schema is not None:
            return

        self.header = get_header(data_instances)
        self.schema = data_instances.schema
        self.bin_inner_param.set_header(self.header)
        if params.bin_indexes == -1:
            self.bin_inner_param.set_bin_all()
        else:
            self.bin_inner_param.add_bin_indexes(params.bin_indexes)
            self.bin_inner_param.add_bin_names(params.bin_names)

        self.bin_inner_param.add_category_indexes(params.category_indexes)
        self.bin_inner_param.add_category_names(params.category_names)

        if params.transform_param.transform_cols == -1:
            self.bin_inner_param.set_transform_all()
        else:
            self.bin_inner_param.add_transform_bin_indexes(params.transform_param.transform_cols)
            self.bin_inner_param.add_transform_bin_names(params.transform_param.transform_names)

        self.binning_obj.set_bin_inner_param(self.bin_inner_param)

    def transform(self, data_instances):
        self._setup_bin_inner_param(data_instances, self.model_param)
        data_instances = self.binning_obj.transform(data_instances, self.transform_type)
        self.set_schema(data_instances)
        self.data_output = data_instances
        return data_instances

    def _get_meta(self):
        transform_param = feature_binning_meta_pb2.TransformMeta(
            transform_cols=self.bin_inner_param.transform_bin_indexes,
            transform_type=self.model_param.transform_param.transform_type
        )

        meta_protobuf_obj = feature_binning_meta_pb2.FeatureBinningMeta(
            method=self.model_param.method,
            compress_thres=self.model_param.compress_thres,
            head_size=self.model_param.head_size,
            error=self.model_param.error,
            bin_num=self.model_param.bin_num,
            cols=self.bin_inner_param.bin_names,
            adjustment_factor=self.model_param.adjustment_factor,
            local_only=self.model_param.local_only,
            need_run=self.need_run,
            transform_param=transform_param
        )
        return meta_protobuf_obj

    def _get_param(self):
        binning_result_obj = self.binning_obj.bin_results.generated_pb()
        host_results = [x.bin_results.generated_pb() for x in self.host_results]

        result_obj = feature_binning_param_pb2.FeatureBinningParam(binning_result=binning_result_obj,
                                                                   host_results=host_results,
                                                                   header=self.header)
        return result_obj

    def load_model(self, model_dict):
        model_param = list(model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
        model_meta = list(model_dict.get('model').values())[0].get(MODEL_META_NAME)

        self.bin_inner_param = BinInnerParam()

        assert isinstance(model_meta, feature_binning_meta_pb2.FeatureBinningMeta)
        assert isinstance(model_param, feature_binning_param_pb2.FeatureBinningParam)

        self.header = list(model_param.header)
        self.bin_inner_param.set_header(self.header)

        self.bin_inner_param.add_transform_bin_indexes(list(model_meta.transform_param.transform_cols))
        self.bin_inner_param.add_bin_names(list(model_meta.cols))
        self.transform_type = model_meta.transform_param.transform_type

        bin_method = str(model_meta.method)
        if bin_method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(params=model_meta)
        else:
            self.binning_obj = BucketBinning(params=model_meta)

        self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)
        self.binning_obj.set_bin_inner_param(self.bin_inner_param)
        self.binning_obj.bin_results.reconstruct(model_param.binning_result)

        self.host_results = []
        for host_pb in model_param.host_results:
            host_bin_obj = HostBaseBinning()
            host_bin_obj.bin_results.reconstruct(host_pb)
            self.host_results.append(host_bin_obj)

    def export_model(self):
        if self.model_output is not None:
            return self.model_output

        meta_obj = self._get_meta()
        param_obj = self._get_param()
        result = {
            MODEL_META_NAME: meta_obj,
            MODEL_PARAM_NAME: param_obj
        }
        self.model_output = result
        return result

    def save_data(self):
        return self.data_output

    def set_schema(self, data_instance):
        self.schema['header'] = self.header
        data_instance.schema = self.schema
        LOGGER.debug("After Binning, when setting schema, schema is : {}".format(data_instance.schema))

    def _abnormal_detection(self, data_instances):
        """
        Make sure input data_instances is valid.
        """
        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)
Example #16
    def test_bucket_binning(self):
        bin_param = FeatureBinningParam(bin_num=self.bin_num, cols=self.cols)
        bucket_bin = BucketBinning(bin_param)
        split_points = bucket_bin.fit_split_points(self.table)
        print(split_points)
Example #17
    def init_bucket(self, data_instances):
        header = data_overview.get_header(data_instances)
        self._default_setting(header)

        init_bucket_param = copy.deepcopy(self.params)
        init_bucket_param.bin_num = self.optimal_param.init_bin_nums
        if self.optimal_param.init_bucket_method == consts.QUANTILE:
            init_binning_obj = QuantileBinningTool(param_obj=init_bucket_param, allow_duplicate=False)
        else:
            init_binning_obj = BucketBinning(params=init_bucket_param)
        init_binning_obj.set_bin_inner_param(self.bin_inner_param)
        init_split_points = init_binning_obj.fit_split_points(data_instances)
        is_sparse = data_overview.is_sparse_data(data_instances)

        bucket_dict = dict()
        for col_name, sps in init_split_points.items():

            bucket_list = []
            for idx, sp in enumerate(sps):
                bucket = bucket_info.Bucket(idx, self.adjustment_factor, right_bound=sp)
                if idx == 0:
                    bucket.left_bound = -math.inf
                    bucket.set_left_neighbor(None)
                else:
                    bucket.left_bound = sps[idx - 1]
                bucket.event_total = self.event_total
                bucket.non_event_total = self.non_event_total
                bucket_list.append(bucket)
            bucket_list[-1].set_right_neighbor(None)
            bucket_dict[col_name] = bucket_list
            LOGGER.debug(f"col_name: {col_name}, length of sps: {len(sps)}, "
                         f"length of list: {len(bucket_list)}")

        from fate_arch.common.versions import get_eggroll_version
        version = get_eggroll_version()
        if version.startswith('2.0'):
            convert_func = functools.partial(self.convert_data_to_bucket_old,
                                             split_points=init_split_points,
                                             headers=self.header,
                                             bucket_dict=copy.deepcopy(bucket_dict),
                                             is_sparse=is_sparse,
                                             get_bin_num_func=self.get_bin_num)
            summary_dict = data_instances.mapPartitions(convert_func, use_previous_behavior=False)
            from federatedml.util.reduce_by_key import reduce
            bucket_table = reduce(summary_dict, self.merge_bucket_list, key_func=lambda key: key[1])
        elif version.startswith('2.2'):
            convert_func = functools.partial(self.convert_data_to_bucket,
                                             split_points=init_split_points,
                                             headers=self.header,
                                             bucket_dict=copy.deepcopy(bucket_dict),
                                             is_sparse=is_sparse,
                                             get_bin_num_func=self.get_bin_num)
            bucket_table = data_instances.mapReducePartitions(convert_func, self.merge_bucket_list)
            bucket_table = dict(bucket_table.collect())
        else:
            raise RuntimeError(f"Cannot recognized eggroll version: {version}")

        for k, v in bucket_table.items():
            LOGGER.debug(f"[feature] {k}, length of list: {len(v)}")

        LOGGER.debug("bucket_table: {}, length: {}".format(type(bucket_table), len(bucket_table)))
        bucket_table = [(k, v) for k, v in bucket_table.items()]
        LOGGER.debug("bucket_table: {}, length: {}".format(type(bucket_table), len(bucket_table)))

        bucket_table = session.parallelize(bucket_table, include_key=True, partition=data_instances.partitions)

        return bucket_table
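Both branches above produce a per-feature bucket table by mapping partitions to (col_name, bucket_list) pairs and merging pairs that share a key. A local sketch of that map-then-merge-by-key contract (not eggroll's actual implementation):

def map_reduce_partitions(partitions, map_func, merge_func):
    merged = {}
    for partition in partitions:
        for key, value in map_func(partition):
            merged[key] = value if key not in merged else merge_func(merged[key], value)
    return merged

# Two partitions each emit per-column bucket lists; lists sharing a
# column key are merged, as merge_bucket_list is assumed to do above.
parts = [[('x1', [1]), ('x2', [2])], [('x1', [3])]]
out = map_reduce_partitions(parts, lambda p: p, lambda a, b: a + b)
assert out == {'x1': [1, 3], 'x2': [2]}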