Example 1
    def average_run(self, data_instances, bin_num=10, abnormal_list=None):
        if self.bin_param is None:
            bin_param = FeatureBinningParam(bin_num=bin_num)
            self.bin_param = bin_param
        else:
            bin_param = self.bin_param

        if self.bin_method == consts.QUANTILE:
            bin_obj = QuantileBinning(params=bin_param,
                                      abnormal_list=abnormal_list,
                                      allow_duplicate=True)
        else:
            raise ValueError(
                "Homo Split Point does not accept bin_method: {}".format(
                    self.bin_method))

        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)

        split_points = bin_obj.fit_split_points(data_instances)
        split_points = {k: np.array(v) for k, v in split_points.items()}
        split_points_weights = DictWeights(d=split_points)

        self.aggregator.send_model(split_points_weights, self.suffix)
        dict_split_points = self.aggregator.get_aggregated_model(self.suffix)
        split_points = {
            k: list(v)
            for k, v in dict_split_points.unboxed.items()
        }
        self.bin_obj = bin_obj
        return split_points
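
For context: average_run fits local quantile split points, ships them to a secure aggregator, and gets the cross-party mean back. Below is a minimal local sketch of the averaging step, using plain numpy and an ordinary mean in place of FATE's secure_mean_aggregator and DictWeights; the two-party setup is illustrative only, not the secure protocol.

import numpy as np

def local_split_points(x, bin_num=10):
    # per-feature quantile cut points, like fit_split_points for one column
    percents = [100.0 * i / bin_num for i in range(1, bin_num)]
    return np.percentile(x, percents)

# two "parties" holding different samples of the same feature
party_a = {"x0": local_split_points(np.random.RandomState(0).normal(size=1000))}
party_b = {"x0": local_split_points(np.random.RandomState(1).normal(size=1000))}

# a plain mean standing in for the send_model / get_aggregated_model round trip
averaged = {k: (party_a[k] + party_b[k]) / 2 for k in party_a}
print({k: list(v) for k, v in averaged.items()})
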
Example 2
    def _init_model(self, params: FeatureBinningParam):
        self.model_param = params

        self.transform_type = self.model_param.transform_param.transform_type

        if self.role == consts.HOST:
            if self.transform_type == "woe":
                raise ValueError(
                    "Host party do not support woe transform now.")

        if self.model_param.method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(self.model_param)
        elif self.model_param.method == consts.BUCKET:
            self.binning_obj = BucketBinning(self.model_param)
        elif self.model_param.method == consts.OPTIMAL:
            if self.role == consts.HOST:
                self.model_param.bin_num = self.model_param.optimal_binning_param.init_bin_nums
                self.binning_obj = QuantileBinning(self.model_param)
            else:
                self.binning_obj = OptimalBinning(self.model_param)
        else:
            # self.binning_obj = QuantileBinning(self.bin_param)
            raise ValueError("Binning method: {} is not supported yet".format(
                self.model_param.method))
        LOGGER.debug("in _init_model, role: {}, local_partyid: {}".format(
            self.role, self.component_properties))
        self.binning_obj.set_role_party(
            self.role, self.component_properties.local_partyid)
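
The if/elif chain above dispatches on the configured method; an equivalent table-driven sketch follows, with hypothetical stand-in classes and constants rather than the FATE API (it also omits the HOST-side OPTIMAL fallback shown above).

class QuantileBinning:   # minimal stand-ins for the FATE binning classes
    def __init__(self, params):
        self.params = params

class BucketBinning(QuantileBinning):
    pass

class OptimalBinning(QuantileBinning):
    pass

QUANTILE, BUCKET, OPTIMAL = "quantile", "bucket", "optimal"  # consts.* stand-ins

BINNING_CLASSES = {QUANTILE: QuantileBinning, BUCKET: BucketBinning, OPTIMAL: OptimalBinning}

def make_binning_obj(method, params):
    try:
        return BINNING_CLASSES[method](params)
    except KeyError:
        raise ValueError("Binning method: {} is not supported yet".format(method))

print(type(make_binning_obj(BUCKET, None)).__name__)  # BucketBinning
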
Example 3
    def test_quantile_binning(self):
        error = 0.01
        compress_thres = int(self.data_num / (self.data_num * error))  # simplifies to int(1 / error)

        head_size = 5000
        bin_num = 10
        bin_percent = [int(i * (100.0 / bin_num)) for i in range(1, bin_num)]

        bin_param = FeatureBinningParam(method='quantile',
                                        compress_thres=compress_thres,
                                        head_size=head_size,
                                        error=error,
                                        cols=self.cols,
                                        bin_num=bin_num)
        quan_bin = QuantileBinning(bin_param)
        t0 = time.time()
        split_points = quan_bin.fit_split_points(self.table)
        t1 = time.time()
        print('Spend time: {}'.format(t1 - t0))

        # collect and test numpy quantile speed
        local_table = self.table.collect()
        total_data = []
        for _, data_inst in local_table:
            total_data.append(data_inst.features)
        total_data = np.array(total_data)
        for col in self.cols:
            col_idx = self.col_dict.get(col)
            x = total_data[:, col_idx]
            # "interpolation" was renamed to "method" in numpy >= 1.22
            sk = np.percentile(x, bin_percent, interpolation="midpoint")
        t2 = time.time()
        print('collect and use numpy time: {}'.format(t2 - t1))
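
The numpy comparison in this test computes the same nine decile cut points directly. A standalone version follows; note that numpy >= 1.22 renames the interpolation= keyword to method=.

import numpy as np

x = np.random.RandomState(42).rand(10000)
bin_num = 10
bin_percent = [100.0 * i / bin_num for i in range(1, bin_num)]

# older numpy spells this: np.percentile(x, bin_percent, interpolation="midpoint")
deciles = np.percentile(x, bin_percent, method="midpoint")
print(deciles)
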
Example 4
    def _init_model(self, params: FeatureBinningParam):
        self.model_param = params

        self.transform_type = self.model_param.transform_param.transform_type

        if self.role == consts.HOST:
            if self.transform_type == "woe":
                raise ValueError(
                    "Host party do not support woe transform now.")

        if self.model_param.method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(self.model_param)
        elif self.model_param.method == consts.BUCKET:
            self.binning_obj = BucketBinning(self.model_param)
        elif self.model_param.method == consts.OPTIMAL:
            if self.role == consts.HOST:
                self.model_param.bin_num = self.model_param.optimal_binning_param.init_bin_nums
                self.binning_obj = QuantileBinning(self.model_param)
            else:
                self.binning_obj = OptimalBinning(self.model_param)
        else:
            raise ValueError("Binning method: {} is not supported yet".format(
                self.model_param.method))

        self.iv_calculator = IvCalculator(
            self.model_param.adjustment_factor,
            role=self.role,
            party_id=self.component_properties.local_partyid)
Example 5
 def convert_feature_to_bin(self, data_instance):
     LOGGER.info("convert feature to bins")
     param_obj = FeatureBinningParam(bin_num=self.bin_num)
     binning_obj = QuantileBinning(param_obj)
     binning_obj.fit_split_points(data_instance)
     self.data_bin, self.bin_split_points, self.bin_sparse_points = binning_obj.convert_feature_to_bin(
         data_instance)
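
convert_feature_to_bin maps each raw feature value to the index of its bin. Given split points, np.digitize performs the same mapping; this is an illustrative equivalent, not FATE's implementation.

import numpy as np

split_points = np.array([3.0, 7.0, 11.0, 15.0])   # e.g. from fit_split_points
features = np.array([0.5, 3.0, 8.2, 15.0, 99.0])

# right=True treats each split point as an inclusive upper bound,
# so values equal to a cut point land in the lower bin
bin_idx = np.digitize(features, split_points, right=True)
print(bin_idx)  # [0 0 2 3 4]
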
Example 6
 def test_new_sparse_quantile(self):
     param_obj = FeatureBinningParam(bin_num=4)
     binning_obj = QuantileBinning(param_obj)
     binning_obj.fit_split_points(self.sparse_table)
     data_bin, bin_splitpoints, bin_sparse = binning_obj.convert_feature_to_bin(self.sparse_table)
     bin_result = dict([(key, inst.features) for key, inst in data_bin.collect()])
     for i in range(20):
         self.assertTrue(len(self.sparse_inst[i][1].features.sparse_vec) == len(bin_result[i].sparse_vec))
Example 7
 def _get_quantile_median(self):
     bin_param = FeatureBinningParam(bin_num=2, cols=self.cols)
     binning_obj = QuantileBinning(bin_param)
     split_points = binning_obj.fit_split_points(self.data_instances)
     medians = {}
     for col_name, split_point in split_points.items():
         medians[col_name] = split_point[0]
     return medians
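
With bin_num=2 there is exactly one split point, the 50th percentile, which is why split_point[0] is taken as the median. The same relationship checked with numpy:

import numpy as np

x = np.array([1.0, 2.0, 5.0, 9.0, 10.0])
split_points = np.percentile(x, [50.0])  # bin_num=2 -> one cut point at the median
assert split_points[0] == np.median(x)
print(split_points[0])  # 5.0
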
Example 8
 def _get_quantile_median(self):
     cols_index = self._get_cols_index()
     bin_param = FeatureBinningParam(bin_num=2, cols=cols_index)
     binning_obj = QuantileBinning(bin_param, abnormal_list=self.abnormal_list)
     split_points = binning_obj.fit_split_points(self.data_instances)
     medians = {}
     for col_name, split_point in split_points.items():
         medians[col_name] = split_point[0]
     return medians
Example 9
    def _static_quantile_summaries(self):
        """
        Static summaries so that can query a specific quantile point
        """
        if self.binning_obj is not None:
            return self.binning_obj
        bin_param = FeatureBinningParam(bin_num=2, bin_indexes=self.cols_index,
                                        error=self.error)
        self.binning_obj = QuantileBinning(bin_param, abnormal_list=self.abnormal_list)
        self.binning_obj.fit_split_points(self.data_instances)

        return self.binning_obj
Example 10
    def fit(self, data_instances):
        if self.bin_obj is not None:
            return self

        if self.bin_param is None:
            self.bin_param = FeatureBinningParam()

        self.bin_obj = QuantileBinning(params=self.bin_param,
                                       abnormal_list=self.abnormal_list,
                                       allow_duplicate=True)
        self.bin_obj.fit_split_points(data_instances)
        return self
Example 11
    def fit(self, data_instances, bin_param=None):
        if bin_param is None:  # Use default setting
            bin_param = FeatureBinningParam()

        bin_obj = QuantileBinning(bin_param)
        query_result = bin_obj.query_quantile_point(data_instances, self.cols,
                                                    self.percentile)
        for col_name, feature_value in query_result.items():
            self.feature_values[col_name] = feature_value
            self.left_cols[col_name] = feature_value < self.upper_threshold

        self.left_cols = self._keep_one_feature()
        return self.left_cols
Example 12
    def _load_model(self, model_dict):
        model_param = list(
            model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
        # self._parse_need_run(model_dict, MODEL_META_NAME)
        model_meta = list(
            model_dict.get('model').values())[0].get(MODEL_META_NAME)
        # model_meta.cols = list(model_meta.cols)
        # model_meta.transform_param.transform_cols = list(model_meta.transform_param.transform_cols)
        self.cols = list(map(int, model_meta.cols))
        bin_method = str(model_meta.method)
        if bin_method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(model_meta, self.party_name)
        else:
            self.binning_obj = BucketBinning(model_meta, self.party_name)

        binning_result_obj = dict(model_param.binning_result.binning_result)
        host_params = dict(model_param.host_results)

        self.binning_result = {}
        self.host_results = {}
        for col_name, iv_attr_obj in binning_result_obj.items():
            iv_attr = IVAttributes([], [], [], [], [], [])
            iv_attr.reconstruct(iv_attr_obj)
            self.binning_obj.reconstruct_by_iv_obj(col_name, iv_attr)
            self.binning_result[col_name] = iv_attr
            # self.cols.append(col_name)

        for host_name, host_result_obj in host_params.items():
            host_result_obj = dict(host_result_obj.binning_result)
            for col_name, iv_attr_obj in host_result_obj.items():
                iv_attr = IVAttributes([], [], [], [], [], [])
                iv_attr.reconstruct(iv_attr_obj)
                host_result_obj[col_name] = iv_attr
            self.host_results[host_name] = host_result_obj
Example 13
    def load_model(self, model_dict):
        model_param = list(model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
        model_meta = list(model_dict.get('model').values())[0].get(MODEL_META_NAME)

        self.bin_inner_param = BinInnerParam()

        assert isinstance(model_meta, feature_binning_meta_pb2.FeatureBinningMeta)
        assert isinstance(model_param, feature_binning_param_pb2.FeatureBinningParam)

        self.header = list(model_param.header)
        self.bin_inner_param.set_header(self.header)

        self.bin_inner_param.add_transform_bin_indexes(list(model_meta.transform_param.transform_cols))
        self.bin_inner_param.add_bin_names(list(model_meta.cols))
        self.transform_type = model_meta.transform_param.transform_type

        bin_method = str(model_meta.method)
        if bin_method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(params=model_meta)
        else:
            self.binning_obj = BucketBinning(params=model_meta)

        self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)
        self.binning_obj.set_bin_inner_param(self.bin_inner_param)
        self.binning_obj.bin_results.reconstruct(model_param.binning_result)

        self.host_results = []
        for host_pb in model_param.host_results:
            host_bin_obj = BaseBinning()
            host_bin_obj.bin_results.reconstruct(host_pb)
            self.host_results.append(host_bin_obj)
Example 14
 def _init_binning_obj(self):
     if self.bin_param.method == consts.QUANTILE:
         self.binning_obj = QuantileBinning(self.bin_param, self.party_name)
     elif self.bin_param.method == consts.BUCKET:
         self.binning_obj = BucketBinning(self.bin_param, self.party_name)
     else:
         # self.binning_obj = QuantileBinning(self.bin_param)
         raise ValueError("Binning method: {} is not supported yet".format(self.bin_param.method))
Example 15
    def test_new_dense_quantile(self):
        param_obj = FeatureBinningParam(bin_num=4)
        binning_obj = QuantileBinning(param_obj)
        binning_obj.fit_split_points(self.dense_table)
        data_bin, bin_splitpoints, bin_sparse = binning_obj.convert_feature_to_bin(self.dense_table)
        bin_result = dict([(key, inst.features) for key, inst in data_bin.collect()])
        # print(bin_result)
        for i in range(100):
            self.assertTrue((bin_result[i] == np.ones(20, dtype='int') * ((i % 16) // 4)).all())
            if i < 20:
                # col_name = 'x' + str(i)
                col_idx = i
                split_point = np.array(bin_splitpoints[col_idx])
                self.assertTrue((split_point == np.asarray([3, 7, 11, 15], dtype='int')).all())

        for split_points in bin_splitpoints:
            self.assertTrue(len(split_points) <= 4)
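
The dense test's expectation can be reproduced outside FATE: 100 rows whose features all equal i % 16, quartile cut points [3, 7, 11, 15], and a digitize-based binning. This is a hedged reconstruction of what the assertion checks, not the library code.

import numpy as np

values = np.arange(100) % 16               # each row's (constant) feature value
split_points = np.array([3, 7, 11, 15])    # the 4-bin cut points the test expects
bins = np.digitize(values, split_points, right=True)
assert (bins == values // 4).all()
print(bins[:16])  # [0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3]
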
Example 16
    # bin_num (the default below) is defined outside this snippet in the original test module
    def _bin_obj_generator(self, abnormal_list: list = None, this_bin_num=bin_num):

        bin_param = FeatureBinningParam(method='quantile', compress_thres=consts.DEFAULT_COMPRESS_THRESHOLD,
                                        head_size=consts.DEFAULT_HEAD_SIZE,
                                        error=consts.DEFAULT_RELATIVE_ERROR,
                                        bin_indexes=-1,
                                        bin_num=this_bin_num)
        bin_obj = QuantileBinning(bin_param, abnormal_list=abnormal_list)
        return bin_obj
Example 17
    def _bin_obj_generator(self):
        # compress_thres, head_size, error and bin_num are defined outside
        # this snippet in the original test module
        bin_param = FeatureBinningParam(method='quantile',
                                        compress_thres=compress_thres,
                                        head_size=head_size,
                                        error=error,
                                        cols=-1,
                                        bin_num=bin_num)
        bin_obj = QuantileBinning(bin_param)
        return bin_obj
Example 18
    def test_quantile_binning(self):
        return  # test disabled; the body below is kept for reference

        compress_thres = 10000
        head_size = 5000
        error = 0.01
        bin_num = 10
        bin_param = FeatureBinningParam(method='quantile',
                                        compress_thres=compress_thres,
                                        head_size=head_size,
                                        error=error,
                                        cols=self.cols,
                                        bin_num=bin_num)
        quan_bin = QuantileBinning(bin_param)
        split_points = quan_bin.fit_split_points(self.table)
        for col_idx, col in enumerate(self.cols):
            bin_percent = [i * (1.0 / bin_num) for i in range(1, bin_num)]
            feature_idx = self.col_dict.get(col)
            x = self.numpy_table[:, feature_idx]
            x = sorted(x)
            for bin_idx, percent in enumerate(bin_percent):
                min_rank = int(
                    math.floor(percent * self.data_num -
                               self.data_num * error))
                max_rank = int(
                    math.ceil(percent * self.data_num + self.data_num * error))
                if min_rank < 0:
                    min_rank = 0
                if max_rank > len(x) - 1:
                    max_rank = len(x) - 1
                try:
                    self.assertTrue(
                        x[min_rank] <= split_points[col][bin_idx] <= x[max_rank])
                except AssertionError:
                    print(x[min_rank], x[max_rank],
                          split_points[col][bin_idx])
                    found_index = x.index(split_points[col][bin_idx])
                    print("min_rank: {}, found_rank: {}, max_rank: {}".format(
                        min_rank, found_index, max_rank))
                    raise
Example 19
    def convert_feature_to_bin(self, data_instance):
        LOGGER.info("convert feature to bins")
        param_obj = FeatureBinningParam(bin_num=self.bin_num)
        if self.use_missing:
            binning_obj = QuantileBinning(param_obj, abnormal_list=[NoneType()])
        else:
            binning_obj = QuantileBinning(param_obj)

        binning_obj.fit_split_points(data_instance)
        self.data_bin, self.bin_split_points, self.bin_sparse_points = binning_obj.convert_feature_to_bin(data_instance)
Example 20
 def _init_model(self, params):
     self.model_param = params
     self.cols_index = params.cols
     if self.model_param.method == consts.QUANTILE:
         self.binning_obj = QuantileBinning(self.model_param,
                                            self.party_name)
     elif self.model_param.method == consts.BUCKET:
         self.binning_obj = BucketBinning(self.model_param, self.party_name)
     else:
         # self.binning_obj = QuantileBinning(self.bin_param)
         raise ValueError("Binning method: {} is not supported yet".format(
             self.model_param.method))
Example 21
    def _init_model(self, params: FeatureBinningParam):
        self.model_param = params

        self.transform_type = self.model_param.transform_param.transform_type

        if self.model_param.method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(self.model_param)
        elif self.model_param.method == consts.BUCKET:
            self.binning_obj = BucketBinning(self.model_param)
        else:
            # self.binning_obj = QuantileBinning(self.bin_param)
            raise ValueError("Binning method: {} is not supported yet".format(self.model_param.method))
        LOGGER.debug("in _init_model, role: {}, local_partyid: {}".format(self.role, self.component_properties))
        self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)
Example 22
    def fit(self, expect_table, actual_table):

        LOGGER.info('start psi computing')

        header1 = expect_table.schema['header']
        header2 = actual_table.schema['header']

        if not set(header1) == set(header2):
            raise ValueError(
                'table header must be the same while computing psi values')

        # baseline table should not contain empty columns
        abnormal_detection.empty_column_detection(expect_table)

        self.all_feature_list = header1

        # make sure no duplicate features
        self.all_feature_list = self.check_duplicates(self.all_feature_list)

        # kv bi-directional mapping
        self.tag_id_mapping = {
            v: k
            for k, v in enumerate(self.all_feature_list)
        }
        self.id_tag_mapping = {
            k: v
            for k, v in enumerate(self.all_feature_list)
        }

        if not self.is_sparse(expect_table):
            # convert missing values (nan) to NoneType
            expect_table = self.convert_missing_val(expect_table)

        if not self.is_sparse(actual_table):
            # convert missing values (nan) to NoneType
            actual_table = self.convert_missing_val(actual_table)

        if not (self.check_table_content(expect_table)
                and self.check_table_content(actual_table)):
            raise ValueError(
                'contents of input table must be instances of class "Instance"'
            )

        param = FeatureBinningParam(method=consts.QUANTILE,
                                    bin_num=self.max_bin_num,
                                    local_only=True,
                                    error=self.binning_error)
        binning_obj = QuantileBinning(params=param,
                                      abnormal_list=[NoneType()],
                                      allow_duplicate=False)
        binning_obj.fit_split_points(expect_table)

        data_bin, bin_split_points, bin_sparse_points = binning_obj.convert_feature_to_bin(
            expect_table)
        LOGGER.debug('bin split points is {}, shape is {}'.format(
            bin_split_points, bin_split_points.shape))
        self.binning_obj = binning_obj

        self.data_bin1 = data_bin
        self.bin_split_points = bin_split_points
        self.bin_sparse_points = bin_sparse_points
        LOGGER.debug('expect table binning done')

        count_func1 = functools.partial(
            map_partition_handle,
            feat_num=len(self.all_feature_list),
            max_bin_num=self.max_bin_num + 1,  # an additional bin for missing values
            missing_val=self.dense_missing_val,
            is_sparse=self.is_sparse(self.data_bin1))

        map_rs1 = self.data_bin1.applyPartitions(count_func1)
        count1 = count_rs_to_dict(map_rs1.reduce(map_partition_reduce))

        data_bin2, bin_split_points2, bin_sparse_points2 = binning_obj.convert_feature_to_bin(
            actual_table)
        self.data_bin2 = data_bin2
        LOGGER.debug('actual table binning done')

        count_func2 = functools.partial(
            map_partition_handle,
            feat_num=len(self.all_feature_list),
            max_bin_num=self.max_bin_num + 1,  # an additional bin for missing values
            missing_val=self.dense_missing_val,
            is_sparse=self.is_sparse(self.data_bin2))

        map_rs2 = self.data_bin2.applyPartitions(count_func2)
        count2 = count_rs_to_dict(map_rs2.reduce(map_partition_reduce))

        self.count1, self.count2 = count1, count2

        LOGGER.info('psi counting done')

        # compute psi from counting result
        psi_result = psi_computer(count1, count2, expect_table.count(),
                                  actual_table.count())
        self.psi_rs = psi_result

        # get total psi score of features
        total_scores = {}
        for idx, rs in enumerate(self.psi_rs):
            feat_name = self.id_tag_mapping[idx]
            total_scores[feat_name] = rs['total_psi']
        self.total_scores = total_scores

        # id-feature mapping convert, str interval computation
        self.str_intervals = self.get_string_interval(
            bin_split_points,
            self.id_tag_mapping,
            missing_bin_idx=self.max_bin_num)

        self.interval_perc1 = self.count_dict_to_percentage(
            copy.deepcopy(count1), expect_table.count())
        self.interval_perc2 = self.count_dict_to_percentage(
            copy.deepcopy(count2), actual_table.count())

        self.set_summary(self.generate_summary())
        LOGGER.info('psi computation done')
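
For reference, the PSI of one feature sums (a_i - e_i) * ln(a_i / e_i) over bins, where e_i and a_i are the expected and actual bin proportions. The sketch below shows what psi_computer produces per feature under that standard definition; FATE's version also handles the extra missing-value bin.

import numpy as np

def psi(expect_counts, actual_counts, eps=1e-6):
    e = np.asarray(expect_counts, dtype=float)
    a = np.asarray(actual_counts, dtype=float)
    e = np.clip(e / e.sum(), eps, None)  # proportions, guarded against empty bins
    a = np.clip(a / a.sum(), eps, None)
    return float(np.sum((a - e) * np.log(a / e)))

print(psi([25, 25, 25, 25], [25, 25, 25, 25]))  # 0.0 for identical distributions
print(psi([25, 25, 25, 25], [40, 30, 20, 10]))  # > 0 once the distribution shifts
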
Example 23
class MultivariateStatisticalSummary(object):
    """

    """
    def __init__(self,
                 data_instances,
                 cols_index=-1,
                 abnormal_list=None,
                 error=consts.DEFAULT_RELATIVE_ERROR,
                 stat_order=2,
                 bias=True):
        self.finish_fit_statics = False  # set to True once summary statistics have been fitted
        # self.finish_fit_summaries = False   # Use for quantile data
        self.binning_obj: QuantileBinning = None
        self.summary_statistics = None
        self.header = None
        # self.quantile_summary_dict = {}
        self.cols_dict = {}
        # self.medians = None
        self.data_instances = data_instances
        self.cols_index = None
        if not isinstance(abnormal_list, list):
            abnormal_list = [abnormal_list]

        self.abnormal_list = abnormal_list
        self.__init_cols(data_instances, cols_index, stat_order, bias)
        self.label_summary = None
        self.error = error

    def __init_cols(self, data_instances, cols_index, stat_order, bias):
        header = data_overview.get_header(data_instances)
        self.header = header
        if cols_index == -1:
            self.cols_index = [i for i in range(len(header))]
        else:
            self.cols_index = cols_index
        LOGGER.debug(
            f"col_index: {cols_index}, self.col_index: {self.cols_index}")
        self.cols_dict = {
            header[indices]: indices
            for indices in self.cols_index
        }
        self.summary_statistics = SummaryStatistics(
            length=len(self.cols_index),
            abnormal_list=self.abnormal_list,
            stat_order=stat_order,
            bias=bias)

    def _static_sums(self):
        """
        Statics sum, sum_square, max_value, min_value,
        so that variance is available.
        """
        is_sparse = data_overview.is_sparse_data(self.data_instances)
        partition_cal = functools.partial(self.static_in_partition,
                                          cols_index=self.cols_index,
                                          summary_statistics=copy.deepcopy(
                                              self.summary_statistics),
                                          is_sparse=is_sparse)
        self.summary_statistics = self.data_instances.applyPartitions(partition_cal). \
            reduce(lambda x, y: self.copy_merge(x, y))
        # self.summary_statistics = summary_statistic_dict.reduce(self.aggregate_statics)
        self.finish_fit_statics = True

    def _static_quantile_summaries(self):
        """
        Static summaries so that can query a specific quantile point
        """
        if self.binning_obj is not None:
            return self.binning_obj
        bin_param = FeatureBinningParam(bin_num=2,
                                        bin_indexes=self.cols_index,
                                        error=self.error)
        self.binning_obj = QuantileBinning(bin_param,
                                           abnormal_list=self.abnormal_list)
        self.binning_obj.fit_split_points(self.data_instances)

        return self.binning_obj

    @staticmethod
    def copy_merge(s1, s2):
        new_s1 = copy.deepcopy(s1)
        return new_s1.merge(s2)

    @staticmethod
    def static_in_partition(data_instances, cols_index, summary_statistics,
                            is_sparse):
        """
        Statics sums, sum_square, max and min value through one traversal

        Parameters
        ----------
        data_instances : DTable
            The input data

        cols_index : indices
            Specify which column(s) need to apply statistic.

        summary_statistics: SummaryStatistics

        Returns
        -------
        Dict of SummaryStatistics object

        """

        for k, instances in data_instances:
            if not is_sparse:
                if isinstance(instances, Instance):
                    features = instances.features
                else:
                    features = instances
                    # try:
                    #     features = np.array(instances, dtype=float)
                    # except ValueError as e:
                    #     raise ValueError(f"Static Module accept numeric input only. Error info: {e}")
                # LOGGER.debug(f"In statics, features: {features}")
                row_values = [
                    x for idx, x in enumerate(features) if idx in cols_index
                ]
                # row_values = features[cols_index]
            else:
                sparse_data = instances.features.get_sparse_vector()
                row_values = np.array(
                    [sparse_data.get(x, 0) for x in cols_index])
            summary_statistics.add_rows(row_values)
        return summary_statistics

    @staticmethod
    def static_summaries_in_partition(data_instances, cols_dict, abnormal_list,
                                      error):
        """
        Statics sums, sum_square, max and min value through one traversal

        Parameters
        ----------
        data_instances : DTable
            The input data

        cols_dict : dict
            Specify which column(s) need to apply statistic.

        abnormal_list: list
            Specify which values are not permitted.

        Returns
        -------
        Dict of SummaryStatistics object

        """
        summary_dict = {}
        for col_name in cols_dict:
            summary_dict[col_name] = QuantileSummaries(
                abnormal_list=abnormal_list, error=error)

        for k, instances in data_instances:
            if isinstance(instances, Instance):
                features = instances.features
            else:
                features = instances

            for col_name, col_index in cols_dict.items():
                value = features[col_index]
                summary_obj = summary_dict[col_name]
                summary_obj.insert(value)

        return summary_dict

    @staticmethod
    def aggregate_statics(s_dict1, s_dict2):
        if s_dict1 is None and s_dict2 is None:
            return None
        if s_dict1 is None:
            return s_dict2
        if s_dict2 is None:
            return s_dict1

        new_dict = {}
        for col_name, static_1 in s_dict1.items():
            static_1.merge(s_dict2[col_name])
            new_dict[col_name] = static_1
        return new_dict

    def get_median(self):
        if self.binning_obj is None:
            self._static_quantile_summaries()

        medians = self.binning_obj.query_quantile_point(query_points=0.5)
        return medians

    @property
    def median(self):
        median_dict = self.get_median()
        return np.array(
            [median_dict[self.header[idx]] for idx in self.cols_index])

    def get_quantile_point(self, quantile):
        """
        Return the specific quantile point value

        Parameters
        ----------
        quantile : float, 0 <= quantile <= 1
            Specify which column(s) need to apply statistic.

        Returns
        -------
        return a dict of result quantile points.
        eg.
        quantile_point = {"x1": 3, "x2": 5... }
        """

        if self.binning_obj is None:
            self._static_quantile_summaries()
        quantile_points = self.binning_obj.query_quantile_point(quantile)
        return quantile_points

    def get_mean(self):
        """
        Return the mean value(s) of the given column

        Returns
        -------
        return a dict of result mean.

        """
        return self.get_statics("mean")

    def get_variance(self):
        return self.get_statics("variance")

    def get_std_variance(self):
        return self.get_statics("stddev")

    def get_max(self):
        return self.get_statics("max_value")

    def get_min(self):
        return self.get_statics("min_value")

    def get_statics(self, data_type):
        """
        Return the specific static value(s) of the given column

        Parameters
        ----------
        data_type : str, "mean", "variance", "std_variance", "max_value" or "mim_value"
            Specify which type to show.

        Returns
        -------
        return a list of result result. The order is the same as cols.
        """
        if not self.finish_fit_statics:
            self._static_sums()

        if hasattr(self.summary_statistics, data_type):
            result_row = getattr(self.summary_statistics, data_type)

        elif hasattr(self, data_type):
            result_row = getattr(self, data_type)
        else:
            raise ValueError(
                f"Statistic data type: {data_type} cannot be recognized")
        # LOGGER.debug(f"col_index: {self.cols_index}, result_row: {result_row},"
        #              f"header: {self.header}, data_type: {data_type}")

        result = {}

        result_row = result_row.tolist()
        for col_idx, header_idx in enumerate(self.cols_index):
            result[self.header[header_idx]] = result_row[col_idx]
        return result

    def get_missing_ratio(self):
        return self.get_statics("missing_ratio")

    @property
    def missing_ratio(self):
        missing_static_obj = MissingStatistic()
        all_missing_ratio = missing_static_obj.fit(self.data_instances)
        return np.array(
            [all_missing_ratio[self.header[idx]] for idx in self.cols_index])

    @property
    def missing_count(self):
        missing_ratio = self.missing_ratio
        missing_count = missing_ratio * self.data_instances.count()
        return missing_count.astype(int)

    @staticmethod
    def get_label_static_dict(data_instances):
        result_dict = {}
        for instance in data_instances:
            label_key = instance[1].label
            if label_key not in result_dict:
                result_dict[label_key] = 1
            else:
                result_dict[label_key] += 1
        return result_dict

    @staticmethod
    def merge_result_dict(dict_a, dict_b):
        for k, v in dict_b.items():
            if k in dict_a:
                dict_a[k] += v
            else:
                dict_a[k] = v
        return dict_a

    def get_label_histogram(self):
        label_histogram = self.data_instances.applyPartitions(
            self.get_label_static_dict).reduce(self.merge_result_dict)
        return label_histogram
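
_static_sums accumulates per-partition sums and merges them with copy_merge, which is enough to recover mean and variance in a single pass over the data. Here is a self-contained sketch of that merge pattern; the SummaryStatistics class below is a hypothetical minimal stand-in for FATE's class of the same name.

import numpy as np

class SummaryStatistics:
    """Hypothetical minimal stand-in: tracks count, sum and sum of squares."""

    def __init__(self, length):
        self.n = 0
        self.sum = np.zeros(length)
        self.sum_sq = np.zeros(length)

    def add_rows(self, row):
        self.n += 1
        self.sum += row
        self.sum_sq += np.square(row)

    def merge(self, other):
        self.n += other.n
        self.sum += other.sum
        self.sum_sq += other.sum_sq
        return self

    @property
    def mean(self):
        return self.sum / self.n

    @property
    def variance(self):
        # population variance recovered from the two accumulated moments
        return self.sum_sq / self.n - np.square(self.mean)

# two "partitions" merged the way reduce(copy_merge) does above
part1, part2 = SummaryStatistics(2), SummaryStatistics(2)
for row in np.array([[1.0, 2.0], [3.0, 4.0]]):
    part1.add_rows(row)
for row in np.array([[5.0, 6.0], [7.0, 8.0]]):
    part2.add_rows(row)
merged = part1.merge(part2)
print(merged.mean, merged.variance)  # [4. 5.] [5. 5.]
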
Example 24
    def load_model(self, model_dict):
        model_param = list(
            model_dict.get('model').values())[0].get(MODEL_PARAM_NAME)
        model_meta = list(
            model_dict.get('model').values())[0].get(MODEL_META_NAME)

        self.bin_inner_param = BinInnerParam()
        multi_class_result = model_param.multi_class_result
        self.labels = list(multi_class_result.labels)
        # if not self.labels:
        #     self.labels = [0, 1]
        if self.labels:
            self.bin_result = MultiClassBinResult.reconstruct(
                list(multi_class_result.results), self.labels)

        assert isinstance(model_meta,
                          feature_binning_meta_pb2.FeatureBinningMeta)
        assert isinstance(model_param,
                          feature_binning_param_pb2.FeatureBinningParam)

        self.header = list(model_param.header)
        self.bin_inner_param.set_header(self.header)

        self.bin_inner_param.add_transform_bin_indexes(
            list(model_meta.transform_param.transform_cols))
        self.bin_inner_param.add_bin_names(list(model_meta.cols))
        self.transform_type = model_meta.transform_param.transform_type

        bin_method = str(model_meta.method)
        if bin_method == consts.QUANTILE:
            self.binning_obj = QuantileBinning(params=model_meta)
        elif bin_method == consts.OPTIMAL:
            self.binning_obj = OptimalBinning(params=model_meta)
        else:
            self.binning_obj = BucketBinning(params=model_meta)

        # self.binning_obj.set_role_party(self.role, self.component_properties.local_partyid)
        self.binning_obj.set_bin_inner_param(self.bin_inner_param)

        split_results = dict(model_param.binning_result.binning_result)
        for col_name, sr_pb in split_results.items():
            split_points = list(sr_pb.split_points)
            self.binning_obj.bin_results.put_col_split_points(
                col_name, split_points)

        # self.binning_obj.bin_results.reconstruct(model_param.binning_result)

        self.host_results = []
        host_pbs = list(model_param.multi_class_result.host_results)
        if len(host_pbs):
            if len(self.labels) == 2:
                for host_pb in host_pbs:
                    self.host_results.append(
                        MultiClassBinResult.reconstruct(host_pb, self.labels))
            else:
                assert len(host_pbs) % len(self.labels) == 0
                i = 0
                while i < len(host_pbs):
                    this_pbs = host_pbs[i:i + len(self.labels)]
                    self.host_results.append(
                        MultiClassBinResult.reconstruct(this_pbs, self.labels))
                    i += len(self.labels)

        if list(model_param.header_anonymous):
            self.header_anonymous = list(model_param.header_anonymous)
Example 25
class HomoFeatureBinningClient(object):
    def __init__(self, bin_method=consts.QUANTILE):
        self.aggregator = secure_mean_aggregator.Client(
            enable_secure_aggregate=True)
        self.suffix = tuple()
        self.bin_method = bin_method
        self.bin_obj: QuantileBinning = None
        self.bin_param = None
        self.abnormal_list = None

    def set_suffix(self, suffix):
        self.suffix = suffix

    def average_run(self, data_instances, bin_num=10, abnormal_list=None):
        if self.bin_param is None:
            bin_param = FeatureBinningParam(bin_num=bin_num)
            self.bin_param = bin_param
        else:
            bin_param = self.bin_param

        if self.bin_method == consts.QUANTILE:
            bin_obj = QuantileBinning(params=bin_param,
                                      abnormal_list=abnormal_list,
                                      allow_duplicate=True)
        else:
            raise ValueError(
                "Homo Split Point does not accept bin_method: {}".format(
                    self.bin_method))

        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)

        split_points = bin_obj.fit_split_points(data_instances)
        split_points = {k: np.array(v) for k, v in split_points.items()}
        split_points_weights = DictWeights(d=split_points)

        self.aggregator.send_model(split_points_weights, self.suffix)
        dict_split_points = self.aggregator.get_aggregated_model(self.suffix)
        split_points = {
            k: list(v)
            for k, v in dict_split_points.unboxed.items()
        }
        self.bin_obj = bin_obj
        return split_points

    def convert_feature_to_bin(self, data_instances, split_points=None):
        if self.bin_obj is None:
            return None, None, None
        return self.bin_obj.convert_feature_to_bin(data_instances,
                                                   split_points)

    def set_bin_param(self, bin_param: FeatureBinningParam):
        if self.bin_param is not None:
            raise RuntimeError("Bin param has been set and it's immutable")
        self.bin_param = bin_param
        return self

    def set_abnormal_list(self, abnormal_list):
        self.abnormal_list = abnormal_list
        return self

    def fit(self, data_instances):
        if self.bin_obj is not None:
            return self

        if self.bin_param is None:
            self.bin_param = FeatureBinningParam()

        self.bin_obj = QuantileBinning(params=self.bin_param,
                                       abnormal_list=self.abnormal_list,
                                       allow_duplicate=True)
        self.bin_obj.fit_split_points(data_instances)
        return self

    def query_quantile_points(self, data_instances, quantile_points):
        if self.bin_obj is None:
            self.fit(data_instances)

        # bin_col_names = self.bin_obj.bin_inner_param.bin_names
        query_result = self.bin_obj.query_quantile_point(quantile_points)

        query_points = DictWeights(d=query_result)

        suffix = tuple(list(self.suffix) + [str(quantile_points)])
        self.aggregator.send_model(query_points, suffix)
        query_points = self.aggregator.get_aggregated_model(suffix)
        query_points = dict(query_points.unboxed)
        return query_points
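
query_quantile_points follows the same send/receive pattern as average_run: query the local sketch, wrap the result, and receive the cross-party mean. Below is a two-party illustration with np.quantile and an ordinary mean standing in for the secure aggregator; it shows the shape of the exchange, not the actual protocol.

import numpy as np

def query_local(x, q):
    # per-party quantile query, like bin_obj.query_quantile_point
    return {"x0": float(np.quantile(x, q))}

rng = np.random.RandomState(7)
party_results = [query_local(rng.normal(size=500), 0.9),
                 query_local(rng.normal(size=500), 0.9)]

# an ordinary mean standing in for send_model / get_aggregated_model
aggregated = {k: sum(r[k] for r in party_results) / len(party_results)
              for k in party_results[0]}
print(aggregated)
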