Example #1
    def test_quantile_binning(self):
        error = 0.01
        # Compression threshold of the quantile summary; the expression simplifies to int(1 / error).
        compress_thres = int(self.data_num / (self.data_num * error))

        head_size = 5000
        bin_num = 10
        # Interior percentiles (10, 20, ..., 90), expressed as percentages for np.percentile.
        bin_percent = [int(i * (100.0 / bin_num)) for i in range(1, bin_num)]

        bin_param = FeatureBinningParam(method='quantile',
                                        compress_thres=compress_thres,
                                        head_size=head_size,
                                        error=error,
                                        cols=self.cols,
                                        bin_num=bin_num)
        quan_bin = QuantileBinning(bin_param)
        t0 = time.time()
        split_points = quan_bin.fit_split_points(self.table)
        t1 = time.time()
        print('Spend time: {}'.format(t1 - t0))

        # Collect the table locally and time an exact numpy percentile for comparison.
        local_table = self.table.collect()
        total_data = []
        for _, data_inst in local_table:
            total_data.append(data_inst.features)
        total_data = np.array(total_data)
        for col in self.cols:
            col_idx = self.col_dict.get(col)
            x = total_data[:, col_idx]
            # The result is only computed for timing; it is not asserted against here.
            sk = np.percentile(x, bin_percent, interpolation="midpoint")
        t2 = time.time()
        print('collect and use numpy time: {}'.format(t2 - t1))
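The snippet above only times the exact numpy percentile against the distributed summary; the accuracy contract itself is that every approximate split point falls within ± error · data_num ranks of the exact quantile (checked in Example #6 below). A minimal, self-contained sketch of that rank-window check in plain numpy; `within_error_rank` and the random data are illustrative, not part of the FATE API:

import numpy as np

def within_error_rank(x, split_value, percent, error=0.01):
    # An epsilon-approximate split point should land within +/- error * n ranks
    # of the exact quantile position in the sorted data.
    x = np.sort(np.asarray(x))
    n = len(x)
    min_rank = max(int(np.floor(percent * n - n * error)), 0)
    max_rank = min(int(np.ceil(percent * n + n * error)), n - 1)
    return x[min_rank] <= split_value <= x[max_rank]

# The exact 30th percentile of the data trivially satisfies the bound.
data = np.random.RandomState(0).rand(10000)
assert within_error_rank(data, np.percentile(data, 30), percent=0.3)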
Example #2
    def _get_quantile_median(self):
        # With bin_num=2 each column has a single split point, i.e. its approximate median.
        bin_param = FeatureBinningParam(bin_num=2, cols=self.cols)
        binning_obj = QuantileBinning(bin_param)
        split_points = binning_obj.fit_split_points(self.data_instances)
        medians = {}
        for col_name, split_point in split_points.items():
            medians[col_name] = split_point[0]
        return medians
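For intuition (not the FATE API): on an in-memory feature matrix, the single split point that `bin_num=2` produces per column corresponds to that column's 50th percentile. A minimal numpy sketch, with `feature_matrix` and `col_names` as illustrative placeholders:

import numpy as np

# Illustrative stand-in for the FATE data table: a plain 2-D feature matrix.
feature_matrix = np.random.RandomState(0).rand(1000, 3)
col_names = ["x0", "x1", "x2"]

# The 50th percentile of each column plays the role of the single split point
# returned when bin_num=2.
medians = {name: float(np.percentile(feature_matrix[:, i], 50))
           for i, name in enumerate(col_names)}
print(medians)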
Example #3
    def init_previous_model(self, **models):
        if 'binning_model' in models:
            binning_model_params = models.get('binning_model')
            binning_param = FeatureBinningParam()
            # Guest and host parties use their respective binning classes.
            if self.party_name == consts.GUEST:
                binning_obj = HeteroFeatureBinningGuest(binning_param)
            else:
                binning_obj = HeteroFeatureBinningHost(binning_param)

            # Reload a previously trained binning model by its stored name and namespace.
            name = binning_model_params.get('name')
            namespace = binning_model_params.get('namespace')

            binning_obj.load_model(name, namespace)
            self.binning_model = binning_obj
Example #4
    def filter(self, data_instances, bin_param=None):
        if bin_param is None:  # Use default setting
            bin_param = FeatureBinningParam()

        bin_obj = QuantileBinning(bin_param)
        query_result = bin_obj.query_quantile_point(data_instances,
                                                    self.select_cols,
                                                    self.percentile)
        # Keep only the columns whose queried quantile value stays below the threshold.
        left_cols = []
        for idx, q_r in enumerate(query_result):
            if q_r < self.upper_threshold:
                left_cols.append(self.select_cols[idx])

        # Guarantee that at least one feature survives the filter.
        left_cols = self._keep_one_feature(self.select_cols, left_cols)
        self.left_cols = left_cols
        return left_cols
Example #5
    def fit(self, data_instances, bin_param=None):
        if bin_param is None:  # Use default setting
            bin_param = FeatureBinningParam()

        bin_obj = QuantileBinning(bin_param)
        query_result = bin_obj.query_quantile_point(data_instances, self.cols,
                                                    self.percentile)
        # Mark each column as kept or dropped depending on its quantile value.
        for col_name, feature_value in query_result.items():
            self.feature_values[col_name] = feature_value
            self.left_cols[col_name] = feature_value < self.upper_threshold

        # Guarantee that at least one feature survives the filter.
        self.left_cols = self._keep_one_feature()
        return self.left_cols
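The keep/drop decision above can be sketched outside FATE on a plain mapping from column name to its queried percentile value (all names and values below are illustrative):

# Illustrative: decide which columns survive a percentile-based threshold filter.
percentile_values = {"x0": 0.12, "x1": 0.97, "x2": 0.45}   # hypothetical query results
upper_threshold = 0.9
left_cols = {name: value < upper_threshold for name, value in percentile_values.items()}
print(left_cols)   # {'x0': True, 'x1': False, 'x2': True}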
Example #6
    def test_quantile_binning(self):
        compress_thres = 10000
        head_size = 5000
        error = 0.01
        bin_num = 10
        bin_param = FeatureBinningParam(method='quantile',
                                        compress_thres=compress_thres,
                                        head_size=head_size,
                                        error=error,
                                        cols=self.cols,
                                        bin_num=bin_num)
        quan_bin = QuantileBinning(bin_param)
        split_points = quan_bin.fit_split_points(self.table)
        for col_idx, col in enumerate(self.cols):
            bin_percent = [i * (1.0 / bin_num) for i in range(1, bin_num)]
            feature_idx = self.col_dict.get(col)
            x = self.numpy_table[:, feature_idx]
            x = sorted(x)
            for bin_idx, percent in enumerate(bin_percent):
                # Each approximate split point must lie within +/- error * data_num
                # ranks of the exact quantile in the sorted column.
                min_rank = int(
                    math.floor(percent * self.data_num -
                               self.data_num * error))
                max_rank = int(
                    math.ceil(percent * self.data_num + self.data_num * error))
                min_rank = max(min_rank, 0)
                max_rank = min(max_rank, len(x) - 1)
                split_value = split_points[col_idx][bin_idx]
                if not x[min_rank] <= split_value <= x[max_rank]:
                    # Print diagnostics before the assertion fails so the offending
                    # rank window is visible in the test output.
                    print(x[min_rank], x[max_rank], split_value)
                    found_index = x.index(split_value)
                    print("min_rank: {}, found_rank: {}, max_rank: {}".format(
                        min_rank, found_index, max_rank))
                self.assertTrue(x[min_rank] <= split_value <= x[max_rank])
Example #7
    def _get_quantile_median(self, cols):
        # With bin_num=2 each column has a single split point, i.e. its approximate median.
        bin_param = FeatureBinningParam(bin_num=2)
        binning_obj = QuantileBinning(bin_param)
        split_points = binning_obj.binning(self.data_instances, cols)
        medians = [x[0] for x in split_points]
        return medians
Example #8
    def test_bucket_binning(self):
        # Bucket binning produces equal-width split points for each selected column.
        bin_param = FeatureBinningParam(bin_num=self.bin_num, cols=self.cols)
        bucket_bin = BucketBinning(bin_param)
        split_points = bucket_bin.fit_split_points(self.table)
        print(split_points)
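For comparison with the quantile examples above, bucket binning cuts each column into equal-width intervals between its minimum and maximum, so the interior split points can be reproduced with `numpy.linspace`. A minimal sketch on a single in-memory column (the array `x` is illustrative, not a FATE table):

import numpy as np

# Equal-width binning: bin_num - 1 interior cut points evenly spaced between
# the column minimum and maximum.
x = np.random.RandomState(0).rand(1000)
bin_num = 10
split_points = np.linspace(x.min(), x.max(), bin_num + 1)[1:-1]
print(split_points)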