def traverse_tree(self, data_inst: Instance, tree: List[Node], use_missing=True, zero_as_missing=True):

        nid = 0  # root node id
        while True:

            if tree[nid].is_leaf:
                return tree[nid].weight

            cur_node = tree[nid]
            fid, bid = cur_node.fid, cur_node.bid
            missing_dir = cur_node.missing_dir

            if use_missing and zero_as_missing:

                if data_inst.features.get_data(fid) == NoneType() or data_inst.features.get_data(fid, None) is None:

                    nid = tree[nid].right_nodeid if missing_dir == 1 else tree[nid].left_nodeid

                elif data_inst.features.get_data(fid) <= bid:
                    nid = tree[nid].left_nodeid
                else:
                    nid = tree[nid].right_nodeid

            elif data_inst.features.get_data(fid) == NoneType():

                nid = tree[nid].right_nodeid if missing_dir == 1 else tree[nid].left_nodeid

            elif data_inst.features.get_data(fid, 0) <= bid:
                nid = tree[nid].left_nodeid
            else:
                nid = tree[nid].right_nodeid
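
A minimal, self-contained sketch of how this routing behaves. _Node, _Features, _Instance and the NoneType class below are hypothetical stand-ins for the FATE types, and _walk condenses the same decision rule, so the snippet runs on its own:

class NoneType:  # stand-in for FATE's missing-value sentinel
    def __eq__(self, other):
        return isinstance(other, NoneType)

class _Node:
    def __init__(self, is_leaf=False, weight=0.0, fid=None, bid=None,
                 missing_dir=1, left_nodeid=None, right_nodeid=None):
        self.is_leaf, self.weight = is_leaf, weight
        self.fid, self.bid, self.missing_dir = fid, bid, missing_dir
        self.left_nodeid, self.right_nodeid = left_nodeid, right_nodeid

class _Features:
    def __init__(self, data):  # data: {fid: bin id}
        self._data = data

    def get_data(self, fid, default=None):
        return self._data.get(fid, default)

class _Instance:
    def __init__(self, data):
        self.features = _Features(data)

def _walk(inst, tree):
    # condensed form of the loop above: missing -> follow missing_dir, else compare to bid
    nid = 0
    while not tree[nid].is_leaf:
        node = tree[nid]
        val = inst.features.get_data(node.fid, None)
        if val is None or val == NoneType():
            nid = node.right_nodeid if node.missing_dir == 1 else node.left_nodeid
        else:
            nid = node.left_nodeid if val <= node.bid else node.right_nodeid
    return tree[nid].weight

# root splits feature 0 at bin id 2; missing values go right (missing_dir=1)
tree = [_Node(fid=0, bid=2, left_nodeid=1, right_nodeid=2),
        _Node(is_leaf=True, weight=-0.5),
        _Node(is_leaf=True, weight=0.7)]
print(_walk(_Instance({0: 1}), tree))  # 1 <= 2 -> left leaf: -0.5
print(_walk(_Instance({}), tree))      # missing -> right leaf: 0.7
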
Example #2
    def host_local_traverse_tree(data_inst, tree_node, use_missing=True, zero_as_missing=True):

        nid = 0  # root node id
        while True:

            if tree_node[nid].is_leaf:
                return nid

            cur_node = tree_node[nid]
            fid, bid = cur_node.fid, cur_node.bid
            missing_dir = cur_node.missing_dir

            if use_missing and zero_as_missing:

                if data_inst.features.get_data(fid) == NoneType() or data_inst.features.get_data(fid, None) is None:

                    nid = tree_node[nid].right_nodeid if missing_dir == 1 else tree_node[nid].left_nodeid

                elif data_inst.features.get_data(fid) <= bid:
                    nid = tree_node[nid].left_nodeid
                else:
                    nid = tree_node[nid].right_nodeid

            elif data_inst.features.get_data(fid) == NoneType():

                nid = tree_node[nid].right_nodeid if missing_dir == 1 else tree_node[nid].left_nodeid

            elif data_inst.features.get_data(fid, 0) <= bid:
                nid = tree_node[nid].left_nodeid
            else:
                nid = tree_node[nid].right_nodeid
Example #3
    def data_format_transform(row):
        if type(row.features).__name__ != consts.SPARSE_VECTOR:
            feature_shape = row.features.shape[0]
            indices = []
            data = []

            for i in range(feature_shape):
                if np.isnan(row.features[i]):
                    indices.append(i)
                    data.append(NoneType())
                elif np.abs(row.features[i]) < consts.FLOAT_ZERO:
                    continue
                else:
                    indices.append(i)
                    data.append(row.features[i])

            row.features = SparseVector(indices, data, feature_shape)
        else:
            sparse_vec = row.features.get_sparse_vector()
            for key in sparse_vec:
                if sparse_vec.get(key) == NoneType() or np.isnan(
                        sparse_vec.get(key)):
                    sparse_vec[key] = NoneType()

            row.features.set_sparse_vector(sparse_vec)

        return row
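
To see what the dense branch produces, here is a small runnable sketch; the NoneType stub and the FLOAT_ZERO threshold are assumptions standing in for FATE's sentinel class and consts.FLOAT_ZERO:

import numpy as np

class NoneType:  # stand-in for FATE's missing-value sentinel
    def __repr__(self):
        return "NoneType()"

FLOAT_ZERO = 1e-8  # assumed value for consts.FLOAT_ZERO

features = np.array([0.0, np.nan, 3.5])
indices, data = [], []
for i, v in enumerate(features):
    if np.isnan(v):
        indices.append(i)
        data.append(NoneType())  # NaN becomes the missing sentinel
    elif abs(v) < FLOAT_ZERO:
        continue                 # zeros are simply dropped from the sparse vector
    else:
        indices.append(i)
        data.append(v)

print(indices, data)  # [1, 2] [NoneType(), 3.5]; SparseVector would keep shape 3
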
Example #4
    def assign_a_instance(row, tree: List[Node], bin_sparse_point, use_missing,
                          use_zero_as_missing):

        leaf_status, nodeid = row[1]
        node = tree[nodeid]
        if node.is_leaf:
            return node.weight

        fid = node.fid
        bid = node.bid

        missing_dir = node.missing_dir

        missing_val = False
        if use_zero_as_missing:
            if row[0].features.get_data(fid, None) is None or \
                    row[0].features.get_data(fid) == NoneType():
                missing_val = True
        elif use_missing and row[0].features.get_data(fid) == NoneType():
            missing_val = True

        if missing_val:
            if missing_dir == 1:
                return 1, tree[nodeid].right_nodeid
            else:
                return 1, tree[nodeid].left_nodeid
        else:
            if row[0].features.get_data(fid, bin_sparse_point[fid]) <= bid:
                return 1, tree[nodeid].left_nodeid
            else:
                return 1, tree[nodeid].right_nodeid
Example #5
    def make_decision(data_inst,
                      fid,
                      bid,
                      missing_dir,
                      use_missing,
                      zero_as_missing,
                      zero_val=0):

        left, right = True, False
        missing_dir = left if missing_dir == -1 else right

        # use missing and zero as missing
        if use_missing and zero_as_missing:
            # missing or zero
            if data_inst.features.get_data(fid) == NoneType() or \
                    data_inst.features.get_data(fid, None) is None:
                return missing_dir

        # is missing feat
        if data_inst.features.get_data(fid) == NoneType():
            return missing_dir

        # no missing val
        feat_val = data_inst.features.get_data(fid, zero_val)
        direction = left if feat_val <= bid + consts.FLOAT_ZERO else right
        return direction
Example #6
    def traverse_tree(predict_state,
                      data_inst,
                      tree_=None,
                      decoder=None,
                      sitename=consts.GUEST,
                      split_maskdict=None,
                      use_missing=None,
                      zero_as_missing=None,
                      missing_dir_maskdict=None):
        nid, tag = predict_state

        while tree_[nid].sitename == sitename:
            if tree_[nid].is_leaf is True:
                return tree_[nid].weight

            fid = decoder("feature_idx",
                          tree_[nid].fid,
                          split_maskdict=split_maskdict)
            bid = decoder("feature_val",
                          tree_[nid].bid,
                          nid,
                          split_maskdict=split_maskdict)
            if use_missing:
                missing_dir = decoder(
                    "missing_dir",
                    1,
                    nid,
                    missing_dir_maskdict=missing_dir_maskdict)
            else:
                missing_dir = 1

            if use_missing and zero_as_missing:
                missing_dir = decoder(
                    "missing_dir",
                    1,
                    nid,
                    missing_dir_maskdict=missing_dir_maskdict)
                if data_inst.features.get_data(fid) == NoneType() or \
                        data_inst.features.get_data(fid, None) is None:
                    if missing_dir == 1:
                        nid = tree_[nid].right_nodeid
                    else:
                        nid = tree_[nid].left_nodeid
                elif data_inst.features.get_data(fid) <= bid:
                    nid = tree_[nid].left_nodeid
                else:
                    nid = tree_[nid].right_nodeid
            elif data_inst.features.get_data(fid) == NoneType():
                if missing_dir == 1:
                    nid = tree_[nid].right_nodeid
                else:
                    nid = tree_[nid].left_nodeid
            elif data_inst.features.get_data(fid, 0) <= bid:
                nid = tree_[nid].left_nodeid
            else:
                nid = tree_[nid].right_nodeid

        return nid, 1
Example #7
    def host_assign_an_instance(value, tree_, bin_sparse_points, use_missing, zero_as_missing, dense_format=False):

        unleaf_state, nodeid = value[1]

        if tree_[nodeid].is_leaf is True:
            return nodeid

        fid = tree_[nodeid].fid
        bid = tree_[nodeid].bid

        if not dense_format:
            if not use_missing:
                if value[0].features.get_data(fid, bin_sparse_points[fid]) <= bid:
                    return 1, tree_[nodeid].left_nodeid
                else:
                    return 1, tree_[nodeid].right_nodeid
            else:
                missing_dir = tree_[nodeid].missing_dir
                missing_val = False
                if zero_as_missing:
                    if value[0].features.get_data(fid, None) is None or \
                            value[0].features.get_data(fid) == NoneType():
                        missing_val = True
                elif use_missing and value[0].features.get_data(fid) == NoneType():
                    missing_val = True
                if missing_val:
                    if missing_dir == 1:
                        return 1, tree_[nodeid].right_nodeid
                    else:
                        return 1, tree_[nodeid].left_nodeid
                else:
                    if value[0].features.get_data(fid, bin_sparse_points[fid]) <= bid:
                        return 1, tree_[nodeid].left_nodeid
                    else:
                        return 1, tree_[nodeid].right_nodeid
        else:
            # this branch is for fast histogram
            # will get scipy sparse matrix if using fast histogram
            if not use_missing:
                sample_feat = value[0].features[0, fid]  # value.features is a scipy sparse matrix
                return (1, tree_[nodeid].left_nodeid) if sample_feat <= bid else (1, tree_[nodeid].right_nodeid)
            else:
                missing_dir = tree_[nodeid].missing_dir
                sample_feat = value[0].features[0, fid]
                if zero_as_missing:  # zero_as_missing and use_missing, 0 and missing value are marked as -1
                    sample_feat -= 1  # remove offset
                if sample_feat == -1:
                    return (1, tree_[nodeid].right_nodeid) if missing_dir == 1 else (1, tree_[nodeid].left_nodeid)
                else:
                    return (1, tree_[nodeid].left_nodeid) if sample_feat <= bid else (1, tree_[nodeid].right_nodeid)
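
The dense fast-histogram branch relies on a +1 storage offset when zero_as_missing is on; a condensed, standalone version of that rule (with made-up values) makes the bookkeeping easier to check:

def route(stored_feat, bid, missing_dir, zero_as_missing=True):
    # condensed dense-branch rule from host_assign_an_instance above
    sample_feat = stored_feat
    if zero_as_missing:   # bin ids were stored with a +1 offset
        sample_feat -= 1  # absent entries (0) become the missing marker -1
    if sample_feat == -1:
        return "right" if missing_dir == 1 else "left"
    return "left" if sample_feat <= bid else "right"

print(route(0, bid=2, missing_dir=1))  # absent/missing -> right
print(route(3, bid=2, missing_dir=1))  # stored 3 -> bin 2 <= 2 -> left
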
Example #8
    def assign_a_instance(value1,
                          value2,
                          sitename=None,
                          decoder=None,
                          split_maskdict=None,
                          bin_sparse_points=None,
                          use_missing=False,
                          zero_as_missing=False,
                          missing_dir_maskdict=None):

        unleaf_state, fid, bid, node_sitename, nodeid, left_nodeid, right_nodeid = value1
        if node_sitename != sitename:
            return value1

        fid = decoder("feature_idx", fid, split_maskdict=split_maskdict)
        bid = decoder("feature_val",
                      bid,
                      nodeid,
                      split_maskdict=split_maskdict)
        if not use_missing:
            if value2.features.get_data(fid, bin_sparse_points[fid]) <= bid:
                return unleaf_state, left_nodeid
            else:
                return unleaf_state, right_nodeid
        else:
            missing_dir = decoder("missing_dir",
                                  1,
                                  nodeid,
                                  missing_dir_maskdict=missing_dir_maskdict)
            missing_val = False
            if zero_as_missing:
                if value2.features.get_data(fid, None) is None or \
                        value2.features.get_data(fid) == NoneType():
                    missing_val = True
            elif use_missing and value2.features.get_data(fid) == NoneType():
                missing_val = True

            if missing_val:
                if missing_dir == 1:
                    return unleaf_state, right_nodeid
                else:
                    return unleaf_state, left_nodeid
            else:
                if value2.features.get_data(fid,
                                            bin_sparse_points[fid]) <= bid:
                    return unleaf_state, left_nodeid
                else:
                    return unleaf_state, right_nodeid
Example #9
        def static_candidates_num(instances, select_cols, all_candidates):
            """
            Count occurrences of the candidate values.
            Parameters
            ----------
            instances: Data generator
                Original data

            select_cols: list
                Indicates columns that need to be operated.

            all_candidates: dict
                Each key is col_index and value is a list that contains mode candidates.
            """

            for _, instant in instances:
                for col_index in select_cols:
                    candidate_dict = all_candidates[col_index]
                    if is_sparse:  # is_sparse is captured from the enclosing scope
                        feature_value = instant.features.get_data(
                            col_index, NoneType())
                    else:
                        feature_value = instant.features[col_index]
                    if isinstance(feature_value, float):
                        feature_value = round(feature_value, 8)

                    if feature_value in candidate_dict:
                        candidate_dict[feature_value] += 1

            # mode_result = {}
            # for col_index, candidate_dict in all_candidates.items():
            #     feature_value, nums = sorted(candidate_dict.items(), key=operator.itemgetter(1), reverse=False)[0]
            #     mode_result[col_index] = (feature_value, nums)
            return all_candidates
Example #10
    def _fill_nan(inst):
        arr = copy.deepcopy(inst.features)
        nan_index = np.isnan(arr)
        arr = arr.astype(object)  # np.object was removed from numpy; plain object is equivalent
        arr[nan_index] = NoneType()
        inst.features = arr
        return inst
Example #11
    def assign_a_instance(value, tree_=None, decoder=None, sitename=consts.GUEST,
                      split_maskdict=None, bin_sparse_points=None,
                      use_missing=False, zero_as_missing=False,
                      missing_dir_maskdict=None):

        unleaf_state, nodeid = value[1]

        if tree_[nodeid].is_leaf is True:
            return tree_[nodeid].weight
        else:
            if tree_[nodeid].sitename == sitename:
                fid = decoder("feature_idx", tree_[nodeid].fid, split_maskdict=split_maskdict)
                bid = decoder("feature_val", tree_[nodeid].bid, nodeid, split_maskdict=split_maskdict)
                if not use_missing:
                    if value[0].features.get_data(fid, bin_sparse_points[fid]) <= bid:
                        return 1, tree_[nodeid].left_nodeid
                    else:
                        return 1, tree_[nodeid].right_nodeid
                else:
                    missing_dir = decoder("missing_dir", tree_[nodeid].missing_dir, nodeid,
                                          missing_dir_maskdict=missing_dir_maskdict)

                    missing_val = False
                    if zero_as_missing:
                        if value[0].features.get_data(fid, None) is None or \
                                value[0].features.get_data(fid) == NoneType():
                            missing_val = True
                    elif use_missing and value[0].features.get_data(fid) == NoneType():
                        missing_val = True

                    if missing_val:
                        if missing_dir == 1:
                            return 1, tree_[nodeid].right_nodeid
                        else:
                            return 1, tree_[nodeid].left_nodeid
                    else:
                        LOGGER.debug("fid is {}, bid is {}, sitename is {}".format(fid, bid, sitename))
                        if value[0].features.get_data(fid, bin_sparse_points[fid]) <= bid:
                            return 1, tree_[nodeid].left_nodeid
                        else:
                            return 1, tree_[nodeid].right_nodeid
            else:
                return (1, tree_[nodeid].fid, tree_[nodeid].bid, tree_[nodeid].sitename,
                        nodeid, tree_[nodeid].left_nodeid, tree_[nodeid].right_nodeid)
Example #12
    def convert_feature_to_bin(self, data_instance):
        LOGGER.info("convert feature to bins")
        param_obj = FeatureBinningParam(bin_num=self.bin_num)
        if self.use_missing:
            binning_obj = QuantileBinning(param_obj, abnormal_list=[NoneType()])
        else:
            binning_obj = QuantileBinning(param_obj)

        binning_obj.fit_split_points(data_instance)
        self.data_bin, self.bin_split_points, self.bin_sparse_points = binning_obj.convert_feature_to_bin(data_instance)
Example #13
def np_nan_to_nonetype(inst):

    arr = inst.features
    index = np.isnan(arr)
    if index.any():
        inst = copy.deepcopy(inst)
        arr = arr.astype(object)
        arr[index] = NoneType()
        inst.features = arr
    return inst
Example #14
    def _handle_zero_as_missing(inst, feat_num, missing_bin_idx):
        """
        Handles the use_missing + zero_as_missing case.
        """

        sparse_vec = inst.features.sparse_vec
        arr = np.zeros(feat_num, dtype=np.uint8) + missing_bin_idx
        for k, v in sparse_vec.items():
            if v != NoneType():
                arr[k] = v
        inst.features = arr
        return inst
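
A quick numeric check of the helper above, with None standing in for NoneType() and made-up sizes:

import numpy as np

feat_num, missing_bin_idx = 4, 9   # hypothetical feature count and missing-bin index
sparse_vec = {0: 2, 2: None}       # None stands in for NoneType()
arr = np.zeros(feat_num, dtype=np.uint8) + missing_bin_idx
for k, v in sparse_vec.items():
    if v is not None:              # mirrors the v != NoneType() test
        arr[k] = v
print(arr)  # [2 9 9 9]: absent keys (zeros) and NoneType entries share the missing bin
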
Example #15
        def _transform_nan(instance):
            feature_shape = instance.features.shape[0]
            new_features = []

            for i in range(feature_shape):
                if instance.features[i] != instance.features[i]:  # NaN != NaN
                    new_features.append(NoneType())
                else:
                    new_features.append(instance.features[i])
            new_instance = copy.deepcopy(instance)
            new_instance.features = np.array(new_features)
            return new_instance
Example #16
    def data_format_transform(row):
        """
        transform data into sparse format
        """

        if type(row.features).__name__ != consts.SPARSE_VECTOR:
            feature_shape = row.features.shape[0]
            indices = []
            data = []

            for i in range(feature_shape):
                if np.isnan(row.features[i]):
                    indices.append(i)
                    data.append(NoneType())
                elif np.abs(row.features[i]) < consts.FLOAT_ZERO:
                    continue
                else:
                    indices.append(i)
                    data.append(row.features[i])

            new_row = copy.deepcopy(row)
            new_row.features = SparseVector(indices, data, feature_shape)
            return new_row
        else:
            sparse_vec = row.features.get_sparse_vector()
            replace_key = []
            for key in sparse_vec:
                if sparse_vec.get(key) == NoneType() or np.isnan(
                        sparse_vec.get(key)):
                    replace_key.append(key)

            if len(replace_key) == 0:
                return row
            else:
                new_row = copy.deepcopy(row)
                new_sparse_vec = new_row.features.get_sparse_vector()
                for key in replace_key:
                    new_sparse_vec[key] = NoneType()
                return new_row
Example #17
    def federated_binning(self, data_instance):

        if self.use_missing:
            binning_result = self.binning_obj.average_run(
                data_instances=data_instance,
                bin_num=self.bin_num,
                abnormal_list=[NoneType()])
        else:
            binning_result = self.binning_obj.average_run(
                data_instances=data_instance, bin_num=self.bin_num)

        return self.binning_obj.convert_feature_to_bin(data_instance,
                                                       binning_result)
Example #18
    def federated_binning(self):

        binning_param = HomoFeatureBinningParam(method=consts.RECURSIVE_QUERY,
                                                bin_num=self.bin_num,
                                                error=self.binning_error)

        if self.use_missing:
            self.binning_obj = recursive_query_binning.Server(
                binning_param, abnormal_list=[NoneType()])
        else:
            self.binning_obj = recursive_query_binning.Server(binning_param,
                                                              abnormal_list=[])

        self.binning_obj.fit_split_points(None)
Example #19
    def convert_feature_to_bin(self, data_instance, handle_missing_value=False):
        """
        Convert feature values into bin indices.
        """
        LOGGER.info("convert feature to bins")
        param_obj = FeatureBinningParam(bin_num=self.bin_num, error=self.binning_error)

        if handle_missing_value:
            self.binning_obj = self.binning_class(param_obj, abnormal_list=[NoneType()],)
        else:
            self.binning_obj = self.binning_class(param_obj)

        self.binning_obj.fit_split_points(data_instance)
        LOGGER.info("convert feature to bins over")
        return self.binning_obj.convert_feature_to_bin(data_instance)
Example #20
    def federated_binning(self, data_instance):

        binning_param = FeatureBinningParam(bin_num=self.bin_num,
                                            error=self.binning_error)
        self.binning_obj.bin_param = binning_param

        if self.use_missing:
            binning_result = self.binning_obj.average_run(
                data_instances=data_instance, abnormal_list=[NoneType()])
        else:
            binning_result = self.binning_obj.average_run(
                data_instances=data_instance)

        return self.binning_obj.convert_feature_to_bin(data_instance,
                                                       binning_result)
Example #21
    def sparse_to_array(data, feature_sparse_point_array, use_missing, zero_as_missing):
        new_data = copy.deepcopy(data)
        new_feature_sparse_point_array = copy.deepcopy(feature_sparse_point_array)
        for k, v in data.features.get_all_data():
            if v == NoneType():
                value = -1
            else:
                value = v
            new_feature_sparse_point_array[k] = value

        # most sparse points fall in bin 0; when they are marked as missing (-1),
        # offset everything by 1 so the marker maps back to 0 and the matrix stays sparse
        if not use_missing or (use_missing and not zero_as_missing):
            offset = 0
        else:
            offset = 1
        new_data.features = sp.csc_matrix(np.array(new_feature_sparse_point_array) + offset)
        return new_data
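
The offset can be traced with a toy vector (made-up values; -1 is the missing marker used above):

import numpy as np
import scipy.sparse as sp

dense = [0, -1, 3]   # a bin-0 sparse point, a missing (NoneType) entry, an observed bin
use_missing, zero_as_missing = True, True
offset = 1 if (use_missing and zero_as_missing) else 0
mat = sp.csc_matrix(np.array(dense) + offset)
print(mat.toarray())  # [[1 0 4]] -- the missing marker shifts to 0 and stays sparse
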
Example #22
    def federated_binning(self, data_instance):

        binning_param = HomoFeatureBinningParam(method=consts.RECURSIVE_QUERY,
                                                bin_num=self.bin_num,
                                                error=self.binning_error)

        if self.use_missing:
            self.binning_obj = recursive_query_binning.Client(
                params=binning_param,
                abnormal_list=[NoneType()],
                role=self.role)
        else:
            self.binning_obj = recursive_query_binning.Client(
                params=binning_param, role=self.role)

        self.binning_obj.fit_split_points(data_instance)

        return self.binning_obj.convert_feature_to_bin(data_instance)
Example #23
    def __init__(self, missing_value_list=None):
        """
        Parameters
        ----------
        missing_value_list: list, the values to be replaced. Defaults to None; when None, it is
                            set to a list of blank, none, null and na values, which are treated
                            as missing. Otherwise it can be used for outlier replacement, with
                            missing_value_list holding the outlier values.
        """
        if missing_value_list is None:
            self.missing_value_list = [
                '', 'none', 'null', 'na', 'None', np.nan
            ]
        else:
            self.missing_value_list = missing_value_list

        self.abnormal_value_list = copy.deepcopy(self.missing_value_list)
        for i, v in enumerate(self.missing_value_list):
            if v != v:  # NaN != NaN, so this detects float NaN entries
                self.missing_value_list[i] = np.nan
                self.abnormal_value_list[i] = NoneType()

        self.support_replace_method = [
            'min', 'max', 'mean', 'median', 'designated'
        ]
        self.support_output_format = {
            'str': str,
            'float': float,
            'int': int,
            'origin': None
        }

        self.support_replace_area = {
            'min': 'col',
            'max': 'col',
            'mean': 'col',
            'median': 'col',
            'designated': 'col'
        }

        self.cols_fit_impute_rate = []
        self.cols_transform_impute_rate = []
        self.cols_replace_method = []
        self.skip_cols = []
Example #24
    def __init__(self):
        super(PSI, self).__init__()
        self.model_param = PSIParam()
        self.max_bin_num = 20
        self.tag_id_mapping = {}
        self.id_tag_mapping = {}
        self.count1, self.count2 = None, None
        self.actual_table, self.expect_table = None, None
        self.data_bin1, self.data_bin2 = None, None
        self.bin_split_points = None
        self.bin_sparse_points = None
        self.psi_rs = None
        self.total_scores = None
        self.all_feature_list = None
        self.dense_missing_val = NoneType()
        self.binning_error = consts.DEFAULT_RELATIVE_ERROR

        self.interval_perc1 = None
        self.interval_perc2 = None
        self.str_intervals = None

        self.binning_obj = None
Example #25
def map_partition_handle(iterable,
                         feat_num=10,
                         max_bin_num=20,
                         is_sparse=False,
                         missing_val=NoneType()):

    count_bin = np.zeros((feat_num, max_bin_num))
    row_idx = np.arange(feat_num)

    for k, v in iterable:
        # last bin is for missing value
        if is_sparse:
            feature_dict = v.features.sparse_vec
            arr = np.zeros(feat_num, dtype=np.int64) + max_bin_num - 1  # last bin (max_bin_num - 1) holds missing values
            arr[list(feature_dict.keys())] = list(feature_dict.values())
        else:
            arr = v.features
            arr[arr == missing_val] = max_bin_num - 1

        count_bin[row_idx, arr.astype(np.int64)] += 1

    return count_bin
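
A dense-input walkthrough with made-up data: two features, four bins, and -1 as the missing marker in place of NoneType():

import numpy as np

feat_num, max_bin_num = 2, 4
count_bin = np.zeros((feat_num, max_bin_num))
row_idx = np.arange(feat_num)

for arr in (np.array([0, 2]), np.array([1, 2]), np.array([-1, 0])):
    arr = arr.copy()
    arr[arr == -1] = max_bin_num - 1   # route missing values to the last bin
    count_bin[row_idx, arr.astype(np.int64)] += 1

print(count_bin)
# feature 0: bins 0 and 1 once each plus one missing; feature 1: bin 0 once, bin 2 twice
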
Example #26
    def batch_calculate_histogram(kv_iterator,
                                  bin_split_points=None,
                                  bin_sparse_points=None,
                                  valid_features=None,
                                  node_map=None,
                                  use_missing=False,
                                  zero_as_missing=False,
                                  with_uuid=False):
        data_bins = []
        node_ids = []
        grad = []
        hess = []

        data_record = 0  # total instance number of this partition

        # go through iterator to collect g/h feature instances/ node positions
        for _, value in kv_iterator:
            data_bin, nodeid_state = value[0]
            unleaf_state, nodeid = nodeid_state
            if unleaf_state == 0 or nodeid not in node_map:
                continue
            g, h = value[1]  # encrypted text in host, plaintext in guest
            data_bins.append(data_bin)  # features
            node_ids.append(nodeid)  # current node position
            grad.append(g)
            hess.append(h)

            data_record += 1

        LOGGER.info("begin batch calculate histogram, data count is {}".format(
            data_record))
        node_num = len(node_map)

        missing_bin = 1 if use_missing else 0

        # if the value of a feature is 0, the corresponding bin index will not appear in the sample sparse vector
        # need to compute the correct sparse-point g_sum and h_sum by:
        # (node total sum value) - (node feature total sum value) + (non 0 sparse point sum)
        # [0, 0, 0] -> g, h, sample count
        zero_optim = [[[0 for i in range(3)]
                       for j in range(bin_split_points.shape[0])]
                      for k in range(node_num)]
        zero_opt_node_sum = [[0 for i in range(3)] for j in range(node_num)]

        node_histograms = FeatureHistogram.generate_histogram_template(
            node_map, bin_split_points, valid_features, missing_bin)

        for rid in range(data_record):
            nid = node_map.get(node_ids[rid])
            # node total sum value
            zero_opt_node_sum[nid][0] += grad[rid]
            zero_opt_node_sum[nid][1] += hess[rid]
            zero_opt_node_sum[nid][2] += 1

            for fid, value in data_bins[rid].features.get_all_data():
                if valid_features is not None and valid_features[fid] is False:
                    continue

                if use_missing and value == NoneType():
                    # missing value is set as -1
                    value = -1

                node_histograms[nid][fid][value][0] += grad[rid]
                node_histograms[nid][fid][value][1] += hess[rid]
                node_histograms[nid][fid][value][2] += 1

                # node feature total sum value
                zero_optim[nid][fid][0] += grad[rid]
                zero_optim[nid][fid][1] += hess[rid]
                zero_optim[nid][fid][2] += 1

        for nid in range(node_num):
            for fid in range(bin_split_points.shape[0]):
                if valid_features is not None and valid_features[fid] is True:
                    if not use_missing or (use_missing
                                           and not zero_as_missing):
                        # add 0 g/h sum to sparse point
                        sparse_point = bin_sparse_points[fid]
                        node_histograms[nid][fid][sparse_point][0] += \
                            zero_opt_node_sum[nid][0] - zero_optim[nid][fid][0]
                        node_histograms[nid][fid][sparse_point][1] += \
                            zero_opt_node_sum[nid][1] - zero_optim[nid][fid][1]
                        node_histograms[nid][fid][sparse_point][2] += \
                            zero_opt_node_sum[nid][2] - zero_optim[nid][fid][2]
                    else:
                        # if 0 is regarded as missing value, add to missing bin
                        node_histograms[nid][fid][-1][0] += \
                            zero_opt_node_sum[nid][0] - zero_optim[nid][fid][0]
                        node_histograms[nid][fid][-1][1] += \
                            zero_opt_node_sum[nid][1] - zero_optim[nid][fid][1]
                        node_histograms[nid][fid][-1][2] += \
                            zero_opt_node_sum[nid][2] - zero_optim[nid][fid][2]

        ret = FeatureHistogram.generate_histogram_key_value_list(
            node_histograms, node_map, bin_split_points, with_uuid)
        return ret
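
The subtraction trick behind the sparse-point correction can be sanity-checked with toy numbers (all values hypothetical): samples sitting at the sparse point never appear in the per-bin sums, so their mass is recovered as a difference.

zero_opt_node_sum = [10.0, 4.0, 6]   # node-level g sum, h sum, sample count
zero_optim_fid = [7.0, 2.5, 4]       # sums over the bins actually seen for one fid

sparse_g = zero_opt_node_sum[0] - zero_optim_fid[0]
sparse_h = zero_opt_node_sum[1] - zero_optim_fid[1]
sparse_n = zero_opt_node_sum[2] - zero_optim_fid[2]
print(sparse_g, sparse_h, sparse_n)  # 3.0 1.5 2 -> credited to the sparse-point bin
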
Example #27
    def _batch_calculate_histogram(kv_iterator,
                                   bin_split_points=None,
                                   bin_sparse_points=None,
                                   valid_features=None,
                                   node_map=None,
                                   use_missing=False,
                                   zero_as_missing=False,
                                   parent_nid_map=None,
                                   sibling_node_id_map=None,
                                   stable_reduce=False,
                                   mo_dim=None):
        data_bins = []
        node_ids = []
        grad = []
        hess = []

        data_record = 0  # total instance number of this partition

        partition_key = None  # this var is for stable reduce

        # go through iterator to collect g/h feature instances/ node positions
        for data_id, value in kv_iterator:

            if partition_key is None and stable_reduce:  # first key of data is used as partition key
                partition_key = data_id

            data_bin, nodeid_state = value[0]
            unleaf_state, nodeid = nodeid_state
            if unleaf_state == 0 or nodeid not in node_map:
                continue
            g, h = value[1]  # encrypted text in host, plaintext in guest
            data_bins.append(data_bin)  # features
            node_ids.append(nodeid)  # current node position
            grad.append(g)
            hess.append(h)
            data_record += 1

        LOGGER.debug(
            "begin batch calculate histogram, data count is {}".format(
                data_record))
        node_num = len(node_map)

        missing_bin = 1 if use_missing else 0

        # if the value of a feature is 0, the corresponding bin index will not appear in the sample sparse vector
        # need to compute the correct sparse-point g_sum and h_sum by:
        # (node total sum value) - (node feature total sum value) + (non 0 sparse point sum)
        # [0, 0, 0] -> g, h, sample count
        zero_optim = [[[0 for i in range(3)]
                       for j in range(bin_split_points.shape[0])]
                      for k in range(node_num)]
        zero_opt_node_sum = [[0 for i in range(3)] for j in range(node_num)]

        node_histograms = FeatureHistogram._generate_histogram_template(
            node_map,
            bin_split_points,
            valid_features,
            missing_bin,
            mo_dim=mo_dim)

        for rid in range(data_record):

            # node index is the position in the histogram list of a certain node
            node_idx = node_map.get(node_ids[rid])
            # node total sum value
            zero_opt_node_sum[node_idx][0] += grad[rid]
            zero_opt_node_sum[node_idx][1] += hess[rid]
            zero_opt_node_sum[node_idx][2] += 1

            for fid, value in data_bins[rid].features.get_all_data():
                if valid_features is not None and valid_features[fid] is False:
                    continue

                if use_missing and value == NoneType():
                    # missing value is set as -1
                    value = -1

                node_histograms[node_idx][fid][value][0] += grad[rid]
                node_histograms[node_idx][fid][value][1] += hess[rid]
                node_histograms[node_idx][fid][value][2] += 1

        for nid in range(node_num):
            # cal feature level g_h incrementally
            for fid in range(bin_split_points.shape[0]):
                if valid_features is not None and valid_features[fid] is False:
                    continue
                for bin_index in range(len(node_histograms[nid][fid])):
                    zero_optim[nid][fid][0] += node_histograms[nid][fid][bin_index][0]
                    zero_optim[nid][fid][1] += node_histograms[nid][fid][bin_index][1]
                    zero_optim[nid][fid][2] += node_histograms[nid][fid][bin_index][2]

        for node_idx in range(node_num):
            for fid in range(bin_split_points.shape[0]):
                if valid_features is not None and valid_features[fid] is True:
                    if not use_missing or (use_missing
                                           and not zero_as_missing):
                        # add 0 g/h sum to sparse point
                        sparse_point = bin_sparse_points[fid]
                        node_histograms[node_idx][fid][sparse_point][0] += \
                            zero_opt_node_sum[node_idx][0] - zero_optim[node_idx][fid][0]
                        node_histograms[node_idx][fid][sparse_point][1] += \
                            zero_opt_node_sum[node_idx][1] - zero_optim[node_idx][fid][1]
                        node_histograms[node_idx][fid][sparse_point][2] += \
                            zero_opt_node_sum[node_idx][2] - zero_optim[node_idx][fid][2]
                    else:
                        # if 0 is regarded as missing value, add to missing bin
                        node_histograms[node_idx][fid][-1][0] += \
                            zero_opt_node_sum[node_idx][0] - zero_optim[node_idx][fid][0]
                        node_histograms[node_idx][fid][-1][1] += \
                            zero_opt_node_sum[node_idx][1] - zero_optim[node_idx][fid][1]
                        node_histograms[node_idx][fid][-1][2] += \
                            zero_opt_node_sum[node_idx][2] - zero_optim[node_idx][fid][2]

        ret = FeatureHistogram._generate_histogram_key_value_list(
            node_histograms,
            node_map,
            bin_split_points,
            parent_nid_map,
            sibling_node_id_map,
            partition_key=partition_key)
        return ret
Example #28
    def _init_model(self, model: PSIParam):
        self.max_bin_num = model.max_bin_num
        self.need_run = model.need_run
        self.dense_missing_val = NoneType() if model.dense_missing_val is None \
            else model.dense_missing_val
        self.binning_error = model.binning_error
Example #29
    def traverse_tree(predict_state,
                      data_inst,
                      tree_=None,
                      decoder=None,
                      sitename=consts.GUEST,
                      split_maskdict=None,
                      use_missing=None,
                      zero_as_missing=None,
                      missing_dir_maskdict=None,
                      encrypted_weight_dict=None,
                      encrypted_zero_dict=None):
        nid, tag = predict_state

        weight_dict = {}
        # initialize every leaf with Encrypt(0)
        for i in range(len(tree_)):
            if tree_[i].is_leaf is True:
                weight_dict[i] = encrypted_zero_dict[i]

        node_queue = [nid]

        while len(node_queue) != 0:
            nid = node_queue.pop(0)
            # leaf node: record its encrypted weight
            if tree_[nid].is_leaf is True:
                weight_dict[nid] = encrypted_weight_dict[nid]
            else:
                if tree_[nid].sitename == sitename:
                    fid = decoder("feature_idx",
                                  tree_[nid].fid,
                                  split_maskdict=split_maskdict)
                    bid = decoder("feature_val",
                                  tree_[nid].bid,
                                  nid,
                                  split_maskdict=split_maskdict)

                    if use_missing:
                        missing_dir = decoder(
                            "missing_dir",
                            1,
                            nid,
                            missing_dir_maskdict=missing_dir_maskdict)
                    else:
                        missing_dir = 1

                    if use_missing and zero_as_missing:
                        missing_dir = decoder(
                            "missing_dir",
                            1,
                            nid,
                            missing_dir_maskdict=missing_dir_maskdict)
                        if data_inst.features.get_data(fid) == NoneType() or \
                                data_inst.features.get_data(fid, None) is None:
                            if missing_dir == 1:
                                nid = tree_[nid].right_nodeid
                            else:
                                nid = tree_[nid].left_nodeid
                        elif data_inst.features.get_data(fid) <= bid:
                            nid = tree_[nid].left_nodeid
                        else:
                            nid = tree_[nid].right_nodeid
                    elif data_inst.features.get_data(fid) == NoneType():
                        if missing_dir == 1:
                            nid = tree_[nid].right_nodeid
                        else:
                            nid = tree_[nid].left_nodeid
                    elif data_inst.features.get_data(fid, 0) <= bid:
                        nid = tree_[nid].left_nodeid
                    else:
                        nid = tree_[nid].right_nodeid
                    node_queue.append(nid)
                else:
                    node_queue.append(tree_[nid].left_nodeid)
                    node_queue.append(tree_[nid].right_nodeid)

        return weight_dict
Example #30
    def fit(self, expect_table, actual_table):

        LOGGER.info('start psi computing')

        header1 = expect_table.schema['header']
        header2 = actual_table.schema['header']

        if not set(header1) == set(header2):
            raise ValueError(
                'table header must be the same while computing psi values')

        # baseline table should not contain empty columns
        abnormal_detection.empty_column_detection(expect_table)

        self.all_feature_list = header1

        # make sure no duplicate features
        self.all_feature_list = self.check_duplicates(self.all_feature_list)

        # kv bi-directional mapping
        self.tag_id_mapping = {
            v: k
            for k, v in enumerate(self.all_feature_list)
        }
        self.id_tag_mapping = {
            k: v
            for k, v in enumerate(self.all_feature_list)
        }

        if not self.is_sparse(expect_table):  # convert missing values: nan -> NoneType
            expect_table = self.convert_missing_val(expect_table)

        if not self.is_sparse(actual_table):  # convert missing values: nan -> NoneType
            actual_table = self.convert_missing_val(actual_table)

        if not (self.check_table_content(expect_table)
                and self.check_table_content(actual_table)):
            raise ValueError(
                'contents of input table must be instances of class "Instance"'
            )

        param = FeatureBinningParam(method=consts.QUANTILE,
                                    bin_num=self.max_bin_num,
                                    local_only=True,
                                    error=self.binning_error)
        binning_obj = QuantileBinning(params=param,
                                      abnormal_list=[NoneType()],
                                      allow_duplicate=False)
        binning_obj.fit_split_points(expect_table)

        data_bin, bin_split_points, bin_sparse_points = binning_obj.convert_feature_to_bin(
            expect_table)
        LOGGER.debug('bin split points is {}, shape is {}'.format(
            bin_split_points, bin_split_points.shape))
        self.binning_obj = binning_obj

        self.data_bin1 = data_bin
        self.bin_split_points = bin_split_points
        self.bin_sparse_points = bin_sparse_points
        LOGGER.debug('expect table binning done')

        count_func1 = functools.partial(
            map_partition_handle,
            feat_num=len(self.all_feature_list),
            max_bin_num=self.max_bin_num + 1,  # an additional bin for missing values
            missing_val=self.dense_missing_val,
            is_sparse=self.is_sparse(self.data_bin1))

        map_rs1 = self.data_bin1.applyPartitions(count_func1)
        count1 = count_rs_to_dict(map_rs1.reduce(map_partition_reduce))

        data_bin2, bin_split_points2, bin_sparse_points2 = binning_obj.convert_feature_to_bin(
            actual_table)
        self.data_bin2 = data_bin2
        LOGGER.debug('actual table binning done')

        count_func2 = functools.partial(
            map_partition_handle,
            feat_num=len(self.all_feature_list),
            max_bin_num=self.max_bin_num + 1,  # an additional bin for missing values
            missing_val=self.dense_missing_val,
            is_sparse=self.is_sparse(self.data_bin2))

        map_rs2 = self.data_bin2.applyPartitions(count_func2)
        count2 = count_rs_to_dict(map_rs2.reduce(map_partition_reduce))

        self.count1, self.count2 = count1, count2

        LOGGER.info('psi counting done')

        # compute psi from counting result
        psi_result = psi_computer(count1, count2, expect_table.count(),
                                  actual_table.count())
        self.psi_rs = psi_result

        # get total psi score of features
        total_scores = {}
        for idx, rs in enumerate(self.psi_rs):
            feat_name = self.id_tag_mapping[idx]
            total_scores[feat_name] = rs['total_psi']
        self.total_scores = total_scores

        # id-feature mapping convert, str interval computation
        self.str_intervals = self.get_string_interval(
            bin_split_points,
            self.id_tag_mapping,
            missing_bin_idx=self.max_bin_num)

        self.interval_perc1 = self.count_dict_to_percentage(
            copy.deepcopy(count1), expect_table.count())
        self.interval_perc2 = self.count_dict_to_percentage(
            copy.deepcopy(count2), actual_table.count())

        self.set_summary(self.generate_summary())
        LOGGER.info('psi computation done')
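
For context, a sketch of the standard PSI definition that the counts above feed into; the psi_computer used here is FATE-internal, so this helper is illustrative only:

import math

def psi(expect_perc, actual_perc, eps=1e-6):
    # Population Stability Index over matched bin percentages
    total = 0.0
    for e, a in zip(expect_perc, actual_perc):
        e, a = max(e, eps), max(a, eps)  # guard against empty bins
        total += (a - e) * math.log(a / e)
    return total

print(round(psi([0.5, 0.3, 0.2], [0.4, 0.4, 0.2]), 4))  # 0.0511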