Exemple #1
0
 def __init__(self, data, param):
     """Keep the parameters and raw data, and create a root node with no box yet.

     ``mapp`` and the root's ``n_box`` both start as ``None``; the root's
     split budget starts at ``Params.maxHeight``.
     """
     self.param = param
     self.differ = Differential(param.Seed)
     self.mapp = None
     self.realData = data
     # ## build the root on a local name, then attach it
     root = KNode()
     root.n_box = None
     root.n_budget = Params.maxHeight
     self.root = root
Exemple #2
0
 def __init__(self, data, param):
     """Store the parameters and create the root node holding all of ``data``.

     The root's bounding box is ``[Params.LOW, Params.HIGH]`` and its split
     budget is ``Params.maxHeight``.
     """
     self.param = param
     self.differ = Differential(param.Seed)
     # ## initialize the root
     root = KNode()
     root.n_data = data
     root.n_box = np.array([Params.LOW, Params.HIGH])
     root.n_budget = Params.maxHeight
     self.root = root
Exemple #3
0
    def buildIndex(self):
        """Grow the tree breadth-first; every node that is not a leaf is split into
        four children (NW, NE, SW, SE), each given a count via ``getCount``."""
        level_eps = self.getCountBudget()
        root = self.root
        root.n_count = self.getCount(root, level_eps[0])  # ## count for the root
        pending = deque([root])
        leaf_total = 0  # ## leaf counter
        deepest = -1
        # ## main loop
        while pending:
            node = pending.popleft()
            if node.n_depth > deepest:
                deepest = node.n_depth

            if self.testLeaf(node) is True:  # ## node is a leaf
                if node.n_depth < Params.maxHeight:
                    # ## a leaf above maxHeight is re-counted with the budget of all deeper levels
                    leftover_eps = sum(level_eps[node.n_depth + 1:])
                    node.n_count = self.getCount(node, leftover_eps)
                leaf_total += 1
                node.n_isLeaf = True
                self.cell_setLeaf(node)
                continue

            # ## node needs to split; budget is consumed whether or not the split is useful
            node.n_budget -= 1
            split = self.getCoordinates(node)
            nw, ne, sw, se = KNode(), KNode(), KNode(), KNode()
            (x_nw, y_nw), (x_se, y_se), nw.n_data, ne.n_data, sw.n_data, se.n_data = split

            # ## bounding boxes of the four children, from the parent's corners and the split points
            box = node.n_box
            x_lo, y_lo, x_hi, y_hi = box[0, 0], box[0, 1], box[1, 0], box[1, 1]
            nw.n_box = np.array([[x_lo, y_nw], [x_nw, y_hi]])
            ne.n_box = np.array([[x_nw, y_se], [x_hi, y_hi]])
            sw.n_box = np.array([[x_lo, y_lo], [x_se, y_nw]])
            se.n_box = np.array([[x_se, y_lo], [x_hi, y_se]])

            child_depth = node.n_depth + 1
            for child in (nw, ne, sw, se):
                child.n_depth = child_depth
                child.n_count = self.getCount(child, level_eps[child_depth])
                child.n_budget = node.n_budget
                pending.append(child)

            node.n_data = None  # ## the parent no longer needs the point coordinates
            node.nw, node.ne, node.sw, node.se = nw, ne, sw, se
        # end of while

        logging.debug("number of leaves: %d" % leaf_total)
        logging.debug("max depth: %d" % deepest)
Exemple #4
0
 def __init__(self, data, param):
     """Record ``param``, create the noise source from ``param.Seed`` and set up
     a root node that owns ``data`` and spans ``[Params.LOW, Params.HIGH]``."""
     self.param = param
     self.differ = Differential(param.Seed)
     # ## initialize the root
     top = KNode()
     top.n_data = data
     top.n_box = np.array([Params.LOW, Params.HIGH])
     top.n_budget = Params.maxHeight
     self.root = top
Exemple #5
0
 def __init__(self, data, param):
     """Record ``param`` and the raw ``data``; the grid map (``mapp``) and the
     root's bounding box are left as ``None`` here."""
     self.param = param
     self.differ = Differential(param.Seed)
     self.mapp = None
     self.realData = data
     top = KNode()
     top.n_box = None
     top.n_budget = Params.maxHeight
     self.root = top
Exemple #6
0
    def buildIndex(self):
        """Grow the tree breadth-first using the grid counts in ``self.mapp``.

        The root count is the sum of the whole grid; each split takes its
        four child counts directly from ``getCoordinates``.
        """
        root = self.root
        pending = deque([root])
        leaf_total = 0  # leaf counter
        deepest = -1
        root.n_count = np.sum(self.mapp)
        while pending:
            node = pending.popleft()
            if node.n_depth > deepest:
                deepest = node.n_depth
            if self.testLeaf(node) is True:  # node is a leaf
                leaf_total += 1
                node.n_isLeaf = True
                self.cell_setLeaf(node)
                continue

            # node needs to split
            node.n_budget -= 1
            split = self.getCoordinates(node)
            children = nw, ne, sw, se = KNode(), KNode(), KNode(), KNode()
            (x_nw, y_nw), (x_se, y_se), quad_counts = split

            box = node.n_box
            x_lo, y_lo, x_hi, y_hi = box[0, 0], box[0, 1], box[1, 0], box[1, 1]
            nw.n_box = np.array([[x_lo, y_nw], [x_nw, y_hi]])
            ne.n_box = np.array([[x_nw, y_se], [x_hi, y_hi]])
            sw.n_box = np.array([[x_lo, y_lo], [x_se, y_nw]])
            se.n_box = np.array([[x_se, y_lo], [x_hi, y_se]])

            # quad_counts is ordered like the children: NW, NE, SW, SE
            for idx, child in enumerate(children):
                child.n_depth = node.n_depth + 1
                child.n_count = quad_counts[idx]
                child.n_budget = node.n_budget
                pending.append(child)
            node.nw, node.ne, node.sw, node.se = children

        # end of while
        logging.debug("number of leaves: %d" % leaf_total)
        logging.debug("max depth: %d" % deepest)
    def buildIndex(self):
        """Build the tree level by level (fanout 4), attaching a count from
        ``getCount`` to the root and to every child created by a split."""
        eps_per_level = self.getCountBudget()
        # ## add noisy count to root
        self.root.n_count = self.getCount(self.root, eps_per_level[0])
        frontier = deque([self.root])
        n_leaves = 0  # ## leaf counter
        depth_seen = -1
        # ## main loop
        while frontier:
            parent = frontier.popleft()
            if parent.n_depth > depth_seen:
                depth_seen = parent.n_depth

            if self.testLeaf(parent) is True:  # ## parent is a leaf node
                # ## a leaf that stops above maxHeight may spend the budget of the deeper levels
                if parent.n_depth < Params.maxHeight:
                    spare_eps = sum(eps_per_level[parent.n_depth + 1:])
                    parent.n_count = self.getCount(parent, spare_eps)
                n_leaves += 1
                parent.n_isLeaf = True
                self.cell_setLeaf(parent)
                continue

            # ## parent needs to split; budget is charged regardless of how the split turns out
            parent.n_budget -= 1
            pieces = self.getCoordinates(parent)
            kid_nw, kid_ne, kid_sw, kid_se = KNode(), KNode(), KNode(), KNode()
            corner_nw, corner_se, kid_nw.n_data, kid_ne.n_data, kid_sw.n_data, kid_se.n_data = pieces
            x_nw, y_nw = corner_nw
            x_se, y_se = corner_se

            # ## bounding boxes first, then depth / count / budget for each child
            pbox = parent.n_box
            corners = (
                (kid_nw, [[pbox[0, 0], y_nw], [x_nw, pbox[1, 1]]]),
                (kid_ne, [[x_nw, y_se], [pbox[1, 0], pbox[1, 1]]]),
                (kid_sw, [[pbox[0, 0], pbox[0, 1]], [x_se, y_nw]]),
                (kid_se, [[x_se, pbox[0, 1]], [pbox[1, 0], y_se]]),
            )
            for kid, box in corners:
                kid.n_box = np.array(box)

            for kid, _ in corners:
                kid.n_depth = parent.n_depth + 1
                kid.n_count = self.getCount(kid, eps_per_level[kid.n_depth])
                kid.n_budget = parent.n_budget
                frontier.append(kid)

            parent.n_data = None  # ## the point coordinates are not needed on the parent any more
            parent.nw, parent.ne, parent.sw, parent.se = kid_nw, kid_ne, kid_sw, kid_se
        # end of while

        logging.debug("number of leaves: %d" % n_leaves)
        logging.debug("max depth: %d" % depth_seen)
Exemple #8
0
    def buildIndex(self):
        """Build the tree level by level from the grid in ``self.mapp``; child
        counts come straight from ``getCoordinates`` (order NW, NE, SW, SE)."""
        frontier = deque([self.root])
        n_leaves = 0  # leaf counter
        depth_seen = -1
        self.root.n_count = np.sum(self.mapp)
        while frontier:
            parent = frontier.popleft()
            if parent.n_depth > depth_seen:
                depth_seen = parent.n_depth
            if self.testLeaf(parent) is True:  # parent is a leaf node
                n_leaves += 1
                parent.n_isLeaf = True
                self.cell_setLeaf(parent)
                continue

            # parent needs to split
            parent.n_budget -= 1
            pieces = self.getCoordinates(parent)
            kids = (KNode(), KNode(), KNode(), KNode())  # NW, NE, SW, SE
            corner_nw, corner_se, counts = pieces
            x_nw, y_nw = corner_nw
            x_se, y_se = corner_se

            pbox = parent.n_box
            boxes = (
                [[pbox[0, 0], y_nw], [x_nw, pbox[1, 1]]],
                [[x_nw, y_se], [pbox[1, 0], pbox[1, 1]]],
                [[pbox[0, 0], pbox[0, 1]], [x_se, y_nw]],
                [[x_se, pbox[0, 1]], [pbox[1, 0], y_se]],
            )
            for kid, box in zip(kids, boxes):
                kid.n_box = np.array(box)

            for pos, kid in enumerate(kids):
                kid.n_depth = parent.n_depth + 1
                kid.n_count = counts[pos]
                kid.n_budget = parent.n_budget
                frontier.append(kid)
            parent.nw, parent.ne, parent.sw, parent.se = kids

        # end of while
        logging.debug("number of leaves: %d" % n_leaves)
        logging.debug("max depth: %d" % depth_seen)
Exemple #9
0
class Kd_cell(Kd_pure):
    """ Kd tree based on synthetic data generation and a grid structure. See
    Y. Xiao, L. Xiong, and C. Yuan, Differentially private data release
    through multidimensional partitioning, in SDM Workshop, VLDB, 2010

    ``buildIndex`` reads the noisy grid (``self.mapp``) and the root box that
    ``synthetic_gen`` creates, so ``synthetic_gen`` has to run first.
    """
    def __init__(self, data, param):
        """Keep the raw data and parameters; the grid and root box start as None."""
        self.param = param
        self.differ = Differential(self.param.Seed)
        self.mapp = None  # noisy per-cell counts, filled by synthetic_gen
        self.root = KNode()
        self.realData = data
        self.root.n_box = None
        self.root.n_budget = Params.maxHeight

    def getCountBudget(self):
        """Return ``Params.maxHeight + 1`` per-level count budgets.

        Half of ``param.Eps`` is distributed over the levels, either evenly
        ('none') or geometrically with ratio 2, 2**(1/2), 2**(1/3) or 2**(1/4)
        ('aggressive', 'quadratic', 'optimal', 'quartic'). An unknown scheme
        is logged and terminates the process with exit status 1.
        """
        count_eps = self.param.Eps * 0.5
        H = Params.maxHeight
        if self.param.geoBudget == 'none':
            return [count_eps / (H + 1) for _ in range(H + 1)]
        elif self.param.geoBudget == 'aggressive':
            unit = count_eps / (2**(H + 1) - 1)
            return [unit * 2**i for i in range(H + 1)]
        elif self.param.geoBudget == 'quadratic':
            unit = count_eps * (np.sqrt(2) - 1) / (2**(0.5 * (H + 1)) - 1)
            return [unit * 2**(0.5 * i) for i in range(H + 1)]
        elif self.param.geoBudget == 'optimal':
            unit = count_eps * ((2**(1.0 / 3)) - 1) / (2**((1.0 / 3) *
                                                           (H + 1)) - 1)
            return [unit * 2**((1.0 / 3) * i) for i in range(H + 1)]
        elif self.param.geoBudget == 'quartic':
            unit = count_eps * ((2**(1.0 / 4)) - 1) / (2**((1.0 / 4) *
                                                           (H + 1)) - 1)
            return [unit * 2**((1.0 / 4) * i) for i in range(H + 1)]
        else:
            logging.error('No such geoBudget scheme')
            sys.exit(1)

    def synthetic_gen(self):
        """Apply a grid structure on the domain and perturb the count using half
        of the available privacy budget.

        Sets ``self.root.n_box`` to the domain rounded outwards to whole grid
        units and ``self.mapp`` to the per-cell noisy counts (negative noisy
        counts are replaced by 0).
        """
        logging.debug('generating synthetic map...')
        data = self.realData
        unit = Params.unitGrid
        x_min = np.floor(Params.LOW[0] / unit) * unit
        x_max = np.ceil(Params.HIGH[0] / unit) * unit
        y_min = np.floor(Params.LOW[1] / unit) * unit
        y_max = np.ceil(Params.HIGH[1] / unit) * unit

        x_CELL = int(np.rint((x_max - x_min) / unit))
        y_CELL = int(np.rint((y_max - y_min) / unit))

        self.root.n_box = np.array([[x_min, y_min], [x_max, y_max]])

        # ## populate the map from every column of the data (the same set of
        # ## points that populate_synthetic_tree later walks over)
        counts = np.zeros((x_CELL, y_CELL))
        cell_x = np.floor((data[0] - x_min) / unit).astype(int)
        cell_y = np.floor((data[1] - y_min) / unit).astype(int)
        # a point lying exactly on the upper domain edge would index one past
        # the last cell, and a point below the lower edge would wrap around
        # through negative indexing; both are assigned to the nearest edge cell
        cell_x = np.clip(cell_x, 0, x_CELL - 1)
        cell_y = np.clip(cell_y, 0, y_CELL - 1)
        np.add.at(counts, (cell_x, cell_y), 1)  # unbuffered: repeated cells accumulate
        self.mapp = counts

        # ## perturb the counts; one noise draw per cell, in row-major order
        half_eps = 0.5 * self.param.Eps
        for i in range(x_CELL):
            for j in range(y_CELL):
                counts[i, j] += np.rint(self.differ.getNoise(1, half_eps))
                # if noisy count is negative, ignore the noise and generate no points
                if counts[i, j] < 0:
                    counts[i, j] = 0

    def cell_setLeaf(self, curr):
        """ Throw away the counts based on the synthetic data """
        curr.n_count = 0
        return

    def testLeaf(self, curr):
        """Return True when ``curr`` should not be split: its count is at most
        ``param.minPartSize``, it sits at ``Params.maxHeight``, or its grid
        region passes ``uniform_test``."""
        if (curr.n_count <= self.param.minPartSize) or (
                curr.n_depth == Params.maxHeight) or (self.uniform_test(
                    curr, self.param.cellDistance)):
            return True
        return False

    def _grid_bounds(self, curr):
        """Return (x_min, x_max, y_min, y_max): ``curr``'s box as grid-cell indices
        measured from the lower corner of the root box."""
        unit = Params.unitGrid
        x_min = int(np.rint((curr.n_box[0, 0] - self.root.n_box[0, 0]) / unit))
        x_max = int(np.rint((curr.n_box[1, 0] - self.root.n_box[0, 0]) / unit))
        y_min = int(np.rint((curr.n_box[0, 1] - self.root.n_box[0, 1]) / unit))
        y_max = int(np.rint((curr.n_box[1, 1] - self.root.n_box[0, 1]) / unit))
        return x_min, x_max, y_min, y_max

    def uniform_test(self, curr, distance):
        """ One of the stopping conditions: cell is uniform according to some threshold 'distance'.

        Returns True when the summed absolute deviation of the grid counts
        from their mean does not exceed ``distance``.
        """
        x_min, x_max, y_min, y_max = self._grid_bounds(curr)
        data = self.mapp[x_min:x_max, y_min:y_max]
        total = np.sum(data)
        area = (x_max - x_min) * (y_max - y_min)
        # a degenerate box has no cells; skip the 0/0 division (the deviation
        # over an empty region is 0 either way)
        avg = total / area if area != 0 else 0.0
        dist = np.sum(np.abs(data - avg))
        return not dist > distance

    def buildIndex(self):
        """Grow the tree breadth-first from the noisy grid.

        The root count is the sum of ``self.mapp``; every split takes the
        four child counts (NW, NE, SW, SE) from ``getCoordinates``.
        """
        stack = deque()
        stack.append(self.root)
        nleaf = 0  # leaf counter
        max_depth = -1
        self.root.n_count = np.sum(self.mapp)
        while len(stack) > 0:
            curr = stack.popleft()
            if curr.n_depth > max_depth:
                max_depth = curr.n_depth
            if self.testLeaf(curr) is True:  # curr is a leaf node
                nleaf += 1
                curr.n_isLeaf = True
                self.cell_setLeaf(curr)
            else:  # curr needs to split
                curr.n_budget -= 1
                tmp = self.getCoordinates(curr)
                nw_node, ne_node, sw_node, se_node = KNode(), KNode(), KNode(
                ), KNode()  # create sub-nodes
                nw_coord, ne_coord, count_tmp = tmp
                x_nw, y_nw = nw_coord
                x_se, y_se = ne_coord

                nw_node.n_box = np.array([[curr.n_box[0, 0], y_nw],
                                          [x_nw, curr.n_box[1, 1]]])
                ne_node.n_box = np.array([[x_nw, y_se],
                                          [curr.n_box[1, 0], curr.n_box[1,
                                                                        1]]])
                sw_node.n_box = np.array([[curr.n_box[0, 0], curr.n_box[0, 1]],
                                          [x_se, y_nw]])
                se_node.n_box = np.array([[x_se, curr.n_box[0, 1]],
                                          [curr.n_box[1, 0], y_se]])

                c_t = 0
                for sub_node in [nw_node, ne_node, sw_node, se_node]:
                    sub_node.n_depth = curr.n_depth + 1
                    sub_node.n_count = count_tmp[c_t]
                    sub_node.n_budget = curr.n_budget
                    stack.append(sub_node)
                    c_t += 1
                curr.nw, curr.ne, curr.sw, curr.se = nw_node, ne_node, sw_node, se_node

        # end of while
        logging.debug("number of leaves: %d" % nleaf)
        logging.debug("max depth: %d" % max_depth)

    @staticmethod
    def _half_mass_index(slab_sums, target):
        """Return the first index whose running total of ``slab_sums`` reaches ``target``.

        Falls back to the last index when the target is never reached, and to
        -1 for an empty sequence (so that ``index + 1`` is an empty prefix).
        """
        reached = np.flatnonzero(np.cumsum(slab_sums) >= target)
        if reached.size:
            return int(reached[0])
        return len(slab_sums) - 1

    def getCoordinates(self, curr):
        """Split ``curr``'s grid region at (approximate) medians of the noisy counts.

        The region is first cut along the primary dimension
        (``curr.n_depth % Params.NDIM``) at the first grid line where the
        accumulated count reaches half of the total; each half is then cut the
        same way along the other dimension.

        Returns ``((x_nw, y_nw), (x_se, y_se), (n_nw, n_ne, n_sw, n_se))``:
        the two split points consumed by ``buildIndex`` and the grid counts of
        the four resulting quadrants.
        """
        dim_1 = curr.n_depth % Params.NDIM  # primary split dimension
        UNIT = Params.unitGrid
        root_x = self.root.n_box[0, 0]
        root_y = self.root.n_box[0, 1]
        x_min, x_max, y_min, y_max = self._grid_bounds(curr)

        # Work on the node's sub-grid once and use cumulative sums, instead of
        # re-summing a growing block for every candidate split line.
        region = self.mapp[x_min:x_max, y_min:y_max]
        total = np.sum(region)
        if dim_1 == 0:
            # primary cut along x, secondary cuts along y
            i = self._half_mass_index(region.sum(axis=1), total / 2)
            west, east = region[:i + 1, :], region[i + 1:, :]
            split_prm = (x_min + i + 1) * UNIT + root_x

            j = self._half_mass_index(west.sum(axis=0), np.sum(west) / 2)
            split_sec1 = root_y + (y_min + j + 1) * UNIT
            n_sw = np.sum(west[:, :j + 1])
            n_nw = np.sum(west[:, j + 1:])

            k = self._half_mass_index(east.sum(axis=0), np.sum(east) / 2)
            split_sec2 = root_y + (y_min + k + 1) * UNIT
            n_se = np.sum(east[:, :k + 1])
            n_ne = np.sum(east[:, k + 1:])
            return (split_prm, split_sec1), (split_prm,
                                             split_sec2), (n_nw, n_ne, n_sw,
                                                           n_se)

        else:
            # primary cut along y, secondary cuts along x
            i = self._half_mass_index(region.sum(axis=0), total / 2)
            south, north = region[:, :i + 1], region[:, i + 1:]
            split_prm = root_y + (y_min + i + 1) * UNIT

            j = self._half_mass_index(south.sum(axis=1), np.sum(south) / 2)
            split_sec1 = (x_min + j + 1) * UNIT + root_x
            n_sw = np.sum(south[:j + 1, :])
            n_se = np.sum(south[j + 1:, :])

            k = self._half_mass_index(north.sum(axis=1), np.sum(north) / 2)
            split_sec2 = (x_min + k + 1) * UNIT + root_x
            n_nw = np.sum(north[:k + 1, :])
            n_ne = np.sum(north[k + 1:, :])
            return (split_sec2, split_prm), (split_sec1,
                                             split_prm), (n_nw, n_ne, n_sw,
                                                          n_se)

    def populate_synthetic_tree(self):
        """ Populate real data to the synthetic tree.

        Each real point increments the count of the node returned by
        ``self.root.find_subnode`` (presumably the leaf containing the point;
        that helper is defined elsewhere). Every leaf count then receives
        noise drawn with half of ``param.Eps``.
        """
        logging.debug('populating synthetic tree...')
        a_data = self.realData
        ndata = a_data.shape[1]
        for i in range(ndata):
            ptx = a_data[0, i]
            pty = a_data[1, i]
            leaf = self.root.find_subnode(ptx, pty)
            leaf.n_count += 1

        # traverse the tree and update leaf counts
        stack = deque()
        stack.append(self.root)
        while len(stack) > 0:
            cur_node = stack.popleft()
            if cur_node.n_isLeaf is True:  # leaf
                cur_node.n_count += self.differ.getNoise(
                    1, 0.5 * self.param.Eps)
            else:
                stack.append(cur_node.nw)
                stack.append(cur_node.ne)
                stack.append(cur_node.sw)
                stack.append(cur_node.se)
Exemple #10
0
class KTree(object):
    """Generic tree template"""

    def __init__(self, data, param):
        """Store the parameters and create a root node that owns all of ``data``."""
        self.param = param
        self.differ = Differential(param.Seed)
        # ## initialize the root
        root = KNode()
        root.n_data = data
        root.n_box = np.array([Params.LOW, Params.HIGH])
        root.n_budget = Params.maxHeight
        self.root = root

    def getSplitBudget(self):
        """return a list of h budget values for split"""
        raise NotImplementedError

    def getCountBudget(self):
        """return a list of (h+1) budget values for noisy count"""
        raise NotImplementedError

    def getNoisyMedian(self, array, left, right, epsilon):
        """return the split value of an array"""
        raise NotImplementedError

    def getCoordinates(self, curr):
        """
        return the two split points and the data points of the four
        sub-nodes, in the order buildIndex unpacks them, i.e.
        return (x_nw,y_nw),(x_se,y_se), nw_data, ne_data, sw_data, se_data
        """
        raise NotImplementedError

    def getSplit(self, array, left, right, epsilon):
        """
        return the split point given an array, may be data-independent or
        true median or noisy median, depending on the type of the tree
        """
        raise NotImplementedError

    def getCount(self, curr, epsilon):
        """ return the true count of a node when epsilon is below 1e-6, otherwise the count plus noise"""
        true_count = 0 if curr.n_data is None else curr.n_data.shape[1]
        if epsilon < 10 ** (-6):
            return true_count
        return true_count + self.differ.getNoise(1, epsilon)

    def testLeaf(self, curr):
        """ test whether a node should be a leaf node """
        if curr.n_depth == Params.maxHeight:
            return True
        if curr.n_budget <= 0:
            return True
        if curr.n_data is None or curr.n_data.shape[1] == 0:
            return True
        if curr.n_count <= self.param.minPartSize:
            return True
        return False

    def cell_setLeaf(self, curr):
        """ hook called on every new leaf; does nothing here (overridden in kd_cell) """
        return

    def buildIndex(self):
        """ Function to build the tree structure, fanout = 4 by default for spatial (2D) data """
        level_eps = self.getCountBudget()
        root = self.root
        root.n_count = self.getCount(root, level_eps[0])  # ## add noisy count to root
        pending = deque([root])
        leaf_total = 0  # ## leaf counter
        deepest = -1
        # ## main loop
        while pending:
            node = pending.popleft()
            if node.n_depth > deepest:
                deepest = node.n_depth

            if self.testLeaf(node) is True:  # ## node is a leaf
                if node.n_depth < Params.maxHeight:
                    # ## a leaf above maxHeight is re-counted with the budget of all deeper levels
                    leftover_eps = sum(level_eps[node.n_depth + 1:])
                    node.n_count = self.getCount(node, leftover_eps)
                leaf_total += 1
                node.n_isLeaf = True
                self.cell_setLeaf(node)
                continue

            # ## node needs to split; budget is consumed whether or not the split is useful
            node.n_budget -= 1
            split = self.getCoordinates(node)
            nw, ne, sw, se = KNode(), KNode(), KNode(), KNode()  # create sub-nodes
            (x_nw, y_nw), (x_se, y_se), nw.n_data, ne.n_data, sw.n_data, se.n_data = split

            # ## update bounding box, depth, count, budget for the four subnodes
            box = node.n_box
            x_lo, y_lo, x_hi, y_hi = box[0, 0], box[0, 1], box[1, 0], box[1, 1]
            nw.n_box = np.array([[x_lo, y_nw], [x_nw, y_hi]])
            ne.n_box = np.array([[x_nw, y_se], [x_hi, y_hi]])
            sw.n_box = np.array([[x_lo, y_lo], [x_se, y_nw]])
            se.n_box = np.array([[x_se, y_lo], [x_hi, y_se]])

            child_depth = node.n_depth + 1
            for child in (nw, ne, sw, se):
                child.n_depth = child_depth
                child.n_count = self.getCount(child, level_eps[child_depth])
                child.n_budget = node.n_budget
                pending.append(child)

            node.n_data = None  # ## do not need the data points coordinates now
            node.nw, node.ne, node.sw, node.se = nw, ne, sw, se
        # end of while

        logging.debug("number of leaves: %d" % leaf_total)
        logging.debug("max depth: %d" % deepest)

    def rect_intersect(self, hrect, query):
        """
        checks if the hyper-rectangle intersects with the
        hyper-rectangle defined by the query in every dimension
        (rectangles that merely touch along an edge do not count)
        """
        query_past_upper = query[0, :] >= hrect[1, :]
        query_before_lower = query[1, :] <= hrect[0, :]
        separated = np.logical_or(query_past_upper, query_before_lower)
        return not np.any(separated)

    def rangeCount(self, query):
        """
        Query answering function. Estimate the number of data points within a query rectangle.
        A partially covered leaf contributes its count scaled by the covered fraction of its box.
        """
        pending = deque([self.root])
        total = 0.0
        # ## Below are three variables recording the number of 1) whole leaf 2) partial leaf 3) whole internal node,
        # ## respectively, which contribute to the query answer. For debug purpose only.
        l_whole, l_part, i_whole = 0, 0, 0

        while pending:
            node = pending.popleft()
            box = node.n_box
            if node.n_isLeaf is True:
                if not self.rect_intersect(box, query):
                    continue
                frac = 1
                for d in range(box.shape[1]):
                    lo, hi = box[0, d], box[1, d]
                    # ## zero-width dimension or worst-case mode: take the whole leaf along d
                    if hi == lo or Params.WorstCase == True:
                        continue
                    frac *= (min(query[1, d], hi) - max(query[0, d], lo)) / (hi - lo)
                total += node.n_count * frac
                if 1.0 - frac < 10 ** (-6):
                    l_whole += 1
                else:
                    l_part += 1
                continue

            # ## internal node: use its own count when the query covers it entirely
            covers = np.array([query[0, :] <= box[0, :], query[1, :] >= box[1, :]])
            if np.all(covers) and self.param.useLeafOnly is False:
                total += node.n_count
                i_whole += 1
                continue

            for child in (node.nw, node.ne, node.sw, node.se):
                if self.rect_intersect(child.n_box, query):
                    pending.append(child)

        return float(total)  # , i_whole, l_whole, l_part

    def adjustConsistency(self):
        """
        Post processing for uniform noise across levels. Due to
        Michael Hay, Vibhor Rastogi, Gerome Miklau, Dan Suciu,
        Boosting the Accuracy of Differentially-Private Histograms Through Consistency,
        VLDB 2010
        """
        logging.debug('adjusting consistency...')
        root = self.root
        # ## upward pass
        root.get_z()
        # ## downward pass: spread each parent/children mismatch evenly over the four children
        pending = deque([root])
        while pending:
            node = pending.popleft()
            if node.n_isLeaf is not False:
                continue
            nw, ne, sw, se = node.nw, node.ne, node.sw, node.se
            adjust = (node.n_count - nw.n_count - ne.n_count - sw.n_count - se.n_count) / 4.0
            for child in (nw, ne, sw, se):
                child.n_count += adjust
                pending.append(child)

    def postProcessing(self):
        """
        Post processing for general noise distribution across levels. Due to
        G. Cormode, M. Procopiuc, E. Shen, D. Srivastava and T. Yu,
        Differentially Private Spatial Decompositions, ICDE 2012.
        """
        logging.debug("post-processing...")
        budget = self.getCountBudget()  # ## count budget for h+1 levels
        H = Params.maxHeight
        root = self.root

        def weight(top):
            """Accumulate 4**i * budget[H - i]**2 for i = 0..top."""
            acc = 0
            for i in range(top + 1):
                acc += 4 ** i * budget[H - i] * budget[H - i]
            return acc

        # ## Phase 1 (top-down)
        root.n_count *= budget[root.n_depth] ** 2
        pending = deque([root])
        while pending:
            node = pending.popleft()
            if node.n_isLeaf is not False:
                continue
            for child in (node.nw, node.ne, node.sw, node.se):
                child.n_count = node.n_count + child.n_count * (budget[child.n_depth] ** 2)
                pending.append(child)

        # ## Phase 2 (bottom-up)
        root.update_count()

        # ## Phase 3 (top-down)
        root.n_count /= weight(H)
        root.n_F = 0
        pending = deque([root])
        while pending:
            node = pending.popleft()
            if node.n_isLeaf is not False:
                continue
            h = H - node.n_depth - 1  # ## height of node's children
            E_h = weight(h)
            for child in (node.nw, node.ne, node.sw, node.se):
                child.n_F = node.n_F + node.n_count * (budget[node.n_depth] ** 2)
                child.n_count = (child.n_count - 4 ** h * child.n_F) / E_h
                pending.append(child)

    def pruning(self):
        """
        If the tree is grown without the minimum-partition-size stopping condition,
        prune it here after post processing: an internal node whose count is at most
        param.minPartSize is marked as a leaf and its subtree is not visited
        """
        logging.debug("pruning...")
        pending = deque([self.root])
        while pending:
            node = pending.popleft()
            if node.n_isLeaf is not False:
                continue
            if node.n_count <= self.param.minPartSize:
                node.n_isLeaf = True
                continue
            pending.extend((node.nw, node.ne, node.sw, node.se))
    def buildIndex(self):
        """
        Build the tree over Hilbert-curve indices, then derive 2-D bounding boxes.

        Steps:
          1. Every point of ``self.realData`` (row 0 = x, row 1 = y) is mapped
             to a Hilbert index of order ``self.param.Res`` and the indices
             are sorted.
          2. The tree is grown breadth-first.  While growing, ``n_box`` of a
             node is a ``(low, high)`` pair of Hilbert indices.
          3. Each leaf's ``n_box`` is replaced by the 2x2 bounding box of its
             decoded points.
          4. Each internal node's ``n_box`` becomes the union of its four
             children's boxes, visiting nodes in the order given by
             ``self.cmp_node``.
        """
        # Local import: sorted(..., cmp=...) only exists on Python 2, while
        # cmp_to_key works on Python 2.7 and 3.x with the same ordering.
        import functools

        budget_c = self.getCountBudget()
        logging.debug('encoding coordinates...')
        RES = self.param.Res  # order of Hilbert curve
        ndata = self.realData.shape[1]
        # NOTE(review): hidx is float64, so indices above 2**53 would lose
        # precision -- confirm Res is small enough for 2**(2*Res) to fit.
        hidx = np.zeros(ndata)
        for i in range(ndata):
            hx, hy = self.get_Hcoord(self.realData[0, i], self.realData[1, i],
                                     RES)
            hidx[i] = self.h_encode(hx, hy, RES)
        hidx = np.sort(hidx)

        logging.debug('building index...')
        self.root.n_data = hidx
        self.root.n_box = (0, 2**(2 * RES) - 1)
        self.root.n_count = self.getCount(self.root, budget_c[0])

        stack = deque()
        stack.append(self.root)
        tree = [self.root]  # every node created, needed for the box pass
        leaf_li = []  # storage of all leaves
        nleaf = 0  # leaf counter
        max_depth = -1

        while stack:
            curr = stack.popleft()
            if curr.n_depth > max_depth:
                max_depth = curr.n_depth
            if self.testLeaf(curr) is True:  # curr is a leaf node
                if curr.n_depth < Params.maxHeight:
                    # a leaf above the maximum height is re-counted with the
                    # sum of the budgets of the levels it did not reach
                    remainingEps = sum(budget_c[curr.n_depth + 1:])
                    curr.n_count = self.getCount(curr, remainingEps)
                nleaf += 1
                curr.n_isLeaf = True
                leaf_li.append(curr)

            else:  # curr needs to split
                curr.n_budget -= 1  # spent whether or not the split succeeds
                tmp = self.getCoordinates(curr)
                if tmp is False:  # split failed: re-queue and test again
                    stack.append(curr)
                    continue
                nw_node, ne_node, sw_node, se_node = KNode(), KNode(), KNode(
                ), KNode()  # create sub-nodes
                split_prm, split_sec1, split_sec2, nw_node.n_data, ne_node.n_data, sw_node.n_data, se_node.n_data = tmp

                # consecutive Hilbert-index ranges cut at the three split values
                nw_node.n_box = (curr.n_box[0], split_sec1)
                ne_node.n_box = (split_sec1, split_prm)
                sw_node.n_box = (split_prm, split_sec2)
                se_node.n_box = (split_sec2, curr.n_box[1])

                for sub_node in [nw_node, ne_node, sw_node, se_node]:
                    sub_node.n_depth = curr.n_depth + 1
                    sub_node.n_count = self.getCount(
                        sub_node, budget_c[sub_node.n_depth])
                    sub_node.n_budget = curr.n_budget
                    stack.append(sub_node)
                    tree.append(sub_node)
                curr.n_data = None  # marks curr as internal for the box pass
                curr.nw, curr.ne, curr.sw, curr.se = nw_node, ne_node, sw_node, se_node

        # end of while
        logging.debug("number of leaves: %d" % nleaf)
        logging.debug("max depth: %d" % max_depth)

        # # convert hilbert values in leaf nodes to real coordinates and update bounding box
        logging.debug('decoding and updating bounding box...')
        for leaf in leaf_li:
            # Sentinel start values: assumes decoded coordinates lie strictly
            # inside (-1000, 1000); an empty leaf keeps this inverted box.
            bbox = np.array([[1000.0, 1000.0], [-1000.0, -1000.0]],
                            dtype='float64')
            for hvalue in leaf.n_data:
                hx, hy = self.h_decode(int(hvalue), RES)
                x, y = self.get_Rcoord(hx, hy, RES)
                bbox[0, 0] = x if x < bbox[0, 0] else bbox[0, 0]
                bbox[1, 0] = x if x > bbox[1, 0] else bbox[1, 0]
                bbox[0, 1] = y if y < bbox[0, 1] else bbox[0, 1]
                bbox[1, 1] = y if y > bbox[1, 1] else bbox[1, 1]
            leaf.n_box = bbox

        # # update bounding box bottom-up
        # presumably cmp_node orders deeper nodes first so that children are
        # finished before their parent -- verify against cmp_node
        tree = sorted(tree, key=functools.cmp_to_key(self.cmp_node))
        logging.debug('updating box for each node in the tree...')
        for node in tree:
            if node.n_data is None:  # internal node: union of the children
                kids = (node.ne, node.nw, node.se, node.sw)
                node.n_box = np.zeros((2, 2))
                node.n_box[0, 0] = min(k.n_box[0, 0] for k in kids)
                node.n_box[0, 1] = min(k.n_box[0, 1] for k in kids)
                node.n_box[1, 0] = max(k.n_box[1, 0] for k in kids)
                node.n_box[1, 1] = max(k.n_box[1, 1] for k in kids)
Exemple #12
0
class KTree(object):
    """Generic tree template"""

    def __init__(self, data, param):
        """
        Store the parameters, create the noise source and set up the root node.

        The root receives ``data`` as its points, the box spanned by
        ``Params.LOW`` and ``Params.HIGH``, and a split budget of
        ``Params.maxHeight``.
        """
        self.param = param
        self.differ = Differential(param.Seed)
        # ## prepare the root before publishing it on the tree
        root = KNode()
        root.n_data = data
        root.n_box = np.array([Params.LOW, Params.HIGH])
        root.n_budget = Params.maxHeight
        self.root = root

    def getSplitBudget(self):
        """Abstract hook: subclasses return the list of h per-level split budgets."""
        raise NotImplementedError()

    def getCountBudget(self):
        """Abstract hook: subclasses return the list of (h+1) per-level noisy-count budgets."""
        raise NotImplementedError()

    def getNoisyMedian(self, array, left, right, epsilon):
        """Abstract hook: subclasses return the split value chosen for ``array``."""
        raise NotImplementedError()

    def getCoordinates(self, curr):
        """
        Abstract hook: subclasses compute how ``curr`` is divided in four.

        ``buildIndex`` unpacks the result as six items: two (x, y) split
        points, which it names (x_nw, y_nw) and (x_se, y_se), followed by
        the data of the NW, NE, SW and SE sub-nodes, i.e.
        (x_nw, y_nw), (x_se, y_se), nw_data, ne_data, sw_data, se_data
        """
        raise NotImplementedError()

    def getSplit(self, array, left, right, epsilon):
        """
        Abstract hook: subclasses return the split point for ``array``.

        Depending on the type of tree the point may be data-independent,
        the true median or a noisy median.
        """
        raise NotImplementedError()

    def getCount(self, curr, epsilon):
        """ return true count or noisy count of a node, depending on epsilon"""
        if curr.n_data is None:
            count = 0
        else:
            count = curr.n_data.shape[1]
        if epsilon < 10 ** (-6):
            return count
        else:
            return count + self.differ.getNoise(1, epsilon)

    def testLeaf(self, curr):
        """
        Decide whether ``curr`` must stop splitting.

        A node is a leaf when it sits at ``Params.maxHeight``, has no split
        budget left, holds no data points, or has a count not exceeding
        ``self.param.minPartSize``.  The checks run in that order.
        """
        if curr.n_depth == Params.maxHeight:
            return True
        if curr.n_budget <= 0:
            return True
        if curr.n_data is None or curr.n_data.shape[1] == 0:
            return True
        if curr.n_count <= self.param.minPartSize:
            return True
        return False

    def cell_setLeaf(self, curr):
        """Hook called by ``buildIndex`` for every new leaf; a no-op here (Kd_cell overrides it)."""
        return None

    def buildIndex(self):
        """
        Grow the tree breadth-first from the root; every split yields four children.

        Each node gets a count from ``getCount`` using its level's entry of the
        count budget.  A node accepted by ``testLeaf`` is flagged as a leaf and
        passed to ``cell_setLeaf``; if it stopped above ``Params.maxHeight`` it
        is first re-counted with the sum of the budgets of the deeper levels.
        Any other node is divided via ``getCoordinates`` and then drops its
        own data points.
        """
        level_eps = self.getCountBudget()
        self.root.n_count = self.getCount(self.root, level_eps[0])  # ## root uses the level-0 budget
        pending = deque([self.root])
        leaf_total = 0
        deepest = -1
        while pending:
            node = pending.popleft()
            deepest = max(deepest, node.n_depth)

            if self.testLeaf(node) is True:
                if node.n_depth < Params.maxHeight:
                    # ## an early leaf may spend the count budget of the levels below it
                    unused_eps = sum(level_eps[node.n_depth + 1:])
                    node.n_count = self.getCount(node, unused_eps)
                leaf_total += 1
                node.n_isLeaf = True
                self.cell_setLeaf(node)
                continue

            # ## split budget is consumed whether or not the split is useful
            node.n_budget -= 1
            split = self.getCoordinates(node)
            nw, ne, sw, se = KNode(), KNode(), KNode(), KNode()
            first_pt, second_pt, nw.n_data, ne.n_data, sw.n_data, se.n_data = split
            x_nw, y_nw = first_pt
            x_se, y_se = second_pt

            # ## child boxes: rows are [low corner, high corner], columns are (x, y)
            box = node.n_box
            x_lo, y_lo = box[0, 0], box[0, 1]
            x_hi, y_hi = box[1, 0], box[1, 1]
            nw.n_box = np.array([[x_lo, y_nw], [x_nw, y_hi]])
            ne.n_box = np.array([[x_nw, y_se], [x_hi, y_hi]])
            sw.n_box = np.array([[x_lo, y_lo], [x_se, y_nw]])
            se.n_box = np.array([[x_se, y_lo], [x_hi, y_se]])

            child_depth = node.n_depth + 1
            for child in (nw, ne, sw, se):
                child.n_depth = child_depth
                child.n_count = self.getCount(child, level_eps[child.n_depth])
                child.n_budget = node.n_budget
                pending.append(child)

            node.n_data = None  # ## the point coordinates now live in the children
            node.nw, node.ne, node.sw, node.se = nw, ne, sw, se

        logging.debug("number of leaves: %d" % leaf_total)
        logging.debug("max depth: %d" % deepest)

    def rect_intersect(self, hrect, query):
        """
        Tell whether two axis-aligned hyper-rectangles overlap in every dimension.

        Both arguments are indexed as ``[0, :]`` for the low corner and
        ``[1, :]`` for the high corner.  Rectangles that merely touch along
        a face are reported as not intersecting.
        """
        starts_beyond = query[0, :] >= hrect[1, :]
        ends_before = query[1, :] <= hrect[0, :]
        separated = np.logical_or(starts_beyond, ends_before)
        return not np.any(separated)

    def rangeCount(self, query):
        """
        Query answering function. Find the number of data points within a query rectangle.
        """
        stack = deque()
        stack.append(self.root)
        count = 0.0
        # ## Below are three variables recording the number of 1) whole leaf 2) partial leaf 3) whole internal node,
        # ## respectively, which contribute to the query answer. For debug purpose only.
        l_whole, l_part, i_whole = 0, 0, 0

        while len(stack) > 0:
            curr = stack.popleft()
            _box = curr.n_box
            if curr.n_isLeaf is True:
                frac = 1
                if self.rect_intersect(_box, query):
                    for i in range(_box.shape[1]):
                        if _box[1, i] == _box[0, i] or Params.WorstCase == True:
                            frac *= 1
                        else:
                            frac *= (min(query[1, i], _box[1, i]) - max(query[0, i], _box[0, i])) / (
                                _box[1, i] - _box[0, i])
                    count += curr.n_count * frac
                    if 1.0 - frac < 10 ** (-6):
                        l_whole += 1
                    else:
                        l_part += 1

            else:  # ## if not leaf
                bool_matrix = np.zeros((2, query.shape[1]))
                bool_matrix[0, :] = query[0, :] <= _box[0, :]
                bool_matrix[1, :] = query[1, :] >= _box[1, :]

                if np.all(bool_matrix) and self.param.useLeafOnly is False:  # ## if query range contains node range
                    count += curr.n_count
                    i_whole += 1
                else:
                    if self.rect_intersect(curr.nw.n_box, query):
                        stack.append(curr.nw)
                    if self.rect_intersect(curr.ne.n_box, query):
                        stack.append(curr.ne)
                    if self.rect_intersect(curr.sw.n_box, query):
                        stack.append(curr.sw)
                    if self.rect_intersect(curr.se.n_box, query):
                        stack.append(curr.se)

        return float(count)  # , i_whole, l_whole, l_part

    def adjustConsistency(self):
        """
        Make parent and child counts consistent under uniform per-level noise.

        Due to Michael Hay, Vibhor Rastogi, Gerome Miklau, Dan Suciu,
        Boosting the Accuracy of Differentially-Private Histograms Through
        Consistency, VLDB 2010.  The upward pass is delegated to
        ``self.root.get_z()``.  The downward pass then spreads, for every
        internal node, the difference between its count and the sum of its
        four children's counts equally over those children.
        """
        logging.debug('adjusting consistency...')
        # ## upward pass
        self.root.get_z()
        # ## downward pass
        todo = deque([self.root])
        while todo:
            node = todo.popleft()
            if node.n_isLeaf is not False:
                continue
            children = (node.nw, node.ne, node.sw, node.se)
            residual = node.n_count
            for child in children:
                residual = residual - child.n_count
            share = residual / 4.0
            for child in children:
                child.n_count += share
                todo.append(child)

    def postProcessing(self):
        """
        Rewrite every node's noisy count when the count budget differs per level.

        Three-pass post-processing due to G. Cormode, M. Procopiuc, E. Shen,
        D. Srivastava and T. Yu, Differentially Private Spatial Decompositions,
        ICDE 2012.  ``n_count`` of every node is updated in place, and an
        auxiliary ``n_F`` attribute is attached to each node visited in the
        last pass.
        """
        logging.debug("post-processing...")
        eps = self.getCountBudget()  # ## count budget, indexed by node depth
        tree_height = Params.maxHeight

        def level_weight(height):
            """Accumulate 4**i * eps[tree_height - i]**2 for i = 0..height."""
            acc = 0
            for lvl in range(height + 1):
                acc += 4 ** lvl * eps[tree_height - lvl] * eps[tree_height - lvl]
            return acc

        # ## Phase 1 (top-down): each count is scaled by its level's squared
        # ## budget and the parent's running value is added on top
        self.root.n_count *= eps[self.root.n_depth] ** 2
        todo = deque([self.root])
        while todo:
            node = todo.popleft()
            if node.n_isLeaf is not False:
                continue
            for child in (node.nw, node.ne, node.sw, node.se):
                child.n_count = node.n_count + child.n_count * (eps[child.n_depth] ** 2)
                todo.append(child)

        # ## Phase 2 (bottom-up): delegated to the node object
        self.root.update_count()

        # ## Phase 3 (top-down): normalise the root, then derive each child
        # ## from its parent's accumulated term n_F
        self.root.n_count /= level_weight(tree_height)
        self.root.n_F = 0
        todo = deque([self.root])
        while todo:
            node = todo.popleft()
            if node.n_isLeaf is not False:
                continue
            child_height = tree_height - node.n_depth - 1  # ## height of node's children
            norm = level_weight(child_height)
            for child in (node.nw, node.ne, node.sw, node.se):
                child.n_F = node.n_F + node.n_count * (eps[node.n_depth] ** 2)
                child.n_count = (child.n_count - 4 ** child_height * child.n_F) / norm
                todo.append(child)

    def pruning(self):
        """
        Collapse internal nodes whose count does not exceed ``param.minPartSize``.

        Walks the tree breadth-first from the root.  An internal node with
        ``n_count <= self.param.minPartSize`` is flagged as a leaf and its
        subtree is not visited; otherwise its four children are examined.
        Intended to run after post-processing.
        """
        logging.debug("pruning...")
        frontier = deque([self.root])
        while frontier:
            node = frontier.popleft()
            if node.n_isLeaf is not False:
                continue  # ## already a leaf, nothing below to prune
            if node.n_count <= self.param.minPartSize:
                node.n_isLeaf = True
                continue
            for child in (node.nw, node.ne, node.sw, node.se):
                frontier.append(child)
Exemple #13
0
    def buildIndex(self):
        """
        Build the tree over Hilbert-curve indices, then derive 2-D bounding boxes.

        Steps:
          1. Every point of ``self.realData`` (row 0 = x, row 1 = y) is mapped
             to a Hilbert index of order ``self.param.Res`` and the indices
             are sorted.
          2. The tree is grown breadth-first.  While growing, ``n_box`` of a
             node is a ``(low, high)`` pair of Hilbert indices.
          3. Each leaf's ``n_box`` is replaced by the 2x2 bounding box of its
             decoded points.
          4. Each internal node's ``n_box`` becomes the union of its four
             children's boxes, visiting nodes in the order given by
             ``self.cmp_node``.
        """
        # Local import: sorted(..., cmp=...) only exists on Python 2, while
        # cmp_to_key works on Python 2.7 and 3.x with the same ordering.
        import functools

        budget_c = self.getCountBudget()
        logging.debug('encoding coordinates...')
        RES = self.param.Res  # order of Hilbert curve
        ndata = self.realData.shape[1]
        # NOTE(review): hidx is float64, so indices above 2**53 would lose
        # precision -- confirm Res is small enough for 2**(2*Res) to fit.
        hidx = np.zeros(ndata)
        for i in range(ndata):
            hx, hy = self.get_Hcoord(self.realData[0, i], self.realData[1, i], RES)
            hidx[i] = self.h_encode(hx, hy, RES)
        hidx = np.sort(hidx)

        logging.debug('building index...')
        self.root.n_data = hidx
        self.root.n_box = (0, 2 ** (2 * RES) - 1)
        self.root.n_count = self.getCount(self.root, budget_c[0])

        stack = deque()
        stack.append(self.root)
        tree = [self.root]  # every node created, needed for the box pass
        leaf_li = []  # storage of all leaves
        nleaf = 0  # leaf counter
        max_depth = -1

        while stack:
            curr = stack.popleft()
            if curr.n_depth > max_depth:
                max_depth = curr.n_depth
            if self.testLeaf(curr) is True:  # curr is a leaf node
                if curr.n_depth < Params.maxHeight:
                    # a leaf above the maximum height is re-counted with the
                    # sum of the budgets of the levels it did not reach
                    remainingEps = sum(budget_c[curr.n_depth + 1:])
                    curr.n_count = self.getCount(curr, remainingEps)
                nleaf += 1
                curr.n_isLeaf = True
                leaf_li.append(curr)

            else:  # curr needs to split
                curr.n_budget -= 1  # spent whether or not the split succeeds
                tmp = self.getCoordinates(curr)
                if tmp is False:  # split failed: re-queue and test again
                    stack.append(curr)
                    continue
                nw_node, ne_node, sw_node, se_node = KNode(), KNode(), KNode(), KNode()  # create sub-nodes
                split_prm, split_sec1, split_sec2, nw_node.n_data, ne_node.n_data, sw_node.n_data, se_node.n_data = tmp

                # consecutive Hilbert-index ranges cut at the three split values
                nw_node.n_box = (curr.n_box[0], split_sec1)
                ne_node.n_box = (split_sec1, split_prm)
                sw_node.n_box = (split_prm, split_sec2)
                se_node.n_box = (split_sec2, curr.n_box[1])

                for sub_node in [nw_node, ne_node, sw_node, se_node]:
                    sub_node.n_depth = curr.n_depth + 1
                    sub_node.n_count = self.getCount(sub_node, budget_c[sub_node.n_depth])
                    sub_node.n_budget = curr.n_budget
                    stack.append(sub_node)
                    tree.append(sub_node)
                curr.n_data = None  # marks curr as internal for the box pass
                curr.nw, curr.ne, curr.sw, curr.se = nw_node, ne_node, sw_node, se_node

        # end of while
        logging.debug("number of leaves: %d" % nleaf)
        logging.debug("max depth: %d" % max_depth)

        # # convert hilbert values in leaf nodes to real coordinates and update bounding box
        logging.debug('decoding and updating bounding box...')
        for leaf in leaf_li:
            # Sentinel start values: assumes decoded coordinates lie strictly
            # inside (-1000, 1000); an empty leaf keeps this inverted box.
            bbox = np.array([[1000.0, 1000.0], [-1000.0, -1000.0]], dtype='float64')
            for hvalue in leaf.n_data:
                hx, hy = self.h_decode(int(hvalue), RES)
                x, y = self.get_Rcoord(hx, hy, RES)
                bbox[0, 0] = x if x < bbox[0, 0] else bbox[0, 0]
                bbox[1, 0] = x if x > bbox[1, 0] else bbox[1, 0]
                bbox[0, 1] = y if y < bbox[0, 1] else bbox[0, 1]
                bbox[1, 1] = y if y > bbox[1, 1] else bbox[1, 1]
            leaf.n_box = bbox

        # # update bounding box bottom-up
        # presumably cmp_node orders deeper nodes first so that children are
        # finished before their parent -- verify against cmp_node
        tree = sorted(tree, key=functools.cmp_to_key(self.cmp_node))
        logging.debug('updating box for each node in the tree...')
        for node in tree:
            if node.n_data is None:  # internal node: union of the children
                kids = (node.ne, node.nw, node.se, node.sw)
                node.n_box = np.zeros((2, 2))
                node.n_box[0, 0] = min(k.n_box[0, 0] for k in kids)
                node.n_box[0, 1] = min(k.n_box[0, 1] for k in kids)
                node.n_box[1, 0] = max(k.n_box[1, 0] for k in kids)
                node.n_box[1, 1] = max(k.n_box[1, 1] for k in kids)
Exemple #14
0
class Kd_cell(Kd_pure):
    """ Kd tree based on syntatic data generation and a grid structure. See
    Y. Xiao, L. Xiong, and C. Yuan, Differentially private data release
    through multidimensional partitioning, in SDM Workshop, VLDB, 2010
    """

    def __init__(self, data, param):
        """
        Store the parameters and raw data and create an empty root node.

        Unlike the generic tree, the points are kept on the tree itself
        (``self.realData``) rather than on the root.  The root's box and the
        grid ``self.mapp`` start as None and are filled in by
        ``synthetic_gen``.
        """
        self.param = param
        self.differ = Differential(param.Seed)
        self.mapp = None
        self.realData = data
        root = KNode()
        root.n_box = None
        root.n_budget = Params.maxHeight
        self.root = root

    def getCountBudget(self):
        """
        Split half of ``param.Eps`` into one count budget per tree level.

        Returns a list of ``Params.maxHeight + 1`` values.  With scheme
        ``'none'`` all levels get the same share; the other schemes grow
        geometrically with the level index, by a factor of 2 ('aggressive'),
        2**(1/2) ('quadratic'), 2**(1/3) ('optimal') or 2**(1/4) ('quartic').
        An unknown scheme is logged and terminates the process via
        ``sys.exit(1)``.
        """
        eps = self.param.Eps * 0.5
        levels = Params.maxHeight + 1
        scheme = self.param.geoBudget

        if scheme == 'none':
            return [eps / levels for _ in range(levels)]

        if scheme == 'aggressive':
            base = eps / (2 ** levels - 1)
            return [base * 2 ** lvl for lvl in range(levels)]

        if scheme == 'quadratic':
            base = eps * (np.sqrt(2) - 1) / (2 ** (0.5 * levels) - 1)
            return [base * 2 ** (0.5 * lvl) for lvl in range(levels)]

        if scheme == 'optimal':
            base = eps * ((2 ** (1.0 / 3)) - 1) / (2 ** ((1.0 / 3) * levels) - 1)
            return [base * 2 ** ((1.0 / 3) * lvl) for lvl in range(levels)]

        if scheme == 'quartic':
            base = eps * ((2 ** (1.0 / 4)) - 1) / (2 ** ((1.0 / 4) * levels) - 1)
            return [base * 2 ** ((1.0 / 4) * lvl) for lvl in range(levels)]

        logging.error('No such geoBudget scheme')
        sys.exit(1)

    def synthetic_gen(self):
        """
        Lay a uniform grid over the domain and store noisy per-cell counts.

        The domain ``[Params.LOW, Params.HIGH]`` is widened to whole multiples
        of ``Params.unitGrid`` and becomes the root's box.  ``self.mapp[ix, iy]``
        receives the number of points of ``self.realData`` (row 0 = x,
        row 1 = y) falling into cell (ix, iy), plus rounded noise drawn with
        half of the privacy budget.  Negative noisy counts are set to 0.

        Every column of ``self.realData`` is counted, which matches
        ``populate_synthetic_tree``.  A point lying exactly on the upper
        edge of the grid is assigned to the last cell instead of indexing
        one past the end of the map.
        """
        logging.debug('generating synthetic map...')
        data = self.realData
        unit = Params.unitGrid
        x_min = np.floor(Params.LOW[0] / unit) * unit
        x_max = np.ceil(Params.HIGH[0] / unit) * unit
        y_min = np.floor(Params.LOW[1] / unit) * unit
        y_max = np.ceil(Params.HIGH[1] / unit) * unit

        x_CELL = int(np.rint((x_max - x_min) / unit))
        y_CELL = int(np.rint((y_max - y_min) / unit))

        self.root.n_box = np.array([[x_min, y_min], [x_max, y_max]])

        # ## true counts; cells without any point simply stay at 0
        self.mapp = np.zeros((x_CELL, y_CELL))
        for i in range(data.shape[1]):  # ## populate the map
            cell_x = int(np.floor((data[0, i] - x_min) / unit))
            cell_y = int(np.floor((data[1, i] - y_min) / unit))
            # ## the grid is half-open, so fold the upper edge into the last cell
            if cell_x == x_CELL:
                cell_x = x_CELL - 1
            if cell_y == y_CELL:
                cell_y = y_CELL - 1
            self.mapp[cell_x, cell_y] += 1

        for i in range(x_CELL):  # ## perturb the counts, one noise draw per cell
            for j in range(y_CELL):
                self.mapp[i, j] += np.rint(self.differ.getNoise(1, 0.5 * self.param.Eps))
                # if noisy count is negative, ignore the noise and generate no points
                if self.mapp[i, j] < 0:
                    self.mapp[i, j] = 0

    def cell_setLeaf(self, curr):
        """Reset the count of a new leaf, discarding the value derived from the synthetic grid."""
        curr.n_count = 0

    def testLeaf(self, curr):
        """
        Decide whether ``curr`` must stop splitting.

        A node is a leaf when its count does not exceed
        ``self.param.minPartSize``, when it sits at ``Params.maxHeight``, or
        when ``uniform_test`` accepts it with threshold
        ``self.param.cellDistance``.  The checks run in that order.
        """
        if curr.n_count <= self.param.minPartSize:
            return True
        if curr.n_depth == Params.maxHeight:
            return True
        if self.uniform_test(curr, self.param.cellDistance):
            return True
        return False

    def uniform_test(self, curr, distance):
        """
        Stopping condition: are the grid cells under ``curr`` close to uniform?

        The node's box is converted to grid indices relative to the root's low
        corner.  The sum of absolute deviations of the covered cells from
        their mean is compared with ``distance``; the node counts as uniform
        unless that sum is strictly greater.
        """
        cell = Params.unitGrid
        origin_x = self.root.n_box[0, 0]
        origin_y = self.root.n_box[0, 1]
        ix_lo = int(np.rint((curr.n_box[0, 0] - origin_x) / cell))
        ix_hi = int(np.rint((curr.n_box[1, 0] - origin_x) / cell))
        iy_lo = int(np.rint((curr.n_box[0, 1] - origin_y) / cell))
        iy_hi = int(np.rint((curr.n_box[1, 1] - origin_y) / cell))

        window = self.mapp[ix_lo:ix_hi, iy_lo:iy_hi]
        mean = np.sum(window) / ((ix_hi - ix_lo) * (iy_hi - iy_lo))
        deviation = np.sum(np.abs(window - mean))
        return not deviation > distance

    def buildIndex(self):
        """
        Grow the tree breadth-first over the synthetic grid ``self.mapp``.

        The root's count is the sum of the whole grid.  A node accepted by
        ``testLeaf`` is flagged as a leaf and passed to ``cell_setLeaf``.
        Any other node is divided using the split points and the four
        child counts returned by ``getCoordinates``.
        """
        pending = deque([self.root])
        leaf_total = 0
        deepest = -1
        self.root.n_count = np.sum(self.mapp)
        while pending:
            node = pending.popleft()
            deepest = max(deepest, node.n_depth)
            if self.testLeaf(node) is True:
                leaf_total += 1
                node.n_isLeaf = True
                self.cell_setLeaf(node)
                continue

            node.n_budget -= 1
            split = self.getCoordinates(node)
            nw, ne, sw, se = KNode(), KNode(), KNode(), KNode()
            first_pt, second_pt, child_counts = split
            x_nw, y_nw = first_pt
            x_se, y_se = second_pt

            # child boxes: rows are [low corner, high corner], columns are (x, y)
            box = node.n_box
            x_lo, y_lo = box[0, 0], box[0, 1]
            x_hi, y_hi = box[1, 0], box[1, 1]
            nw.n_box = np.array([[x_lo, y_nw], [x_nw, y_hi]])
            ne.n_box = np.array([[x_nw, y_se], [x_hi, y_hi]])
            sw.n_box = np.array([[x_lo, y_lo], [x_se, y_nw]])
            se.n_box = np.array([[x_se, y_lo], [x_hi, y_se]])

            # counts arrive in the order (nw, ne, sw, se)
            for idx, child in enumerate((nw, ne, sw, se)):
                child.n_depth = node.n_depth + 1
                child.n_count = child_counts[idx]
                child.n_budget = node.n_budget
                pending.append(child)
            node.nw, node.ne, node.sw, node.se = nw, ne, sw, se

        logging.debug("number of leaves: %d" % leaf_total)
        logging.debug("max depth: %d" % deepest)

    def getCoordinates(self, curr):
        """
        Choose median-like split positions for ``curr`` from the grid counts.

        The primary split dimension is ``curr.n_depth % Params.NDIM``
        (0 = x, otherwise y).  The primary cut is placed after the first grid
        line at which the accumulated count reaches half of the node's total.
        Each of the two resulting halves is then cut the same way along the
        other dimension.

        Returns a 3-tuple: two (x, y) split points in real coordinates, and
        the counts ``(n_nw, n_ne, n_sw, n_se)`` summed from ``self.mapp``.
        """
        dim_1 = curr.n_depth % Params.NDIM  # primary split dimension
        UNIT = Params.unitGrid
        # grid indices of the node's box, relative to the root's low corner
        x_min = int(np.rint((curr.n_box[0, 0] - self.root.n_box[0, 0]) / UNIT))
        x_max = int(np.rint((curr.n_box[1, 0] - self.root.n_box[0, 0]) / UNIT))
        y_min = int(np.rint((curr.n_box[0, 1] - self.root.n_box[0, 1]) / UNIT))
        y_max = int(np.rint((curr.n_box[1, 1] - self.root.n_box[0, 1]) / UNIT))

        total = np.sum(self.mapp[x_min:x_max, y_min:y_max])
        if dim_1 == 0:
            # primary cut along x; i keeps its value after the loop (the last
            # index if no prefix reaches half the total)
            # NOTE(review): i, j and k are unbound if the matching range is empty
            for i in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_max]) >= total / 2:
                    break
            split_prm = (x_min + i + 1) * UNIT + self.root.n_box[0, 0]

            # half_1: columns up to the cut (low x); half_2: the remaining columns
            half_1 = np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_max])
            half_2 = np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_max])
            # secondary cut of the low-x half along y
            for j in range(y_max - y_min):
                if np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_min + j + 1]) >= half_1 / 2:
                    break
            split_sec1 = self.root.n_box[0, 1] + (y_min + j + 1) * UNIT
            n_sw = np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_min + j + 1])
            n_nw = np.sum(self.mapp[x_min:x_min + i + 1, y_min + j + 1:y_max])
            # secondary cut of the high-x half along y
            for k in range(y_max - y_min):
                if np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_min + k + 1]) >= half_2 / 2:
                    break
            split_sec2 = self.root.n_box[0, 1] + (y_min + k + 1) * UNIT
            n_se = np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_min + k + 1])
            n_ne = np.sum(self.mapp[x_min + i + 1:x_max, y_min + k + 1:y_max])
            # both points share the x cut; their y values are the two secondary cuts
            return (split_prm, split_sec1), (split_prm, split_sec2), (n_nw, n_ne, n_sw, n_se)

        else:
            # primary cut along y
            for i in range(y_max - y_min):
                if np.sum(self.mapp[x_min:x_max, y_min:y_min + i + 1]) >= total / 2:
                    break
            split_prm = self.root.n_box[0, 1] + (y_min + i + 1) * UNIT

            # half_1: rows up to the cut (low y); half_2: the remaining rows
            half_1 = np.sum(self.mapp[x_min:x_max, y_min:y_min + i + 1])
            half_2 = np.sum(self.mapp[x_min:x_max, y_min + i + 1:y_max])
            # secondary cut of the low-y half along x
            for j in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + j + 1, y_min:y_min + i + 1]) >= half_1 / 2:
                    break
            split_sec1 = (x_min + j + 1) * UNIT + self.root.n_box[0, 0]
            n_sw = np.sum(self.mapp[x_min:x_min + j + 1, y_min:y_min + i + 1])
            n_se = np.sum(self.mapp[x_min + j + 1:x_max, y_min:y_min + i + 1])
            # secondary cut of the high-y half along x
            for k in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + k + 1, y_min + i + 1:y_max]) >= half_2 / 2:
                    break
            split_sec2 = (x_min + k + 1) * UNIT + self.root.n_box[0, 0]
            n_nw = np.sum(self.mapp[x_min:x_min + k + 1, y_min + i + 1:y_max])
            n_ne = np.sum(self.mapp[x_min + k + 1:x_max, y_min + i + 1:y_max])
            # both points share the y cut; the high-y half's x cut comes first
            return (split_sec2, split_prm), (split_sec1, split_prm), (n_nw, n_ne, n_sw, n_se)


    def populate_synthetic_tree(self):
        """
        Count the real data points in the tree's leaves and add noise.

        For every point of ``self.realData`` (row 0 = x, row 1 = y), the node
        returned by ``self.root.find_subnode`` has its count incremented.
        Every leaf's count is then perturbed with noise drawn using half of
        ``self.param.Eps``.
        """
        logging.debug('populating synthetic tree...')
        a_data = self.realData
        ndata = a_data.shape[1]
        for i in range(ndata):
            ptx = a_data[0, i]
            pty = a_data[1, i]
            # presumably the leaf whose box contains (ptx, pty) -- verify
            # against find_subnode
            leaf = self.root.find_subnode(ptx, pty)
            leaf.n_count += 1

        # traverse the tree and update leaf counts
        stack = deque()
        stack.append(self.root)
        while len(stack) > 0:
            cur_node = stack.popleft()
            if cur_node.n_isLeaf is True:  # leaf
                cur_node.n_count += self.differ.getNoise(1, 0.5 * self.param.Eps)
            else:
                # internal node: keep descending, its own count is left untouched
                stack.append(cur_node.nw)
                stack.append(cur_node.ne)
                stack.append(cur_node.sw)
                stack.append(cur_node.se)