Example #1
class KTree(object):
    """Generic tree template"""

    def __init__(self, data, param):
        self.param = param
        self.differ = Differential(self.param.Seed)
        # ## initialize the root
        self.root = KNode()
        self.root.n_data = data
        self.root.n_box = np.array([Params.LOW, Params.HIGH])
        self.root.n_budget = Params.maxHeight

    def getSplitBudget(self):
        """return a list of h budget values for split"""
        raise NotImplementedError

    def getCountBudget(self):
        """return a list of (h+1) budget values for noisy count"""
        raise NotImplementedError

    def getNoisyMedian(self, array, left, right, epsilon):
        """return the split value of an array"""
        raise NotImplementedError

    def getCoordinates(self, curr):
        """
        return the coordinate of lower-right point of the NW sub-node
        and the upper-left point of the SW sub-node and the data points
        in the four subnodes, i.e.
        return (x_nw,y_nw),(x_se,y_se), nw_data, ne_data, sw_data, se_data
        """
        raise NotImplementedError

    def getSplit(self, array, left, right, epsilon):
        """
        return the split point given an array, may be data-independent or
        true median or noisy median, depending on the type of the tree
        """
        raise NotImplementedError

    def getCount(self, curr, epsilon):
        """ return true count or noisy count of a node, depending on epsilon"""
        if curr.n_data is None:
            count = 0
        else:
            count = curr.n_data.shape[1]
        if epsilon < 10 ** (-6):
            return count
        else:
            return count + self.differ.getNoise(1, epsilon)

    def testLeaf(self, curr):
        """ test whether a node should be a leaf node """
        if (curr.n_depth == Params.maxHeight) or \
                (curr.n_budget <= 0) or \
                (curr.n_data is None or curr.n_data.shape[1] == 0) or \
                (curr.n_count <= self.param.minPartSize):
            return True
        return False

    def cell_setLeaf(self, curr):
        """ will be overrided in kd_cell """
        return

    def buildIndex(self):
        """ Function to build the tree structure, fanout = 4 by default for spatial (2D) data """
        budget_c = self.getCountBudget()
        self.root.n_count = self.getCount(self.root, budget_c[0])  # ## add noisy count to root
        stack = deque()
        stack.append(self.root)
        nleaf = 0  # ## leaf counter
        max_depth = -1
        # ## main loop
        while len(stack) > 0:
            curr = stack.popleft()
            if curr.n_depth > max_depth:
                max_depth = curr.n_depth

            if self.testLeaf(curr) is True:  # ## curr is a leaf node
                if curr.n_depth < Params.maxHeight:  # ## if a node becomes a leaf before maxHeight, it can use the remaining count budget
                    remainingEps = sum(budget_c[curr.n_depth + 1:])
                    curr.n_count = self.getCount(curr, remainingEps)
                nleaf += 1
                curr.n_isLeaf = True
                self.cell_setLeaf(curr)

            else:  # ## curr needs to split
                curr.n_budget -= 1  # ## some budget is used regardless of whether the split succeeds
                tmp = self.getCoordinates(curr)
                nw_node, ne_node, sw_node, se_node = KNode(), KNode(), KNode(), KNode()  # create sub-nodes
                nw_coord, ne_coord, nw_node.n_data, ne_node.n_data, sw_node.n_data, se_node.n_data = tmp
                x_nw, y_nw = nw_coord
                x_se, y_se = ne_coord
                # ## update bounding box, depth, count, budget for the four subnodes
                nw_node.n_box = np.array([[curr.n_box[0, 0], y_nw], [x_nw, curr.n_box[1, 1]]])
                ne_node.n_box = np.array([[x_nw, y_se], [curr.n_box[1, 0], curr.n_box[1, 1]]])
                sw_node.n_box = np.array([[curr.n_box[0, 0], curr.n_box[0, 1]], [x_se, y_nw]])
                se_node.n_box = np.array([[x_se, curr.n_box[0, 1]], [curr.n_box[1, 0], y_se]])

                for sub_node in [nw_node, ne_node, sw_node, se_node]:
                    sub_node.n_depth = curr.n_depth + 1
                    # if (sub_node.n_depth == Params.maxHeight and sub_node.n_data is not None):
                    # print len(sub_node.n_data[0])
                    sub_node.n_count = self.getCount(sub_node, budget_c[sub_node.n_depth])
                    sub_node.n_budget = curr.n_budget
                    stack.append(sub_node)

                curr.n_data = None  # ## do not need the data points coordinates now
                curr.nw, curr.ne, curr.sw, curr.se = nw_node, ne_node, sw_node, se_node
        # end of while

        logging.debug("number of leaves: %d" % nleaf)
        logging.debug("max depth: %d" % max_depth)

    def rect_intersect(self, hrect, query):
        """
        checks if the hyper-rectangle intersects with the
        hyper-rectangle defined by the query in every dimension
    
        """
        bool_m1 = query[0, :] >= hrect[1, :]
        bool_m2 = query[1, :] <= hrect[0, :]
        bool_m = np.logical_or(bool_m1, bool_m2)
        if np.any(bool_m):
            return False
        else:
            return True

    def rangeCount(self, query):
        """
        Query answering function. Find the number of data points within a query rectangle.
        """
        stack = deque()
        stack.append(self.root)
        count = 0.0
        # ## Below are three variables recording the number of 1) whole leaf 2) partial leaf 3) whole internal node,
        # ## respectively, which contribute to the query answer. For debug purpose only.
        l_whole, l_part, i_whole = 0, 0, 0

        while len(stack) > 0:
            curr = stack.popleft()
            _box = curr.n_box
            if curr.n_isLeaf is True:
                frac = 1
                if self.rect_intersect(_box, query):
                    for i in range(_box.shape[1]):
                        if _box[1, i] == _box[0, i] or Params.WorstCase == True:
                            frac *= 1
                        else:
                            frac *= (min(query[1, i], _box[1, i]) - max(query[0, i], _box[0, i])) / (
                                _box[1, i] - _box[0, i])
                    count += curr.n_count * frac
                    if 1.0 - frac < 10 ** (-6):
                        l_whole += 1
                    else:
                        l_part += 1

            else:  # ## if not leaf
                bool_matrix = np.zeros((2, query.shape[1]))
                bool_matrix[0, :] = query[0, :] <= _box[0, :]
                bool_matrix[1, :] = query[1, :] >= _box[1, :]

                if np.all(bool_matrix) and self.param.useLeafOnly is False:  # ## if query range contains node range
                    count += curr.n_count
                    i_whole += 1
                else:
                    if self.rect_intersect(curr.nw.n_box, query):
                        stack.append(curr.nw)
                    if self.rect_intersect(curr.ne.n_box, query):
                        stack.append(curr.ne)
                    if self.rect_intersect(curr.sw.n_box, query):
                        stack.append(curr.sw)
                    if self.rect_intersect(curr.se.n_box, query):
                        stack.append(curr.se)

        return float(count)  # , i_whole, l_whole, l_part

    def adjustConsistency(self):
        """ 
        Post processing for uniform noise across levels. Due to 
        Michael Hay, Vibhor Rastogi, Gerome Miklau, Dan Suciu, 
        Boosting the Accuracy of Differentially-Private Histograms Through Consistency,
        VLDB 2010
        """
        logging.debug('adjusting consistency...')
        # ## upward pass
        self.root.get_z()
        # ## downward pass
        queue = deque()
        queue.append(self.root)
        while len(queue) > 0:
            curr = queue.popleft()
            if curr.n_isLeaf is False:
                adjust = (curr.n_count - curr.nw.n_count - curr.ne.n_count - curr.sw.n_count - curr.se.n_count) / 4.0
                for subnode in [curr.nw, curr.ne, curr.sw, curr.se]:
                    subnode.n_count += adjust
                    queue.append(subnode)

    def postProcessing(self):
        """ 
        Post processing for general noise distribution across levels. Due to
        G. Cormode, M. Procopiuc, E. Shen, D. Srivastava and T. Yu, 
        Differentially Private Spatial Decompositions, ICDE 2012.
        """
        logging.debug("post-processing...")
        budget = self.getCountBudget()  # ## count budget for h+1 levels
        H = Params.maxHeight
        # ## Phase 1 (top-down)
        queue = deque()
        self.root.n_count *= budget[self.root.n_depth] ** 2
        queue.append(self.root)
        while len(queue) > 0:
            curr = queue.popleft()
            if curr.n_isLeaf is False:
                for subnode in [curr.nw, curr.ne, curr.sw, curr.se]:
                    subnode.n_count = curr.n_count + subnode.n_count * (budget[subnode.n_depth] ** 2)
                    queue.append(subnode)
        # ## Phase 2 (bottom-up)
        self.root.update_count()
        # ## Phase 3 (top-down)
        queue = deque()
        E_root = 0
        for i in range(H + 1):
            E_root += 4 ** i * budget[H - i] * budget[H - i]
        self.root.n_count /= E_root
        self.root.n_F = 0
        queue.append(self.root)
        while len(queue) > 0:
            curr = queue.popleft()
            if curr.n_isLeaf is False:
                h = H - curr.n_depth - 1  # ## height of curr's children
                E_h = 0
                for i in range(h + 1):
                    E_h += 4 ** i * budget[H - i] * budget[H - i]
                for subnode in [curr.nw, curr.ne, curr.sw, curr.se]:
                    subnode.n_F = curr.n_F + curr.n_count * (budget[curr.n_depth] ** 2)
                    subnode.n_count = (subnode.n_count - 4 ** h * subnode.n_F) / E_h
                    queue.append(subnode)

    def pruning(self):
        """
        If the tree is grown without the minPartSize stopping condition, prune it here after post-processing
        """
        logging.debug("pruning...")
        queue = deque()
        queue.append(self.root)
        while len(queue) > 0:
            curr = queue.popleft()
            if curr.n_isLeaf is False:
                if curr.n_count <= self.param.minPartSize:
                    curr.n_isLeaf = True
                else:
                    queue.append(curr.nw)
                    queue.append(curr.ne)
                    queue.append(curr.sw)
                    queue.append(curr.se)
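
The consistency adjustment above distributes the mismatch between a parent's noisy count and the sum of its four children equally among the children. A minimal standalone sketch of that downward step (illustrative only, not part of the original class):

def _consistency_step(parent_count, child_counts):
    """Adjust the four child counts so they sum to the parent count (fanout 4)."""
    adjust = (parent_count - sum(child_counts)) / 4.0
    return [c + adjust for c in child_counts]

# e.g. a parent of 100 with noisy children [30, 20, 25, 35] (sum 110)
# becomes [27.5, 17.5, 22.5, 32.5], which again sums to 100.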
Example #2
class Kd_cell(Kd_pure):
    """ Kd tree based on syntatic data generation and a grid structure. See
    Y. Xiao, L. Xiong, and C. Yuan, Differentially private data release
    through multidimensional partitioning, in SDM Workshop, VLDB, 2010
    """
    def __init__(self, data, param):
        self.param = param
        self.differ = Differential(self.param.Seed)
        self.mapp = None
        self.root = KNode()
        self.realData = data
        self.root.n_box = None
        self.root.n_budget = Params.maxHeight

    def getCountBudget(self):
        count_eps = self.param.Eps * 0.5
        H = Params.maxHeight
        if self.param.geoBudget == 'none':
            return [count_eps / (H + 1) for _ in range(H + 1)]
        elif self.param.geoBudget == 'aggressive':
            unit = count_eps / (2**(H + 1) - 1)
            return [unit * 2**i for i in range(H + 1)]
        elif self.param.geoBudget == 'quadratic':
            unit = count_eps * (np.sqrt(2) - 1) / (2**(0.5 * (H + 1)) - 1)
            return [unit * 2**(0.5 * i) for i in range(H + 1)]
        elif self.param.geoBudget == 'optimal':
            unit = count_eps * ((2**(1.0 / 3)) - 1) / (2**((1.0 / 3) *
                                                           (H + 1)) - 1)
            return [unit * 2**((1.0 / 3) * i) for i in range(H + 1)]
        elif self.param.geoBudget == 'quartic':
            unit = count_eps * ((2**(1.0 / 4)) - 1) / (2**((1.0 / 4) *
                                                           (H + 1)) - 1)
            return [unit * 2**((1.0 / 4) * i) for i in range(H + 1)]
        else:
            logging.error('No such geoBudget scheme')
            sys.exit(1)

    def synthetic_gen(self):
        """Apply a grid structure on the domain and perturb the count using half
        of the available privacy budget """
        logging.debug('generating synthetic map...')
        data = self.realData
        unit = Params.unitGrid
        x_min = np.floor(Params.LOW[0] / unit) * unit
        x_max = np.ceil(Params.HIGH[0] / unit) * unit
        y_min = np.floor(Params.LOW[1] / unit) * unit
        y_max = np.ceil(Params.HIGH[1] / unit) * unit

        x_CELL = int(np.rint((x_max - x_min) / unit))
        y_CELL = int(np.rint((y_max - y_min) / unit))

        self.root.n_box = np.array([[x_min, y_min], [x_max, y_max]])

        self.mapp = np.zeros(
            (x_CELL, y_CELL)) - 1  # ## initialize every cell with -1
        for i in range(Params.NDATA):  # ## populate the map
            point = data[:, i]
            cell_x = int(np.floor((point[0] - x_min) / unit))
            cell_y = int(np.floor((point[1] - y_min) / unit))
            if self.mapp[cell_x, cell_y] != -1:
                self.mapp[cell_x, cell_y] += 1
            else:
                self.mapp[cell_x, cell_y] = 1

        for i in range(x_CELL):  # ## perturb the counts
            for j in range(y_CELL):
                if self.mapp[i, j] != -1:
                    self.mapp[i, j] += np.rint(
                        self.differ.getNoise(1, 0.5 * self.param.Eps))
                else:
                    self.mapp[i, j] = np.rint(
                        self.differ.getNoise(1, 0.5 * self.param.Eps))
                # if noisy count is negative, ignore the noise and generate no points
                if self.mapp[i, j] < 0:
                    self.mapp[i, j] = 0

    def cell_setLeaf(self, curr):
        """ Throw away the counts based on the syntatic data """
        curr.n_count = 0
        return

    def testLeaf(self, curr):
        if (curr.n_count <= self.param.minPartSize) or (
                curr.n_depth == Params.maxHeight) or (self.uniform_test(
                    curr, self.param.cellDistance)):
            return True
        return False

    def uniform_test(self, curr, distance):
        """ One of the stopping conditions: cell is uniform according to some threshold 'distance') """
        unit = Params.unitGrid
        x_min = int(np.rint((curr.n_box[0, 0] - self.root.n_box[0, 0]) / unit))
        x_max = int(np.rint((curr.n_box[1, 0] - self.root.n_box[0, 0]) / unit))
        y_min = int(np.rint((curr.n_box[0, 1] - self.root.n_box[0, 1]) / unit))
        y_max = int(np.rint((curr.n_box[1, 1] - self.root.n_box[0, 1]) / unit))
        data = self.mapp[x_min:x_max, y_min:y_max]
        total = np.sum(data)
        avg = total / ((x_max - x_min) * (y_max - y_min))
        dist = np.sum(np.abs(data - avg))
        if dist > distance:
            return False
        else:
            return True

    def buildIndex(self):
        stack = deque()
        stack.append(self.root)
        nleaf = 0  # leaf counter
        max_depth = -1
        self.root.n_count = np.sum(self.mapp)
        while len(stack) > 0:
            curr = stack.popleft()
            if curr.n_depth > max_depth:
                max_depth = curr.n_depth
            if self.testLeaf(curr) is True:  # curr is a leaf node
                nleaf += 1
                curr.n_isLeaf = True
                self.cell_setLeaf(curr)
            else:  # curr needs to split
                curr.n_budget -= 1
                tmp = self.getCoordinates(curr)
                nw_node, ne_node, sw_node, se_node = KNode(), KNode(), KNode(
                ), KNode()  # create sub-nodes
                nw_coord, ne_coord, count_tmp = tmp
                x_nw, y_nw = nw_coord
                x_se, y_se = ne_coord

                nw_node.n_box = np.array([[curr.n_box[0, 0], y_nw],
                                          [x_nw, curr.n_box[1, 1]]])
                ne_node.n_box = np.array([[x_nw, y_se],
                                          [curr.n_box[1, 0], curr.n_box[1,
                                                                        1]]])
                sw_node.n_box = np.array([[curr.n_box[0, 0], curr.n_box[0, 1]],
                                          [x_se, y_nw]])
                se_node.n_box = np.array([[x_se, curr.n_box[0, 1]],
                                          [curr.n_box[1, 0], y_se]])

                c_t = 0
                for sub_node in [nw_node, ne_node, sw_node, se_node]:
                    sub_node.n_depth = curr.n_depth + 1
                    sub_node.n_count = count_tmp[c_t]
                    sub_node.n_budget = curr.n_budget
                    stack.append(sub_node)
                    c_t += 1
                curr.nw, curr.ne, curr.sw, curr.se = nw_node, ne_node, sw_node, se_node

        # end of while
        logging.debug("number of leaves: %d" % nleaf)
        logging.debug("max depth: %d" % max_depth)

    def getCoordinates(self, curr):
        dim_1 = curr.n_depth % Params.NDIM  # primary split dimension
        UNIT = Params.unitGrid
        x_min = int(np.rint((curr.n_box[0, 0] - self.root.n_box[0, 0]) / UNIT))
        x_max = int(np.rint((curr.n_box[1, 0] - self.root.n_box[0, 0]) / UNIT))
        y_min = int(np.rint((curr.n_box[0, 1] - self.root.n_box[0, 1]) / UNIT))
        y_max = int(np.rint((curr.n_box[1, 1] - self.root.n_box[0, 1]) / UNIT))

        total = np.sum(self.mapp[x_min:x_max, y_min:y_max])
        if dim_1 == 0:
            for i in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + i + 1,
                                    y_min:y_max]) >= total / 2:
                    break
            split_prm = (x_min + i + 1) * UNIT + self.root.n_box[0, 0]

            half_1 = np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_max])
            half_2 = np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_max])
            for j in range(y_max - y_min):
                if np.sum(self.mapp[x_min:x_min + i + 1,
                                    y_min:y_min + j + 1]) >= half_1 / 2:
                    break
            split_sec1 = self.root.n_box[0, 1] + (y_min + j + 1) * UNIT
            n_sw = np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_min + j + 1])
            n_nw = np.sum(self.mapp[x_min:x_min + i + 1, y_min + j + 1:y_max])
            for k in range(y_max - y_min):
                if np.sum(self.mapp[x_min + i + 1:x_max,
                                    y_min:y_min + k + 1]) >= half_2 / 2:
                    break
            split_sec2 = self.root.n_box[0, 1] + (y_min + k + 1) * UNIT
            n_se = np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_min + k + 1])
            n_ne = np.sum(self.mapp[x_min + i + 1:x_max, y_min + k + 1:y_max])
            return (split_prm, split_sec1), (split_prm,
                                             split_sec2), (n_nw, n_ne, n_sw,
                                                           n_se)

        else:
            for i in range(y_max - y_min):
                if np.sum(self.mapp[x_min:x_max,
                                    y_min:y_min + i + 1]) >= total / 2:
                    break
            split_prm = self.root.n_box[0, 1] + (y_min + i + 1) * UNIT

            half_1 = np.sum(self.mapp[x_min:x_max, y_min:y_min + i + 1])
            half_2 = np.sum(self.mapp[x_min:x_max, y_min + i + 1:y_max])
            for j in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + j + 1,
                                    y_min:y_min + i + 1]) >= half_1 / 2:
                    break
            split_sec1 = (x_min + j + 1) * UNIT + self.root.n_box[0, 0]
            n_sw = np.sum(self.mapp[x_min:x_min + j + 1, y_min:y_min + i + 1])
            n_se = np.sum(self.mapp[x_min + j + 1:x_max, y_min:y_min + i + 1])
            for k in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + k + 1,
                                    y_min + i + 1:y_max]) >= half_2 / 2:
                    break
            split_sec2 = (x_min + k + 1) * UNIT + self.root.n_box[0, 0]
            n_nw = np.sum(self.mapp[x_min:x_min + k + 1, y_min + i + 1:y_max])
            n_ne = np.sum(self.mapp[x_min + k + 1:x_max, y_min + i + 1:y_max])
            return (split_sec2, split_prm), (split_sec1,
                                             split_prm), (n_nw, n_ne, n_sw,
                                                          n_se)

    def populate_synthetic_tree(self):
        """ Populate real data to the synthetic tree """
        logging.debug('populating synthetic tree...')
        a_data = self.realData
        ndata = a_data.shape[1]
        for i in range(ndata):
            ptx = a_data[0, i]
            pty = a_data[1, i]
            leaf = self.root.find_subnode(ptx, pty)
            leaf.n_count += 1

        # traverse the tree and update leaf counts
        stack = deque()
        stack.append(self.root)
        while len(stack) > 0:
            cur_node = stack.popleft()
            if cur_node.n_isLeaf is True:  # leaf
                cur_node.n_count += self.differ.getNoise(
                    1, 0.5 * self.param.Eps)
            else:
                stack.append(cur_node.nw)
                stack.append(cur_node.ne)
                stack.append(cur_node.sw)
                stack.append(cur_node.se)
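
The geometric schemes in getCountBudget above split the count budget across H + 1 levels so that the per-level values form a geometric series summing back to count_eps. A standalone sketch of that allocation (hypothetical helper; 'aggressive' corresponds to ratio 2, 'quadratic' to 2 ** 0.5, 'optimal' to 2 ** (1.0 / 3), 'quartic' to 2 ** (1.0 / 4)):

def geo_budget(count_eps, H, ratio):
    """Per-level budgets unit * ratio**i for i = 0..H, which sum to count_eps."""
    unit = count_eps * (ratio - 1) / (ratio ** (H + 1) - 1)
    return [unit * ratio ** i for i in range(H + 1)]

# sanity check: abs(sum(geo_budget(0.5, 6, 2 ** (1.0 / 3))) - 0.5) < 1e-12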
class KalmanFilterPID(Parser):
    """ generated source for class KalmanFilterPID """

    def __init__(self, param):
        """ initialize the Kalman filter, PID controller, and sampling parameters """
        Parser.__init__(self)

        self.param = param
        self.differ = Differential(self.param.Seed)

        self.predict = []
        self.interval = None

        # Kalman Filter params
        self.P = 100  # estimation error covariance (over all time instances)
        self.Q = 1000  # process noise (synthetic data)
        self.R = 1000000  # measurement noise (optimal for alpha = 1, synthetic data)
        self.K = 0  # Kalman gain

        # PID control params - default
        self.Cp = 0.9  # proportional gain, to keep output proportional to current error
        self.Ci = 0.1  # integral gain, to eliminate offset
        self.Cd = 0.0  # derivative gain, to ensure stability - prevent large error in future

        # fixed internally
        self.theta = 1  # magnitude of changes
        self.xi = 0.2  # gamma (10%)
        self.minIntvl = 1  # make sure the interval is greater than 1

        self.windowPID = 5  # I(integration) window
        self.ratioM = 0.2  # sampling rate

        #
        self.isSampling = False


    def adjustParams(self):
        # adjust params
        if self.ratioM < 0.1:
            self.theta = 20
        if 0.1 <= self.ratioM < 0.2:
            self.theta = 14
        if 0.2 <= self.ratioM < 0.3:
            self.theta = 2
        if 0.3 <= self.ratioM < 0.4:
            self.theta = 0.5
        if 0.4 <= self.ratioM < 0.5:
            self.theta = 0.3
        if 0.5 <= self.ratioM:
            self.theta = 0.1

    # test
    @classmethod
    def main(self, args):
        """ generated source for method main """
        if len(args) < 5:
            print "Usage: python KalmanFilterPID.py input output privacy-budget process-variance Cp(optional) Ci(optional) Cd(optional)"
            sys.exit()

        output = open(args[2], "w")
        budget = float(args[3])
        Q = float(args[4])
        if budget <= 0 or Q <= 0:
            print "Usage: privacy-budget AND process-variance are positive values"
            sys.exit()

        p = Params(1000)
        kfPID = KalmanFilterPID(p)
        kfPID.setTotalBudget(budget)
        kfPID.setQ(Q)

        kfPID.orig = Parser.getData(args[1])

        kfPID.publish = [None] * len(kfPID.orig)

        # adjust R based on T and alpha
        kfPID.setR(len(kfPID.orig) * len(kfPID.orig) / (0.0 + budget * budget))

        # set optional control gains
        if len(args) >= 6:
            d = float(args[5])
            if d > 1:
                d = 1
            kfPID.setCp(d)

        if len(args) >= 7:
            d = float(args[6])
            if d + kfPID.Cp > 1:
                d = 1 - kfPID.Cp
            kfPID.setCi(d)
        else:
            kfPID.setCi(1 - kfPID.Cp)

        if len(args) >= 8:
            d = float(args[7])
            if d + kfPID.Cp + kfPID.Ci > 1:
                d = 1 - kfPID.Cp - kfPID.Ci
            kfPID.setCd(d)
        else:
            kfPID.setCd(1 - kfPID.Cp - kfPID.Ci)

        # kfPID.adjustParams()

        start = time.time()
        kfPID.publishCounts()
        end = time.time()

        Parser.outputData(output, kfPID.publish)

        print "Method:\tKalman Filter with Adaptive Sampling"
        print "Data Series Length:\t" + str(len(kfPID.orig))
        print "Queries Issued:\t" + str(kfPID.query.count(1))
        print "Privacy Budget Used:\t" + str(kfPID.query.count(1) * kfPID.epsilon)
        print "Average Relative Error:\t" + str(kfPID.getRelError())
        print "Time Used (in second):\t" + str(end - start)

    def kalmanFilter(self, orig, budget, samplingRate=None):
        self.totalBudget = budget
        self.orig = orig
        if samplingRate is not None:
            self.isSampling = True
            self.ratioM = samplingRate
        else:
            self.isSampling = False

        # self.adjustParams()

        self.publish = [None] * len(self.orig)

        # adjust R based on T and alpha
        self.setR(len(self.orig) * len(self.orig) / (0.0 + budget * budget))

        self.publishCounts()

        return self.publish

    def getCount(self, value, epsilon):
        """
        return true count or noisy count of a node, depending on epsilon.
        Note that the noisy count can be negative
        """
        if epsilon < 10 ** (-8):
            return value
        else:
            return value + self.differ.getNoise(1, epsilon)  # sensitivity is 1


    # data publication procedure
    def publishCounts(self):
        """ generated source for method publish """

        self.query = BitArray(len(self.orig))
        self.predict = [None] * len(self.orig)

        # recalculate individual budget based on M
        if (self.isSampling):
            M = int(self.ratioM * (len(self.orig)))  # 0.25 optimal percentile
        else:
            M = len(self.orig)

        if M <= 0:
            M = 1
        self.epsilon = (self.totalBudget + 0.0) / M

        # error = 0
        self.interval = 1
        nextQuery = max(1, self.windowPID) + self.interval - 1

        for i in range(len(self.orig)):
            if i == 0:
                # the first time instance
                self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                self.query[i] = 1
                self.correctKF(i, 0)
            else:
                predct = self.predictKF(i)
                self.predict[i] = predct
                if self.query.count(1) < self.windowPID and self.query.count(1) < M:
                    # still filling the initial PID window: query even though i is not a PID-chosen sampling point

                    self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                    self.query[i] = 1

                    # update count using observation
                    self.correctKF(i, predct)
                elif i == nextQuery and self.query.count(1) < M:
                    # if i is the sampling point

                    # query
                    self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                    self.query[i] = 1

                    # update count using observation
                    self.correctKF(i, predct)

                    # update freq
                    if (self.isSampling):
                        ratio = self.PID(i)
                        frac = min(20, (ratio - self.xi) / self.xi)
                        deltaI = self.theta * (1 - math.exp(frac))
                        deltaI = int(deltaI) + (random.random() < deltaI - int(deltaI))
                        self.interval += deltaI
                    else:
                        self.interval = 1

                    if self.interval < self.minIntvl:
                        self.interval = self.minIntvl
                    nextQuery += self.interval  # nextQuery is ns in the paper
                else:
                    # --> predict
                    self.publish[i] = predct

                    # del self.orig
                    # del self.predict
                    # del self.query

                    # if self.isPostProcessing:
                    # self.postProcessing()

    # def postProcessing(self):
    # print len(self.samples), self.samples
    # remainedEps = self.totalBudget - len(self.samples) * self.epsilon
    # self.epsilon = self.epsilon + remainedEps/len(self.samples)
    #
    # # recompute noisy counts
    #     prev = 0
    #     for i in self.samples:
    #         self.publish[i] = self.getCount(self.orig[i], self.epsilon)
    #         if i > prev + 1:
    #             self.publish[prev + 1 : i] = [self.publish[prev]] * (i - prev - 1)
    #         prev = i

    def setR(self, r):
        """ set the measurement noise variance R """
        self.R = r

    def setQ(self, q):
        """ set the process noise variance Q """
        self.Q = q

    def setCp(self, cp):
        """ set the proportional gain Cp """
        self.Cp = cp

    def setCi(self, ci):
        """ set the integral gain Ci """
        self.Ci = ci

    def setCd(self, cd):
        """ set the derivative gain Cd """
        self.Cd = cd

    # prediction step
    def predictKF(self, curr):
        """ generated source for method predictKF """
        # predict using Kalman Filter
        lastValue = self.getLastQuery(curr)

        # project estimation error
        self.P += self.Q  # Q is gaussian noise
        return lastValue

    # correction step
    def correctKF(self, curr, predict):
        """ generated source for method correctKF """
        self.K = (self.P + 0.0) / (self.P + self.R)
        correct = predict + self.K * (self.publish[curr] - predict)

        # publish[curr] = Math.max((int) correct, 0)
        if curr > 0:
            # only correct from 2nd values
            self.publish[curr] = correct

        # print correct, "\t", self.publish[curr], self.K, self.P

        # update estimation error variance
        self.P *= (1 - self.K)

    def getLastQuery(self, curr):
        """ generated source for method getLastQuery """
        for i in reversed(range(curr)):
            if self.query[i]:
                break
        return self.publish[i]

    # adaptive sampling - return feedback error
    def PID(self, curr):
        """ generated source for method PID """
        sum = 0
        lastValue = 0
        change = 0
        timeDiff = 0
        next = curr
        for j in reversed(range(self.windowPID - 1)):
            index = next
            while index >= 0:
                if self.query[index]:
                    next = index - 1  # the last nextQuery
                    break
                index -= 1
            if j == self.windowPID - 1:
                lastValue = abs(self.publish[index] - self.predict[index]) / (0.0 + max(self.publish[index], 1))
                change = abs(self.publish[index] - self.predict[index]) / (0.0 + max(self.publish[index], 1))
                timeDiff = index
            if j == self.windowPID - 2:
                change -= abs(self.publish[index] - self.predict[index]) / (0.0 + max(self.publish[index], 1))
                timeDiff -= index
            sum += (abs(self.publish[index] - self.predict[index]) / (0.0 + max(self.publish[index], 1)))

        ratio = self.Cp * lastValue + self.Ci * sum + self.Cd * change / (0.0 + timeDiff)
        return ratio
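
predictKF and correctKF above implement a scalar Kalman filter: the prediction carries the last published value forward while the error variance P grows by the process noise Q, and the correction blends the prediction with the new noisy observation using the gain K = P / (P + R). A self-contained sketch of one such step (illustrative only):

def kalman_step(last_published, noisy_obs, P, Q, R):
    """One predict/correct cycle of the scalar Kalman filter used above."""
    P = P + Q  # predict: error variance grows by the process noise
    K = float(P) / (P + R)  # Kalman gain
    corrected = last_published + K * (noisy_obs - last_published)
    P = P * (1 - K)  # correct: error variance shrinks
    return corrected, P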
class Hilbert(Kd_standard):
    """ Hilbert R-tree """
    def __init__(self, data, param):
        self.param = param
        self.differ = Differential(self.param.Seed)
        self.root = KNode()
        self.realData = data
        self.root.n_budget = Params.maxHeight

    def h_encode(self, x, y, r):
        """ (x,y) -> value h in Hilbert space, r is the resolution of the Hilbert curve """
        mask = (1 << r) - 1
        heven = x ^ y
        notx = ~x & mask
        noty = ~y & mask
        temp = notx ^ y
        v0, v1 = 0, 0
        for k in range(r - 1):
            v1 = ((v1 & heven) | ((v0 ^ noty) & temp)) >> 1
            v0 = ((v0 & (v1 ^ notx)) | (~v0 & (v1 ^ noty))) >> 1
        hodd = (~v0 & (v1 ^ x)) | (v0 & (v1 ^ noty))
        return self.interleaveBits(hodd, heven)

    def h_decode(self, h, r):
        """ h -> (x,y) """
        heven, hodd = self.deleaveBits(h)
        mask = (1 << r) - 1
        v0, v1 = 0, 0
        temp1 = ~(heven | hodd) & mask
        temp0 = ~(heven ^ hodd) & mask
        for k in range(r - 1):
            v1 = (v1 ^ temp1) >> 1
            v0 = (v0 ^ temp0) >> 1
        return (v0 & ~heven) ^ v1 ^ hodd, (v0 | heven) ^ v1 ^ hodd

    def interleaveBits(self, hodd, heven):
        val = 0
        maxx = max(hodd, heven)
        n = 0
        while maxx > 0:
            n += 1
            maxx >>= 1
        for i in range(n):
            bitMask = 1 << i
            a = 1 << (2 * i) if (heven & bitMask) else 0
            b = 1 << (2 * i + 1) if (hodd & bitMask) else 0
            val += a + b
        return val

    def deleaveBitsOdd(self, x):
        x &= 0x5555555555555555
        x = (x | (x >> 1)) & 0x3333333333333333
        x = (x | (x >> 2)) & 0x0F0F0F0F0F0F0F0F
        x = (x | (x >> 4)) & 0x00FF00FF00FF00FF
        x = (x | (x >> 8)) & 0x0000FFFF0000FFFF
        x = (x | (x >> 16)) & 0x00000000FFFFFFFF
        return x

    def deleaveBits(self, x):
        return self.deleaveBitsOdd(x), self.deleaveBitsOdd(x >> 1)

    def get_Hcoord(self, x, y, R):
        hx = int((x - Params.LOW[0]) /
                 (Params.HIGH[0] - Params.LOW[0] + 10**(-8)) * (2**R))
        hy = int((y - Params.LOW[1]) /
                 (Params.HIGH[1] - Params.LOW[1] + 10**(-8)) * (2**R))
        return hx, hy

    def get_Rcoord(self, hx, hy, R):
        x = float(hx) / (2**
                         R) * (Params.HIGH[0] - Params.LOW[0]) + Params.LOW[0]
        y = float(hy) / (2**
                         R) * (Params.HIGH[1] - Params.LOW[1]) + Params.LOW[1]
        return x, y

    def getCount(self, curr, epsilon):
        count = len(curr.n_data)
        if epsilon < 10**(-6):
            return count
        else:
            return count + self.differ.getNoise(1, epsilon)

    def testLeaf(self, curr):
        """ test whether a node should be a leaf node """
        if (curr.n_depth == Params.maxHeight) or \
                (curr.n_budget <= 0) or \
                (curr.n_count <= self.param.minPartSize):
            return True
        return False

    def buildIndex(self):
        budget_c = self.getCountBudget()
        logging.debug('encoding coordinates...')
        RES = self.param.Res  # order of Hilbert curve
        ndata = self.realData.shape[1]
        hidx = np.zeros(ndata)
        for i in range(ndata):
            hx, hy = self.get_Hcoord(self.realData[0, i], self.realData[1, i],
                                     RES)
            hidx[i] = self.h_encode(hx, hy, RES)
        hidx = np.sort(hidx)

        logging.debug('building index...')
        self.root.n_data = hidx
        self.root.n_box = (0, 2**(2 * RES) - 1)
        self.root.n_count = self.getCount(self.root, budget_c[0])

        stack = deque()
        stack.append(self.root)
        tree = [self.root]
        leaf_li = []  # storage of all leaves
        nleaf = 0  # leaf counter
        max_depth = -1

        while len(stack) > 0:
            curr = stack.popleft()
            if curr.n_depth > max_depth:
                max_depth = curr.n_depth
            if self.testLeaf(curr) is True:  # curr is a leaf node
                if curr.n_depth < Params.maxHeight:
                    remainingEps = sum(budget_c[curr.n_depth + 1:])
                    curr.n_count = self.getCount(curr, remainingEps)
                nleaf += 1
                curr.n_isLeaf = True
                leaf_li.append(curr)

            else:  # curr needs to split
                curr.n_budget -= 1
                tmp = self.getCoordinates(curr)
                if tmp is False:  # if split fails
                    stack.append(curr)
                    continue
                nw_node, ne_node, sw_node, se_node = KNode(), KNode(), KNode(
                ), KNode()  # create sub-nodes
                split_prm, split_sec1, split_sec2, nw_node.n_data, ne_node.n_data, sw_node.n_data, se_node.n_data = tmp

                nw_node.n_box = (curr.n_box[0], split_sec1)
                ne_node.n_box = (split_sec1, split_prm)
                sw_node.n_box = (split_prm, split_sec2)
                se_node.n_box = (split_sec2, curr.n_box[1])

                for sub_node in [nw_node, ne_node, sw_node, se_node]:
                    sub_node.n_depth = curr.n_depth + 1
                    sub_node.n_count = self.getCount(
                        sub_node, budget_c[sub_node.n_depth])
                    sub_node.n_budget = curr.n_budget
                    stack.append(sub_node)
                    tree.append(sub_node)
                curr.n_data = None
                curr.nw, curr.ne, curr.sw, curr.se = nw_node, ne_node, sw_node, se_node

        # end of while
        logging.debug("number of leaves: %d" % nleaf)
        logging.debug("max depth: %d" % max_depth)

        # # convert hilbert values in leaf nodes to real coordinates and update bounding box
        logging.debug('decoding and updating bounding box...')
        for leaf in leaf_li:
            bbox = np.array([[1000.0, 1000.0], [-1000.0, -1000.0]],
                            dtype='float64')
            for hvalue in leaf.n_data:
                hx, hy = self.h_decode(int(hvalue), RES)
                x, y = self.get_Rcoord(hx, hy, RES)
                bbox[0, 0] = x if x < bbox[0, 0] else bbox[0, 0]
                bbox[1, 0] = x if x > bbox[1, 0] else bbox[1, 0]
                bbox[0, 1] = y if y < bbox[0, 1] else bbox[0, 1]
                bbox[1, 1] = y if y > bbox[1, 1] else bbox[1, 1]
            leaf.n_box = bbox

        # # update bounding box bottom-up
        tree = sorted(tree, cmp=self.cmp_node)
        logging.debug('updating box for each node in the tree...')
        for node in tree:
            if node.n_data is None:
                node.n_box = np.zeros((2, 2))
                node.n_box[0,
                           0] = min(node.ne.n_box[0, 0], node.nw.n_box[0, 0],
                                    node.se.n_box[0, 0], node.sw.n_box[0, 0])
                node.n_box[0,
                           1] = min(node.ne.n_box[0, 1], node.nw.n_box[0, 1],
                                    node.se.n_box[0, 1], node.sw.n_box[0, 1])
                node.n_box[1,
                           0] = max(node.ne.n_box[1, 0], node.nw.n_box[1, 0],
                                    node.se.n_box[1, 0], node.sw.n_box[1, 0])
                node.n_box[1,
                           1] = max(node.ne.n_box[1, 1], node.nw.n_box[1, 1],
                                    node.se.n_box[1, 1], node.sw.n_box[1, 1])

    def cmp_node(self, node1, node2):
        # reverse order
        return int(node2.n_depth - node1.n_depth)

    def getCoordinates(self, curr):
        budget_s = self.getSplitBudget()
        _data = curr.n_data
        _ndata = len(_data)
        split_1 = self.getSplit(_data, curr.n_box[0], curr.n_box[1],
                                budget_s[curr.n_depth] / 2)
        pos_1 = np.searchsorted(_data, split_1)
        if pos_1 == 0 or pos_1 == _ndata:
            return False
        data_1 = _data[:pos_1]
        data_2 = _data[pos_1:]
        split_sec1 = self.getSplit(data_1, curr.n_box[0], split_1,
                                   budget_s[curr.n_depth] / 2)
        split_sec2 = self.getSplit(data_2, split_1, curr.n_box[1],
                                   budget_s[curr.n_depth] / 2)
        pos_sec1 = np.searchsorted(data_1, split_sec1)
        pos_sec2 = np.searchsorted(data_2, split_sec2)

        if pos_sec1 == 0 or pos_sec1 == len(
                data_1) or pos_sec2 == 0 or pos_sec2 == len(data_2):
            return False
        nw_data, ne_data, sw_data, se_data = data_1[:pos_sec1], data_1[
            pos_sec1:], data_2[:pos_sec2], data_2[pos_sec2:]
        return split_1, split_sec1, split_sec2, nw_data, ne_data, sw_data, se_data
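
The Hilbert key produced by h_encode packs the two transformed coordinate words by bit interleaving (interleaveBits), and h_decode relies on the inverse de-interleaving (deleaveBits). A standalone sketch of that packing, separate from the class (illustrative only):

def interleave(hodd, heven):
    """Place bits of heven at even positions and bits of hodd at odd positions."""
    val, i = 0, 0
    while (hodd >> i) or (heven >> i):
        val |= ((heven >> i) & 1) << (2 * i)
        val |= ((hodd >> i) & 1) << (2 * i + 1)
        i += 1
    return val

def deinterleave(val):
    """Recover (hodd, heven) from an interleaved value."""
    hodd, heven, i = 0, 0, 0
    while val >> (2 * i):
        heven |= ((val >> (2 * i)) & 1) << i
        hodd |= ((val >> (2 * i + 1)) & 1) << i
        i += 1
    return hodd, heven

# round trip: deinterleave(interleave(0b101, 0b011)) == (0b101, 0b011)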
Example #6
class Generic(object):
    """
    Generic data structure, used for both htree and grid
    """

    def __init__(self, data, param):
        self.param = param
        self.differ = Differential(self.param.Seed)

        # initialize the root
        self.root = Node()
        # self.children = [] # all level 2 grids
        self.root.n_data = data
        self.root.n_box = np.array([param.LOW, param.HIGH])

    def getEqualSplit(self, partitions, min, max):
        """return equal split points, including both ends"""
        if min > max:
            logging.debug("getEqualSplit: Error: min > max")
        if partitions <= 1:
            return [min, max]
        return [min + (max - min) * i / partitions for i in range(partitions + 1)]

    def getCountBudget(self):
        """return noisy count budget for different levels of the indices"""
        raise NotImplementedError

    def getCoordinates(self, curr):
        """return the split dimension, the split points and the data points in each subnodes"""
        raise NotImplementedError

    def getCount(self, curr, epsilon):
        """
	return true count or noisy count of a node, depending on epsilon. 
	Note that the noisy count can be negative
	"""
        if curr.n_data is None:
            count = 0
        else:
            count = curr.n_data.shape[1]

        if epsilon < 10 ** (-6):
            return count
        else:
            return count + self.differ.getNoise(1, epsilon)

    def testLeaf(self, curr):
        """test whether a node is a leaf node"""
        raise NotImplementedError

    def intersect(self, hrect, query):
        """
        checks if the hyper-rectangle intersects with the 
        hyper-rectangle defined by the query in every dimension
        """
        bool_m1 = query[0, :] >= hrect[1, :]
        bool_m2 = query[1, :] <= hrect[0, :]
        bool_m = np.logical_or(bool_m1, bool_m2)
        if np.any(bool_m):
            return False
        else:
            return True

    def buildIndex(self):
        """build the htree & grid structure. htree is a high fanout and low level tree"""
        budget_c = self.getCountBudget()  # an array with two elements
        self.root.n_count = self.getCount(self.root, 0)  # add noisy count to the root
        queue = deque()
        queue.append(self.root)
        nleaf = 0  # number of leaf node, for debug only
        # ## main loop
        while len(queue) > 0:
            curr = queue.popleft()

            if self.testLeaf(curr) is True:  # if curr is a leaf node
                if curr.n_depth < self.param.maxHeightHTree:
                    remainingEps = sum(budget_c[curr.n_depth:])
                    curr.n_count = self.getCount(curr, remainingEps)
                    curr.eps = remainingEps
                nleaf += 1
                curr.n_isLeaf = True

            else:  # curr needs to split
                split_arr, n_data_arr = self.getCoordinates(curr)
                if split_arr is None:
                    if curr.n_depth < self.param.maxHeightHTree:
                        remainingEps = sum(budget_c[curr.n_depth:])
                        curr.n_count = self.getCount(curr, remainingEps)
                        curr.eps = remainingEps
                    nleaf += 1
                    curr.n_isLeaf = True
                    curr.children = []
                    continue  # if the first level cell is leaf node
                for i in range(len(n_data_arr)):
                    node = Node()
                    if curr.n_depth % Params.NDIM == 0:  # split by x coord
                        node.n_box = np.array([[split_arr[i], curr.n_box[0, 1]], [split_arr[i + 1], curr.n_box[1, 1]]])
                    else:  # split by y coord
                        node.n_box = np.array([[curr.n_box[0, 0], split_arr[i]], [curr.n_box[1, 0], split_arr[i + 1]]])

                    node.index = i
                    node.parent = curr
                    node.n_depth = curr.n_depth + 1
                    node.n_data = n_data_arr[i]
                    node.n_count = self.getCount(node, budget_c[node.n_depth])
                    node.eps = budget_c[node.n_depth]
                    if curr.n_depth == 2:
                        node.secondLevelPartitions = curr.secondLevelPartitions
                    curr.children.append(node)
                    queue.append(node)

                # if curr.n_depth == 2:
                # self.children.append(curr)

                curr.n_data = None  # ## do not need the data points coordinates now
        # end of while      
        logging.debug("Generic: number of leaves: %d" % nleaf)


        # canonical range query does apply

    def rangeCount(self, query):
        """
        Query answering function. Find the number of data points within a query rectangle.
        This function assumes that the tree is constructed with a noisy count for every node
        """
        queue = deque()
        queue.append(self.root)
        count = 0.0
        while len(queue) > 0:
            curr = queue.popleft()
            _box = curr.n_box
            if curr.n_isLeaf is True:
                frac = 1
                if self.intersect(_box, query):
                    for i in range(_box.shape[1]):
                        if _box[1, i] == _box[0, i] or Params.WorstCase == True:
                            frac *= 1
                        else:
                            frac *= (min(query[1, i], _box[1, i]) - max(query[0, i], _box[0, i])) / (
                                _box[1, i] - _box[0, i])
                    count += curr.n_count * frac
            else:  # if not leaf
                for node in curr.children:
                    bool_matrix = np.zeros((2, query.shape[1]))
                    bool_matrix[0, :] = query[0, :] <= _box[0, :]
                    bool_matrix[1, :] = query[1, :] >= _box[1, :]

                    if np.all(bool_matrix):  # if query range contains node range
                        count += node.n_count
                    elif self.intersect(_box, query):
                        queue.append(node)
        return float(count)


    def leafCover(self, loc):
        """
        find a leaf node that covers the location
        """
        queue = deque()
        queue.append(self.root)
        while len(queue) > 0:
            curr = queue.popleft()
            _box = curr.n_box
            if curr.n_isLeaf is True:
                if is_rect_cover(_box, loc):
                    return curr
            else:  # if not leaf
                queue.extend(curr.children)


    def checkCorrectness(self, node, nodePoints=None):
        """
        The total number of data points over all leaf nodes should equal the total number of data points
        """
        totalPoints = 0
        if node is None:
            return 0
        if node.n_isLeaf and node.n_data is not None:
            return node.n_data.shape[1]
        for child in node.children:
            totalPoints += self.checkCorrectness(child)

        if nodePoints is None:
            return totalPoints

        if totalPoints == nodePoints:
            return True
        return False
Example #7
class GenericT(object):
    """
    Generic data structure, used for grid
    """

    def __init__(self, data, param):
        self.param = param
        self.differ = Differential(self.param.Seed)

        # initialize the root
        self.root = NodeT()
        # self.children = [] # all level 2 grids
        self.root.n_data = data
        self.root.n_box = np.array([param.LOW, param.HIGH])

    def getEqualSplit(self, partitions, min, max):
        """return equal split points, including both ends"""
        if min > max:
            logging.debug("getEqualSplit: Error: min > max")
        if partitions <= 1:
            return [min, max]
        return [min + (max - min) * i / partitions for i in range(partitions + 1)]

    def getCountBudget(self):
        """return noisy count budget for different levels of the indices"""
        raise NotImplementedError

    def getCoordinates(self, curr):
        """return the split dimension, the split points and the data points in each subnodes"""
        raise NotImplementedError

    def getCount(self, curr, epsilon):
        """
        return true count or noisy count of a node, depending on epsilon.
        Note that the noisy count can be negative
        """
        if curr.n_data is None:
            count = 0
        else:
            count = curr.n_data.shape[1]

        if epsilon < 10 ** (-8):
            return count
        else:
            return count + self.differ.getNoise(1, epsilon)


    def intersect(self, hrect, query):
        """
        checks if the hyper-rectangle intersects with the
        hyper-rectangle defined by the query in every dimension
        """
        bool_m1 = query[0, :] >= hrect[1, :]
        bool_m2 = query[1, :] <= hrect[0, :]
        bool_m = np.logical_or(bool_m1, bool_m2)
        if np.any(bool_m):
            return False
        else:
            return True

    def testLeaf(self, curr):
        """ test whether a node should be a leaf node """
        if (curr.n_depth == Params.maxHeightAdaptiveGrid) or \
                (curr.n_data is None or curr.n_data.shape[1] == 0) or \
                (curr.n_count <= self.param.minPartSize):
            return True
        return False

    def buildIndex(self):
        """build the grid structure."""
        budget_c = self.getCountBudget()  # an array with two elements
        # print budget_c
        self.root.n_count = self.getCount(self.root, 0)  # root stores the true count (epsilon = 0)
        queue = deque()
        queue.append(self.root)
        # ## main loop
        while len(queue) > 0:
            curr = queue.popleft()
            if curr.n_data is None:
                curr.a_count.append(0)
            else:
                curr.a_count.append(curr.n_data.shape[1])

            if self.testLeaf(curr) is True:  # if curr is a leaf node
                remainingEps = sum(budget_c[curr.n_depth:])
                curr.n_count, curr.eps, curr.n_isLeaf = self.getCount(curr, remainingEps), remainingEps, True
                curr.l_count.append(curr.n_count)
            else:  # curr needs to split --> find splitting granularity
                gran, split_arr_x, split_arr_y, n_data_matrix = self.getCoordinates(curr)
                if gran == 1:
                    remainingEps = sum(budget_c[curr.n_depth:])
                    curr.n_count, curr.eps, curr.n_isLeaf = self.getCount(curr, remainingEps), remainingEps, True
                    curr.children = None
                    curr.l_count.append(curr.n_count)
                    continue  # if the first level cell is leaf node

                # add all nodes to queue
                for x in range(gran):
                    for y in range(gran):
                        node = NodeT()
                        node.n_box = np.array(
                            [[split_arr_x[x], split_arr_y[y]], [split_arr_x[x + 1], split_arr_y[y + 1]]])
                        node.index, node.parent, node.n_depth = x * gran + y, curr, curr.n_depth + 1
                        if n_data_matrix[x][y] is None:
                            node.n_data = None
                        else:
                            node.n_data = np.transpose(n_data_matrix[x][y])
                        node.n_count = self.getCount(node, budget_c[node.n_depth])
                        node.eps = budget_c[node.n_depth]
                        if node.n_depth == 2:
                            node.n_isLeaf = True
                        if curr.children is None:
                            curr.children = np.ndarray(shape=(gran, gran), dtype=NodeT)
                        curr.children[x][y] = node
                        queue.append(node)

                curr.n_data = None  # ## do not need the data points coordinates now
        # end of while


    # canonical range query does apply
    def rangeCount(self, query):
        """
        Query answering function. Find the number of data points within a query rectangle.
        This function assumes that the tree is constructed with a noisy count for every node
        """
        queue = deque()
        queue.append(self.root)
        count = 0.0
        while len(queue) > 0:
            curr = queue.popleft()
            _box = curr.n_box
            if curr.n_isLeaf is True:
                frac = 1
                if self.intersect(_box, query):
                    for i in range(_box.shape[1]):
                        if _box[1, i] == _box[0, i] or Params.WorstCase == True:
                            frac *= 1
                        else:
                            frac *= (min(query[1, i], _box[1, i]) - max(query[0, i], _box[0, i])) / (
                                _box[1, i] - _box[0, i])
                    count += curr.n_count * frac
            else:  # if not leaf

                for (_, _), node in np.ndenumerate(curr.children):
                    bool_matrix = np.zeros((2, query.shape[1]))
                    bool_matrix[0, :] = query[0, :] <= _box[0, :]
                    bool_matrix[1, :] = query[1, :] >= _box[1, :]

                    if np.all(bool_matrix):  # if query range contains node range
                        count += node.n_count
                    elif self.intersect(_box, query):
                        queue.append(node)
        return float(count)


    def leafCover(self, loc):
        """
        find a leaf node that covers the location
        """
        gran_1st = len(self.root.children)
        # cell indices must be ints to index the children grid
        x1 = int(min(gran_1st - 1,
                     (loc[0] - self.root.n_box[0, 0]) * gran_1st / (self.root.n_box[1, 0] - self.root.n_box[0, 0])))
        y1 = int(min(gran_1st - 1,
                     (loc[1] - self.root.n_box[0, 1]) * gran_1st / (self.root.n_box[1, 1] - self.root.n_box[0, 1])))

        node_1st = self.root.children[x1][y1]
        """
        Note that there are cases where the actual count of a first-level cell is zero but its noisy count is > 0,
        so the cell may be split into a number of empty cells
        """
        if node_1st.n_isLeaf or node_1st.children is None:
            return node_1st
        else:
            gran_2st = len(node_1st.children)
            x2 = int(min(gran_2st - 1,
                         (loc[0] - node_1st.n_box[0, 0]) * gran_2st / (node_1st.n_box[1, 0] - node_1st.n_box[0, 0])))
            y2 = int(min(gran_2st - 1,
                         (loc[1] - node_1st.n_box[0, 1]) * gran_2st / (node_1st.n_box[1, 1] - node_1st.n_box[0, 1])))
            return node_1st.children[x2][y2]


    def checkCorrectness(self, node, nodePoints=None):
        """
        The total number of data points over all leaf nodes should equal the total number of data points.
        Only the FIRST time instance is checked
        """
        totalPoints = 0
        if node is None:
            return 0
        if (node.n_isLeaf and node.n_data is not None) or node.children is None:
            return node.a_count[0]

        for (_, _), child in np.ndenumerate(node.children):
            totalPoints += self.checkCorrectness(child)

        if nodePoints is None:
            return totalPoints

        if totalPoints == nodePoints:
            return True
        return False
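
# A minimal usage sketch (hypothetical names): GenericT is abstract, so a concrete
# grid class must implement getCountBudget() and getCoordinates() before use.
#
#   grid = MyAdaptiveGrid(data, param)     # hypothetical subclass of GenericT
#   grid.buildIndex()
#   query = np.array([[x_lo, y_lo], [x_hi, y_hi]])  # lower corner, upper corner
#   noisy_count = grid.rangeCount(query)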
Example #9
class Hilbert(Kd_standard):
    """ Hilbert R-tree """

    def __init__(self, data, param):
        self.param = param
        self.differ = Differential(self.param.Seed)
        self.root = KNode()
        self.realData = data
        self.root.n_budget = Params.maxHeight

    def h_encode(self, x, y, r):
        """ (x,y) -> value h in Hilbert space, r is the resolution of the Hilbert curve """
        mask = (1 << r) - 1
        heven = x ^ y
        notx = ~x & mask
        noty = ~y & mask
        temp = notx ^ y
        v0, v1 = 0, 0
        for k in range(r - 1):
            v1 = ((v1 & heven) | ((v0 ^ noty) & temp)) >> 1
            v0 = ((v0 & (v1 ^ notx)) | (~v0 & (v1 ^ noty))) >> 1
        hodd = (~v0 & (v1 ^ x)) | (v0 & (v1 ^ noty))
        return self.interleaveBits(hodd, heven)

    def h_decode(self, h, r):
        """ h -> (x,y) """
        heven, hodd = self.deleaveBits(h)
        mask = (1 << r) - 1
        v0, v1 = 0, 0
        temp1 = ~(heven | hodd) & mask
        temp0 = ~(heven ^ hodd) & mask
        for k in range(r - 1):
            v1 = (v1 ^ temp1) >> 1
            v0 = (v0 ^ temp0) >> 1
        return (v0 & ~heven) ^ v1 ^ hodd, (v0 | heven) ^ v1 ^ hodd

    def interleaveBits(self, hodd, heven):
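        # Interleave the bits of the two operands: heven supplies the even bit
        # positions, hodd the odd ones, e.g. interleaveBits(0b10, 0b11) == 0b1101.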
        val = 0
        maxx = max(hodd, heven)
        n = 0
        while maxx > 0:
            n += 1
            maxx >>= 1
        for i in range(n):
            bitMask = 1 << i
            a = 1 << (2 * i) if (heven & bitMask) else 0
            b = 1 << (2 * i + 1) if (hodd & bitMask) else 0
            val += a + b
        return val

    def deleaveBitsOdd(self, x):
        x &= 0x5555555555555555
        x = (x | (x >> 1)) & 0x3333333333333333
        x = (x | (x >> 2)) & 0x0F0F0F0F0F0F0F0F
        x = (x | (x >> 4)) & 0x00FF00FF00FF00FF
        x = (x | (x >> 8)) & 0x0000FFFF0000FFFF
        x = (x | (x >> 16)) & 0x00000000FFFFFFFF
        return x

    def deleaveBits(self, x):
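        # Inverse of interleaveBits: returns (even-position bits, odd-position bits),
        # e.g. deleaveBits(0b1101) == (0b11, 0b10).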
        return self.deleaveBitsOdd(x), self.deleaveBitsOdd(x >> 1)

    def get_Hcoord(self, x, y, R):
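        # Map a real coordinate to an integer cell index in [0, 2**R) on the
        # Hilbert grid; the 1e-8 padding keeps HIGH strictly inside the last cell.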
        hx = int((x - Params.LOW[0]) / (Params.HIGH[0] - Params.LOW[0] + 10 ** (-8)) * (2 ** R))
        hy = int((y - Params.LOW[1]) / (Params.HIGH[1] - Params.LOW[1] + 10 ** (-8)) * (2 ** R))
        return hx, hy

    def get_Rcoord(self, hx, hy, R):
        x = float(hx) / (2 ** R) * (Params.HIGH[0] - Params.LOW[0]) + Params.LOW[0]
        y = float(hy) / (2 ** R) * (Params.HIGH[1] - Params.LOW[1]) + Params.LOW[1]
        return x, y

    def getCount(self, curr, epsilon):
        count = len(curr.n_data)
        if epsilon < 10 ** (-6):
            return count
        else:
            return count + self.differ.getNoise(1, epsilon)

    def testLeaf(self, curr):
        """ test whether a node should be a leaf node """
        if (curr.n_depth == Params.maxHeight) or \
                (curr.n_budget <= 0) or \
                (curr.n_count <= self.param.minPartSize):
            return True
        return False

    def buildIndex(self):
        budget_c = self.getCountBudget()
        logging.debug('encoding coordinates...')
        RES = self.param.Res  # order of Hilbert curve
        ndata = self.realData.shape[1]
        hidx = np.zeros(ndata)
        for i in range(ndata):
            hx, hy = self.get_Hcoord(self.realData[0, i], self.realData[1, i], RES)
            hidx[i] = self.h_encode(hx, hy, RES)
        hidx = np.sort(hidx)

        logging.debug('building index...')
        self.root.n_data = hidx
        self.root.n_box = (0, 2 ** (2 * RES) - 1)
        self.root.n_count = self.getCount(self.root, budget_c[0])

        stack = deque()
        stack.append(self.root)
        tree = [self.root]
        leaf_li = []  # storage of all leaves
        nleaf = 0  # leaf counter
        max_depth = -1

        while len(stack) > 0:
            curr = stack.popleft()
            if curr.n_depth > max_depth:
                max_depth = curr.n_depth
            if self.testLeaf(curr) is True:  # curr is a leaf node
                if curr.n_depth < Params.maxHeight:
                    remainingEps = sum(budget_c[curr.n_depth + 1:])
                    curr.n_count = self.getCount(curr, remainingEps)
                nleaf += 1
                curr.n_isLeaf = True
                leaf_li.append(curr)

            else:  # curr needs to split
                curr.n_budget -= 1
                tmp = self.getCoordinates(curr)
                if tmp is False:  # if split fails
                    stack.append(curr)
                    continue
                nw_node, ne_node, sw_node, se_node = KNode(), KNode(), KNode(), KNode()  # create sub-nodes
                split_prm, split_sec1, split_sec2, nw_node.n_data, ne_node.n_data, sw_node.n_data, se_node.n_data = tmp

                nw_node.n_box = (curr.n_box[0], split_sec1)
                ne_node.n_box = (split_sec1, split_prm)
                sw_node.n_box = (split_prm, split_sec2)
                se_node.n_box = (split_sec2, curr.n_box[1])

                for sub_node in [nw_node, ne_node, sw_node, se_node]:
                    sub_node.n_depth = curr.n_depth + 1
                    sub_node.n_count = self.getCount(sub_node, budget_c[sub_node.n_depth])
                    sub_node.n_budget = curr.n_budget
                    stack.append(sub_node)
                    tree.append(sub_node)
                curr.n_data = None
                curr.nw, curr.ne, curr.sw, curr.se = nw_node, ne_node, sw_node, se_node

        # end of while
        logging.debug("number of leaves: %d" % nleaf)
        logging.debug("max depth: %d" % max_depth)

        # # convert hilbert values in leaf nodes to real coordinates and update bounding box
        logging.debug('decoding and updating bounding box...')
        for leaf in leaf_li:
            bbox = np.array([[1000.0, 1000.0], [-1000.0, -1000.0]], dtype='float64')
            for hvalue in leaf.n_data:
                hx, hy = self.h_decode(int(hvalue), RES)
                x, y = self.get_Rcoord(hx, hy, RES)
                bbox[0, 0] = x if x < bbox[0, 0] else bbox[0, 0]
                bbox[1, 0] = x if x > bbox[1, 0] else bbox[1, 0]
                bbox[0, 1] = y if y < bbox[0, 1] else bbox[0, 1]
                bbox[1, 1] = y if y > bbox[1, 1] else bbox[1, 1]
            leaf.n_box = bbox

        # # update bounding box bottom-up
        tree = sorted(tree, cmp=self.cmp_node)
        logging.debug('updating box for each node in the tree...')
        for node in tree:
            if node.n_data is None:
                node.n_box = np.zeros((2, 2))
                node.n_box[0, 0] = min(node.ne.n_box[0, 0], node.nw.n_box[0, 0], node.se.n_box[0, 0],
                                       node.sw.n_box[0, 0])
                node.n_box[0, 1] = min(node.ne.n_box[0, 1], node.nw.n_box[0, 1], node.se.n_box[0, 1],
                                       node.sw.n_box[0, 1])
                node.n_box[1, 0] = max(node.ne.n_box[1, 0], node.nw.n_box[1, 0], node.se.n_box[1, 0],
                                       node.sw.n_box[1, 0])
                node.n_box[1, 1] = max(node.ne.n_box[1, 1], node.nw.n_box[1, 1], node.se.n_box[1, 1],
                                       node.sw.n_box[1, 1])


    def cmp_node(self, node1, node2):
        # reverse order
        return int(node2.n_depth - node1.n_depth)

    def getCoordinates(self, curr):
        budget_s = self.getSplitBudget()
        _data = curr.n_data
        _ndata = len(_data)
        split_1 = self.getSplit(_data, curr.n_box[0], curr.n_box[1], budget_s[curr.n_depth] / 2)
        pos_1 = np.searchsorted(_data, split_1)
        if pos_1 == 0 or pos_1 == _ndata:
            return False
        data_1 = _data[:pos_1]
        data_2 = _data[pos_1:]
        split_sec1 = self.getSplit(data_1, curr.n_box[0], split_1, budget_s[curr.n_depth] / 2)
        split_sec2 = self.getSplit(data_2, split_1, curr.n_box[1], budget_s[curr.n_depth] / 2)
        pos_sec1 = np.searchsorted(data_1, split_sec1)
        pos_sec2 = np.searchsorted(data_2, split_sec2)

        if pos_sec1 == 0 or pos_sec1 == len(data_1) or pos_sec2 == 0 or pos_sec2 == len(data_2):
            return False
        nw_data, ne_data, sw_data, se_data = data_1[:pos_sec1], data_1[pos_sec1:], data_2[:pos_sec2], data_2[pos_sec2:]
        return split_1, split_sec1, split_sec2, nw_data, ne_data, sw_data, se_data
Example #10
class Kd_cell(Kd_pure):
    """ Kd tree based on syntatic data generation and a grid structure. See
    Y. Xiao, L. Xiong, and C. Yuan, Differentially private data release
    through multidimensional partitioning, in SDM Workshop, VLDB, 2010
    """

    def __init__(self, data, param):
        self.param = param
        self.differ = Differential(self.param.Seed)
        self.mapp = None
        self.root = KNode()
        self.realData = data
        self.root.n_box = None
        self.root.n_budget = Params.maxHeight

    def getCountBudget(self):
        count_eps = self.param.Eps * 0.5
        H = Params.maxHeight
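        # Each scheme splits count_eps over the H+1 levels as a geometric series;
        # e.g. for 'aggressive', unit * (2**0 + ... + 2**H) = unit * (2**(H+1) - 1)
        # equals count_eps, so deeper levels get more budget and hence less noise.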
        if self.param.geoBudget == 'none':
            return [count_eps / (H + 1) for _ in range(H + 1)]
        elif self.param.geoBudget == 'aggressive':
            unit = count_eps / (2 ** (H + 1) - 1)
            return [unit * 2 ** i for i in range(H + 1)]
        elif self.param.geoBudget == 'quadratic':
            unit = count_eps * (np.sqrt(2) - 1) / (2 ** (0.5 * (H + 1)) - 1)
            return [unit * 2 ** (0.5 * i) for i in range(H + 1)]
        elif self.param.geoBudget == 'optimal':
            unit = count_eps * ((2 ** (1.0 / 3)) - 1) / (2 ** ((1.0 / 3) * (H + 1)) - 1)
            return [unit * 2 ** ((1.0 / 3) * i) for i in range(H + 1)]
        elif self.param.geoBudget == 'quartic':
            unit = count_eps * ((2 ** (1.0 / 4)) - 1) / (2 ** ((1.0 / 4) * (H + 1)) - 1)
            return [unit * 2 ** ((1.0 / 4) * i) for i in range(H + 1)]
        else:
            logging.error('No such geoBudget scheme')
            sys.exit(1)

    def synthetic_gen(self):
        """Apply a grid structure on the domain and perturb the count using half
        of the available privacy budget """
        logging.debug('generating synthetic map...')
        data = self.realData
        unit = Params.unitGrid
        x_min = np.floor(Params.LOW[0] / unit) * unit
        x_max = np.ceil(Params.HIGH[0] / unit) * unit
        y_min = np.floor(Params.LOW[1] / unit) * unit
        y_max = np.ceil(Params.HIGH[1] / unit) * unit

        x_CELL = int(np.rint((x_max - x_min) / unit))
        y_CELL = int(np.rint((y_max - y_min) / unit))

        self.root.n_box = np.array([[x_min, y_min], [x_max, y_max]])

        self.mapp = np.zeros((x_CELL, y_CELL)) - 1  # ## initialize every cell with -1
        for i in range(Params.NDATA):  # ## populate the map
            point = data[:, i]
            cell_x = int(np.floor((point[0] - x_min) / unit))
            cell_y = int(np.floor((point[1] - y_min) / unit))
            if self.mapp[cell_x, cell_y] != -1:
                self.mapp[cell_x, cell_y] += 1
            else:
                self.mapp[cell_x, cell_y] = 1

        for i in range(x_CELL):  # ## perturb the counts
            for j in range(y_CELL):
                if self.mapp[i, j] != -1:
                    self.mapp[i, j] += np.rint(self.differ.getNoise(1, 0.5 * self.param.Eps))
                else:
                    self.mapp[i, j] = np.rint(self.differ.getNoise(1, 0.5 * self.param.Eps))
                # if noisy count is negative, ignore the noise and generate no points
                if self.mapp[i, j] < 0:
                    self.mapp[i, j] = 0

    def cell_setLeaf(self, curr):
        """ Throw away the counts based on the syntatic data """
        curr.n_count = 0
        return

    def testLeaf(self, curr):
        if (curr.n_count <= self.param.minPartSize) or (curr.n_depth == Params.maxHeight) or (
                self.uniform_test(curr, self.param.cellDistance)):
            return True
        return False

    def uniform_test(self, curr, distance):
        """ One of the stopping conditions: cell is uniform according to some threshold 'distance') """
        unit = Params.unitGrid
        x_min = int(np.rint((curr.n_box[0, 0] - self.root.n_box[0, 0]) / unit))
        x_max = int(np.rint((curr.n_box[1, 0] - self.root.n_box[0, 0]) / unit))
        y_min = int(np.rint((curr.n_box[0, 1] - self.root.n_box[0, 1]) / unit))
        y_max = int(np.rint((curr.n_box[1, 1] - self.root.n_box[0, 1]) / unit))
        data = self.mapp[x_min:x_max, y_min:y_max]
        total = np.sum(data)
        avg = total / ((x_max - x_min) * (y_max - y_min))
        dist = np.sum(np.abs(data - avg))
        if dist > distance:
            return False
        else:
            return True

    def buildIndex(self):
        stack = deque()
        stack.append(self.root)
        nleaf = 0  # leaf counter
        max_depth = -1
        self.root.n_count = np.sum(self.mapp)
        while len(stack) > 0:
            curr = stack.popleft()
            if curr.n_depth > max_depth:
                max_depth = curr.n_depth
            if self.testLeaf(curr) is True:  # curr is a leaf node
                nleaf += 1
                curr.n_isLeaf = True
                self.cell_setLeaf(curr)
            else:  # curr needs to split
                curr.n_budget -= 1
                tmp = self.getCoordinates(curr)
                nw_node, ne_node, sw_node, se_node = KNode(), KNode(), KNode(), KNode()  # create sub-nodes
                nw_coord, ne_coord, count_tmp = tmp
                x_nw, y_nw = nw_coord
                x_se, y_se = ne_coord

                nw_node.n_box = np.array([[curr.n_box[0, 0], y_nw], [x_nw, curr.n_box[1, 1]]])
                ne_node.n_box = np.array([[x_nw, y_se], [curr.n_box[1, 0], curr.n_box[1, 1]]])
                sw_node.n_box = np.array([[curr.n_box[0, 0], curr.n_box[0, 1]], [x_se, y_nw]])
                se_node.n_box = np.array([[x_se, curr.n_box[0, 1]], [curr.n_box[1, 0], y_se]])

                c_t = 0
                for sub_node in [nw_node, ne_node, sw_node, se_node]:
                    sub_node.n_depth = curr.n_depth + 1
                    sub_node.n_count = count_tmp[c_t]
                    sub_node.n_budget = curr.n_budget
                    stack.append(sub_node)
                    c_t += 1
                curr.nw, curr.ne, curr.sw, curr.se = nw_node, ne_node, sw_node, se_node

        # end of while
        logging.debug("number of leaves: %d" % nleaf)
        logging.debug("max depth: %d" % max_depth)

    def getCoordinates(self, curr):
        dim_1 = curr.n_depth % Params.NDIM  # primary split dimension
        UNIT = Params.unitGrid
        x_min = int(np.rint((curr.n_box[0, 0] - self.root.n_box[0, 0]) / UNIT))
        x_max = int(np.rint((curr.n_box[1, 0] - self.root.n_box[0, 0]) / UNIT))
        y_min = int(np.rint((curr.n_box[0, 1] - self.root.n_box[0, 1]) / UNIT))
        y_max = int(np.rint((curr.n_box[1, 1] - self.root.n_box[0, 1]) / UNIT))

        total = np.sum(self.mapp[x_min:x_max, y_min:y_max])
        if dim_1 == 0:
            for i in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_max]) >= total / 2:
                    break
            split_prm = (x_min + i + 1) * UNIT + self.root.n_box[0, 0]

            half_1 = np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_max])
            half_2 = np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_max])
            for j in range(y_max - y_min):
                if np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_min + j + 1]) >= half_1 / 2:
                    break
            split_sec1 = self.root.n_box[0, 1] + (y_min + j + 1) * UNIT
            n_sw = np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_min + j + 1])
            n_nw = np.sum(self.mapp[x_min:x_min + i + 1, y_min + j + 1:y_max])
            for k in range(y_max - y_min):
                if np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_min + k + 1]) >= half_2 / 2:
                    break
            split_sec2 = self.root.n_box[0, 1] + (y_min + k + 1) * UNIT
            n_se = np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_min + k + 1])
            n_ne = np.sum(self.mapp[x_min + i + 1:x_max, y_min + k + 1:y_max])
            return (split_prm, split_sec1), (split_prm, split_sec2), (n_nw, n_ne, n_sw, n_se)

        else:
            for i in range(y_max - y_min):
                if np.sum(self.mapp[x_min:x_max, y_min:y_min + i + 1]) >= total / 2:
                    break
            split_prm = self.root.n_box[0, 1] + (y_min + i + 1) * UNIT

            half_1 = np.sum(self.mapp[x_min:x_max, y_min:y_min + i + 1])
            half_2 = np.sum(self.mapp[x_min:x_max, y_min + i + 1:y_max])
            for j in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + j + 1, y_min:y_min + i + 1]) >= half_1 / 2:
                    break
            split_sec1 = (x_min + j + 1) * UNIT + self.root.n_box[0, 0]
            n_sw = np.sum(self.mapp[x_min:x_min + j + 1, y_min:y_min + i + 1])
            n_se = np.sum(self.mapp[x_min + j + 1:x_max, y_min:y_min + i + 1])
            for k in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + k + 1, y_min + i + 1:y_max]) >= half_2 / 2:
                    break
            split_sec2 = (x_min + k + 1) * UNIT + self.root.n_box[0, 0]
            n_nw = np.sum(self.mapp[x_min:x_min + k + 1, y_min + i + 1:y_max])
            n_ne = np.sum(self.mapp[x_min + k + 1:x_max, y_min + i + 1:y_max])
            return (split_sec2, split_prm), (split_sec1, split_prm), (n_nw, n_ne, n_sw, n_se)


    def populate_synthetic_tree(self):
        """ Populate real data to the synthetic tree """
        logging.debug('populating synthetic tree...')
        a_data = self.realData
        ndata = a_data.shape[1]
        for i in range(ndata):
            ptx = a_data[0, i]
            pty = a_data[1, i]
            leaf = self.root.find_subnode(ptx, pty)
            leaf.n_count += 1

        # traverse the tree and update leaf counts
        stack = deque()
        stack.append(self.root)
        while len(stack) > 0:
            cur_node = stack.popleft()
            if cur_node.n_isLeaf is True:  # leaf
                cur_node.n_count += self.differ.getNoise(1, 0.5 * self.param.Eps)
            else:
                stack.append(cur_node.nw)
                stack.append(cur_node.ne)
                stack.append(cur_node.sw)
                stack.append(cur_node.se)
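
# A minimal usage sketch for Kd_cell (order inferred from the method docstrings):
# build the noisy synthetic grid first, partition on it, then populate real data.
#
#   tree = Kd_cell(data, param)
#   tree.synthetic_gen()              # grid counts perturbed with half the budget
#   tree.buildIndex()                 # kd splits driven by the synthetic map
#   tree.populate_synthetic_tree()    # real points re-counted into leaves, plus noise
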
class KalmanFilterPID(Parser):
    """ generated source for class KalmanFilterPID """

    # sampling rate
    def __init__(self, param):
        """
        generated source for method __init__
        """
        Parser.__init__(self)

        self.param = param
        self.differ = Differential(self.param.Seed)

        self.predict = []
        self.interval = None

        # Kalman Filter params
        self.P = 100  # estimation error covariance (over all time instances)
        self.Q = 1000  # process noise, synthetic data
        self.R = 1000000  # measurement noise, optimal for alpha = 1, synthetic data
        self.K = 0  # Kalman gain

        # PID control params - default
        self.Cp = 0.9  # proportional gain, to keep output proportional to current error
        self.Ci = 0.1  # integral gain, to eliminate offset
        self.Cd = 0.0  # derivative gain, to ensure stability - prevent large error in future

        # fixed internally
        self.theta = 1  # magnitude of changes
        self.xi = 0.2  # gamma
        self.minIntvl = 1  # make sure the interval is greater than 1

        self.windowPID = 5  # I(integration) window
        self.ratioM = 0.2  # sampling rate

        #
        self.isSampling = False

    def adjustParams(self):
        # adjust params
        if self.ratioM < 0.1:
            self.theta = 20
        if 0.1 <= self.ratioM < 0.2:
            self.theta = 14
        if 0.2 <= self.ratioM < 0.3:
            self.theta = 2
        if 0.3 <= self.ratioM < 0.4:
            self.theta = 0.5
        if 0.4 <= self.ratioM < 0.5:
            self.theta = 0.3
        if 0.5 <= self.ratioM:
            self.theta = 0.1

    # test
    @classmethod
    def main(self, args):
        """ generated source for method main """
        if len(args) < 5:
            print "Usage: python KalmanFilterPID.py input output privacy-budget process-variance Cp(optional) Ci(optional) Cd(optional)"
            sys.exit()

        output = open(args[2], "w")
        budget = eval(args[3])
        Q = float(args[4])
        if budget <= 0 or Q <= 0:
            print "Usage: privacy-budget AND process-variance are positive values"
            sys.exit()

        p = Params(1000)
        kfPID = KalmanFilterPID(p)
        kfPID.setTotalBudget(budget)
        kfPID.setQ(Q)

        kfPID.orig = Parser.getData(args[1])

        kfPID.publish = [None] * len(kfPID.orig)

        # adjust R based on T and alpha
        kfPID.setR(len(kfPID.orig) * len(kfPID.orig) / (0.0 + budget * budget))

        # set optional control gains
        if len(args) >= 6:
            d = float(args[5])
            if d > 1:
                d = 1
            kfPID.setCp(d)

        if len(args) >= 7:
            d = float(args[6])
            if d + kfPID.Cp > 1:
                d = 1 - kfPID.Cp
            kfPID.setCi(d)
        else:
            kfPID.setCi(1 - kfPID.Cp)

        if len(args) >= 8:
            d = float(args[7])
            if d + kfPID.Cp + kfPID.Ci > 1:
                d = 1 - kfPID.Cp - kfPID.Ci
            kfPID.setCd(d)
        else:
            kfPID.setCd(1 - kfPID.Cp - kfPID.Ci)

        # kfPID.adjustParams()

        start = time.time()
        kfPID.publishCounts()
        end = time.time()

        Parser.outputData(output, kfPID.publish)

        print "Method:\tKalman Filter with Adaptive Sampling"
        print "Data Series Length:\t" + str(len(kfPID.orig))
        print "Queries Issued:\t" + str(kfPID.query.count(1))
        print "Privacy Budget Used:\t" + str(
            kfPID.query.count(1) * kfPID.epsilon)
        print "Average Relative Error:\t" + str(kfPID.getRelError())
        print "Time Used (in second):\t" + str(end - start)

    def kalmanFilter(self, orig, budget, samplingRate=None):
        self.totalBudget = budget
        self.orig = orig
        if samplingRate is not None:
            self.isSampling = True
            self.ratioM = samplingRate
        else:
            self.isSampling = False

        # self.adjustParams()

        self.publish = [None] * len(self.orig)

        # adjust R based on T and alpha
        self.setR(len(self.orig) * len(self.orig) / (0.0 + budget * budget))

        self.publishCounts()

        return self.publish

    def getCount(self, value, epsilon):
        """
        return true count or noisy count of a node, depending on epsilon.
        Note that the noisy count can be negative
        """
        if epsilon < 10**(-8):
            return value
        else:
            return value + self.differ.getNoise(1, epsilon)  # sensitivity is 1

    # data publication procedure
    def publishCounts(self):
        """ generated source for method publish """

        self.query = BitArray(len(self.orig))
        self.predict = [None] * len(self.orig)

        # recalculate individual budget based on M
        if (self.isSampling):
            M = int(self.ratioM * (len(self.orig)))  # 0.25 optimal percentile
        else:
            M = len(self.orig)

        if M <= 0:
            M = 1
        self.epsilon = (self.totalBudget + 0.0) / M
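        # At most M time instances are actually queried, each perturbed with
        # epsilon = totalBudget / M; all other instances publish the Kalman
        # prediction and consume no privacy budget.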

        # error = 0
        self.interval = 1
        nextQuery = max(1, self.windowPID) + self.interval - 1

        for i in range(len(self.orig)):
            if i == 0:
                # the first time instance
                self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                self.query[i] = 1
                self.correctKF(i, 0)
            else:
                predct = self.predictKF(i)
                self.predict[i] = predct
                if self.query.count(1) < self.windowPID and self.query.count(1) < M:
                    # bootstrap phase: query every instance until the PID window is filled

                    self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                    self.query[i] = 1

                    # update count using observation
                    self.correctKF(i, predct)
                elif i == nextQuery and self.query.count(1) < M:
                    # if i is the sampling point

                    # query
                    self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                    self.query[i] = 1

                    # update count using observation
                    self.correctKF(i, predct)

                    # update freq
                    if (self.isSampling):
                        ratio = self.PID(i)
                        frac = min(20, (ratio - self.xi) / self.xi)
                        deltaI = self.theta * (1 - math.exp(frac))
                        deltaI = int(deltaI) + (random.random() <
                                                deltaI - int(deltaI))
                        self.interval += deltaI
                    else:
                        self.interval = 1

                    if self.interval < self.minIntvl:
                        self.interval = self.minIntvl
                    nextQuery += self.interval  # nextQuery is ns in the paper
                else:
                    # --> predict
                    self.publish[i] = predct

                    # del self.orig
                    # del self.predict
                    # del self.query

                    # if self.isPostProcessing:
                    # self.postProcessing()

    # def postProcessing(self):
    # print len(self.samples), self.samples
    # remainedEps = self.totalBudget - len(self.samples) * self.epsilon
    # self.epsilon = self.epsilon + remainedEps/len(self.samples)
    #
    # # recompute noisy counts
    #     prev = 0
    #     for i in self.samples:
    #         self.publish[i] = self.getCount(self.orig[i], self.epsilon)
    #         if i > prev + 1:
    #             self.publish[prev + 1 : i] = [self.publish[prev]] * (i - prev - 1)
    #         prev = i

    def setR(self, r):
        """ generated source for method setR """
        self.R = r

    def setQ(self, q):
        """ generated source for method setQ """
        self.Q = q

    def setCp(self, cp):
        """ generated source for method setCp """
        self.Cp = cp

    def setCi(self, ci):
        """ generated source for method setCi """
        self.Ci = ci

    def setCd(self, cd):
        """ generated source for method setCd """
        self.Cd = cd

    # prediction step
    def predictKF(self, curr):
        """ generated source for method predictKF """
        # predict using Kalman Filter
        lastValue = self.getLastQuery(curr)

        # project estimation error
        self.P += self.Q  # Q is gaussian noise
        return lastValue

    # correction step
    def correctKF(self, curr, predict):
        """ generated source for method correctKF """
        self.K = (self.P + 0.0) / (self.P + self.R)
        correct = predict + self.K * (self.publish[curr] - predict)

        # publish[curr] = Math.max((int) correct, 0)
        if curr > 0:
            # only correct from 2nd values
            self.publish[curr] = correct

        # print correct, "\t", self.publish[curr], self.K, self.P

        # update estimation error variance
        self.P *= (1 - self.K)

    def getLastQuery(self, curr):
        """ generated source for method getLastQuery """
        for i in reversed(range(curr)):
            if self.query[i]:
                break
        return self.publish[i]

    # adaptive sampling - return feedback error
    def PID(self, curr):
        """ generated source for method PID """
        sum = 0
        lastValue = 0
        change = 0
        timeDiff = 0
        next = curr
        for j in reversed(range(self.windowPID - 1)):
            index = next
            while index >= 0:
                if self.query[index]:
                    next = index - 1  # the last nextQuery
                    break
                index -= 1
            if j == self.windowPID - 1:
                lastValue = abs(self.publish[index] - self.predict[index]) / (
                    0.0 + max(self.publish[index], 1))
                change = abs(self.publish[index] - self.predict[index]) / (
                    0.0 + max(self.publish[index], 1))
                timeDiff = index
            if j == self.windowPID - 2:
                change -= abs(self.publish[index] - self.predict[index]) / (
                    0.0 + max(self.publish[index], 1))
                timeDiff -= index
            sum += (abs(self.publish[index] - self.predict[index]) /
                    (0.0 + max(self.publish[index], 1)))

        ratio = self.Cp * lastValue + self.Ci * sum + self.Cd * change / (
            0.0 + timeDiff)
        return ratio