Example #1
0
def perturbeCount(p):
    """Count how many perturbed locations land in each grid cell.

    Every location in ``p.locDict`` is perturbed with
    ``Differential.addPolarNoise`` (using ``p.eps`` and ``p.radius``), the
    noisy point is mapped to a cell id by ``coord2CellId`` and the per-cell
    tally is incremented.

    Returns a mapping cell id -> number of noisy points; cells that were
    never hit are absent.
    """
    cell_counts = defaultdict()  # no default factory: behaves like a plain dict
    noise_source = Differential(p.seed)
    for _, location in p.locDict.iteritems():
        perturbed = noise_source.addPolarNoise(p.eps, location, p.radius)
        cell = coord2CellId(perturbed, p)
        if cell in cell_counts:
            cell_counts[cell] += 1
        else:
            cell_counts[cell] = 1
    return cell_counts
Example #2
0
 def __init__(self, data, param):
     """Store the data and parameters and create a bare root node.

     The data is kept on the instance (``realData``) rather than on the
     root; the root starts without a bounding box and with a budget of
     ``Params.maxHeight``.
     """
     self.param = param
     self.differ = Differential(param.Seed)
     self.mapp = None
     self.realData = data

     root = KNode()
     root.n_box = None
     root.n_budget = Params.maxHeight
     self.root = root
Example #3
0
 def __init__(self, data, param):
     """Set up the noise source and a root node that owns all the data.

     The root receives ``data``, the bounding box
     ``[Params.LOW, Params.HIGH]`` and a budget of ``Params.maxHeight``.
     """
     self.param = param
     self.differ = Differential(param.Seed)

     # build the root first, then attach it
     root = KNode()
     root.n_data = data
     root.n_box = np.array([Params.LOW, Params.HIGH])
     root.n_budget = Params.maxHeight
     self.root = root
Example #4
0
 def testDifferential(self):
     """Print 100 noisy "a,b" points scattered around a fixed reference point.

     Each offset drawn from ``Differential.getPolarNoise`` is scaled by
     ``Params.ONE_KM * 0.001`` (with an extra 1.2833 factor on the second
     coordinate) and added to the reference coordinates.
     """
     noise_source = Differential(1000)
     base_first, base_second = 34.020412, -118.289936
     radius = 500.0  # default unit is meters
     eps = np.log(2)
     for _ in range(100):
         dx, dy = noise_source.getPolarNoise(radius, eps)
         first = base_first + dx * Params.ONE_KM * 0.001
         second = base_second + dy * Params.ONE_KM*1.2833*0.001
         print(str(first) + ',' + str(second))
Example #5
0
    def __init__(self, data, param):
        """Keep the parameters, seed the noise source and create the root.

        The root node owns all of ``data`` and spans the box
        ``[param.LOW, param.HIGH]``.
        """
        self.param = param
        self.differ = Differential(param.Seed)

        # build the root first, then attach it
        root = Node()
        root.n_data = data
        root.n_box = np.array([param.LOW, param.HIGH])
        self.root = root
Example #6
0
 def __init__(self, data, param):
     """Store the data and parameters and create a bare root node.

     The data is kept on the instance (``realData``) rather than on the
     root; the root starts without a bounding box and with a budget of
     ``Params.maxHeight``.
     """
     self.param = param
     self.differ = Differential(param.Seed)
     self.mapp = None
     self.realData = data

     root = KNode()
     root.n_box = None
     root.n_budget = Params.maxHeight
     self.root = root
Example #7
0
 def __init__(self, data, param):
     """Set up the noise source and a root node that owns all the data.

     The root receives ``data``, the bounding box
     ``[Params.LOW, Params.HIGH]`` and a budget of ``Params.maxHeight``.
     """
     self.param = param
     self.differ = Differential(param.Seed)

     # build the root first, then attach it
     root = KNode()
     root.n_data = data
     root.n_box = np.array([Params.LOW, Params.HIGH])
     root.n_budget = Params.maxHeight
     self.root = root
    def __init__(self, param):
        """
        Initialise the filter with its default Kalman and PID settings.

        ``param`` is stored unchanged; its ``Seed`` attribute seeds the
        ``Differential`` noise source.
        """
        Parser.__init__(self)

        self.param = param
        self.differ = Differential(param.Seed)

        self.predict, self.interval = [], None

        # Kalman Filter params
        self.P = 100      # estimation error covariance (over all time instance)
        self.Q = 1000     # process noise, synthetic data
        self.R = 1000000  # measurement noise, optimal for alpha = 1, synthetic data
        self.K = 0        # kalman gain

        # PID control params - default
        #   Cp: proportional gain, to keep output proportional to current error
        #   Ci: integral gain, to eliminate offset
        #   Cd: derivative gain, to ensure stability - prevent large error in future
        self.Cp, self.Ci, self.Cd = 0.9, 0.1, 0.0

        # fixed internally
        self.theta = 1     # magnitude of changes
        self.xi = 0.2      # gamma (the original note says "10%" although the value is 0.2)
        self.minIntvl = 1  # make sure the interval is greater than 1

        self.windowPID = 5  # I(integration) window
        self.ratioM = 0.2   # sampling rate

        self.isSampling = False
Example #9
0
def evalDivGeoI(p, D_actual):
    """Evaluate per-cell diversity computed from polar-noise perturbed locations.

    For every (seed, eps) pair taken from the module-level ``seed_list`` and
    ``eps_list``, each location in ``p.locDict`` is perturbed with budget
    ``p.eps / p.M``, rounded to the grid and mapped to a cell; the cell's
    ``Counter`` is updated with ``p.locs[lid]``.  The noisy diversity of a
    cell is ``normalizeDiversity(randomEntropy(k))`` where ``k`` is the
    number of distinct keys collected for that cell (0 if the cell was never
    hit).  Every metric in ``divMetricList`` compares the values of
    ``D_actual`` with these noisy values.

    Results are averaged over the seeds and written with ``np.savetxt``;
    nothing is returned.  ``p.seed`` and ``p.eps`` are overwritten.
    """
    exp_name = sys._getframe().f_code.co_name
    logging.info(exp_name)
    res_cube = np.zeros((len(eps_list), len(seed_list), len(divMetricList)))

    sensitivity = p.M  # diversitySensitivity(p.M)

    # NOTE(review): the noise source is created once from the incoming
    # p.seed; the seeds assigned inside the loop never reach it.  Confirm
    # this is intended.
    differ = Differential(p.seed)

    # grid cell extent: domain width/height (scaled by 1000.0) split into
    # GEOI_GRID_SIZE parts, then converted by euclideanToRadian
    width = distance(p.x_min, p.y_min, p.x_max, p.y_min) * 1000.0 / Params.GEOI_GRID_SIZE
    height = distance(p.x_min, p.y_min, p.x_min, p.y_max) * 1000.0 / Params.GEOI_GRID_SIZE
    rad = euclideanToRadian((width, height))
    cell_size = np.array([rad[0], rad[1]])

    for j, seed in enumerate(seed_list):
        for i, budget in enumerate(eps_list):
            p.seed = seed
            p.eps = budget

            users_per_cell = defaultdict(Counter)
            for lid, loc in p.locDict.iteritems():
                scaled_eps = p.eps/sensitivity
                # perturbed noisy location
                noisy_loc = differ.addPolarNoise(scaled_eps, loc, p.radius)
                # rounded to grid
                snapped = round2Grid(noisy_loc, cell_size, p.x_min, p.y_min)
                # obtain cell id from noisy location
                cell = coord2CellId(snapped, p)
                # update count(userid/freq)
                users_per_cell[cell].update(p.locs[lid])

            actual = []
            noisy = []
            for cell, true_diversity in D_actual.iteritems():
                actual.append(true_diversity)
                # .get() keeps unseen cells out of the defaultdict; they score entropy of 0 keys
                distinct = len(users_per_cell.get(cell, Counter([])))
                noisy.append(normalizeDiversity(randomEntropy(distinct)))

            for k, metric in enumerate(divMetricList):
                res_cube[i, j, k] = metric(actual, noisy)

    res_summary = np.average(res_cube, axis=1)
    out_path = p.resdir + Params.DATASET + "_" + exp_name + '_M' + str(p.M) + '_C' + str(p.C)
    column_names = "\t".join([f.__name__ for f in divMetricList])
    np.savetxt(out_path, res_summary, header=column_names, fmt='%.4f\t')
Example #10
0
def evalCountGeoI(p, C_actual):
    """Evaluate per-cell counts computed from polar-noise perturbed locations.

    For every (seed, eps) pair taken from the module-level ``seed_list`` and
    ``eps_list``, each location in ``p.locDict`` is perturbed with
    ``Differential.addPolarNoise``, rounded to the grid and mapped to a cell
    whose noisy count is incremented.  Cells of ``C_actual`` with a positive
    true count are then compared against the noisy counts by every metric in
    ``freqMetricList``.

    Results are averaged over the seeds and written with ``np.savetxt``;
    nothing is returned.  ``p.seed`` and ``p.eps`` are overwritten.
    """
    exp_name = sys._getframe().f_code.co_name
    logging.info(exp_name)

    res_cube = np.zeros((len(eps_list), len(seed_list), len(freqMetricList)))

    # NOTE(review): the noise source is created once from the incoming
    # p.seed; the seeds assigned inside the loop never reach it.  Confirm
    # this is intended.
    differ = Differential(p.seed)

    # grid cell extent: domain width/height (scaled by 1000.0) split into
    # GEOI_GRID_SIZE parts, then converted by euclideanToRadian
    u = distance(p.x_min, p.y_min, p.x_max, p.y_min) * 1000.0 / Params.GEOI_GRID_SIZE
    v = distance(p.x_min, p.y_min, p.x_min, p.y_max) * 1000.0 / Params.GEOI_GRID_SIZE
    rad = euclideanToRadian((u, v))
    cell_size = np.array([rad[0], rad[1]])

    for j, seed in enumerate(seed_list):
        for i, eps in enumerate(eps_list):
            p.seed = seed
            p.eps = eps

            C_noisy = defaultdict()  # no default factory: behaves like a plain dict
            for lid, loc in p.locDict.iteritems():
                noisyLoc = differ.addPolarNoise(p.eps, loc, p.radius)  # perturbed noisy location

                # rounded to grid
                roundedPoint = round2Grid(noisyLoc, cell_size, p.x_min, p.y_min)

                cellId = coord2CellId(roundedPoint, p)  # obtain cell id from noisy location
                C_noisy[cellId] = C_noisy.get(cellId, 0) + 1

            actual, noisy = [], []
            for cellId, c in C_actual.iteritems():
                if c > 0:
                    actual.append(c)
                    # cells never hit by a noisy point fall back to
                    # Params.DEFAULT_ENTROPY, reused here as the default count
                    noisy.append(C_noisy.get(cellId, Params.DEFAULT_ENTROPY))
            for k, metric in enumerate(freqMetricList):
                res_cube[i, j, k] = metric(actual, noisy)

    res_summary = np.average(res_cube, axis=1)
    # The columns of res_summary come from freqMetricList, so the header
    # must be built from the same list (it previously used divMetricList).
    np.savetxt(p.resdir + Params.DATASET + "_" + exp_name + "_m" + str(p.m) + '_r' + str(p.radius),
               res_summary,
               header="\t".join([f.__name__ for f in freqMetricList]),
               fmt='%.4f\t')
Example #11
0
def testDifferential():
    """Print 100 noisy copies of a fixed point, one "a,b" line per draw.

    A ``Params`` object is created and ``select_dataset()`` is called on it
    exactly as before, although the loop below does not read it.  Rounding
    the noisy point to a grid is disabled here, so the raw output of
    ``Differential.addPolarNoise`` is what gets printed.
    """
    p = Params(1000)
    p.select_dataset()
    noise_source = Differential(1000)
    TS = (40.758890, -73.985100)

    for _ in range(100):
        noisy_point = noise_source.addPolarNoise(1.0, TS, 100)
        print(','.join([str(noisy_point[0]), str(noisy_point[1])]))
    def __init__(self, data, eps, param, firstGrid=None, use_domain_knowledge=None):
        """
        two levels grid

        Without ``firstGrid`` this is the first grid and it is constructed
        through ``Grid_adaptiveM.__init__``.  With ``firstGrid`` the root of
        that grid is deep-copied and only its data is replaced by ``data``.
        """
        self.eps = eps
        self.DOMAIN_KNOWLEDGE = use_domain_knowledge
        self.first = firstGrid is None

        if self.first:
            # this first grid --> need to construct
            Grid_adaptiveM.__init__(self, data, eps, param, self.DOMAIN_KNOWLEDGE)
            return

        self.param = param
        self.differ = Differential(param.Seed)

        # update root
        root = copy.deepcopy(firstGrid.root)
        root.n_data = data
        self.root = root
    def __init__(self, param):
        """
        Initialise the filter with its default Kalman and PID settings.

        ``param`` is stored unchanged; its ``Seed`` attribute seeds the
        ``Differential`` noise source.
        """
        Parser.__init__(self)

        self.param = param
        self.differ = Differential(param.Seed)

        self.predict, self.interval = [], None

        # Kalman Filter params
        self.P = 100      # estimation error covariance (over all time instance)
        self.Q = 1000     # process noise, synthetic data
        self.R = 1000000  # measurement noise, optimal for alpha = 1, synthetic data
        self.K = 0        # kalman gain

        # PID control params - default
        #   Cp: proportional gain, to keep output proportional to current error
        #   Ci: integral gain, to eliminate offset
        #   Cd: derivative gain, to ensure stability - prevent large error in future
        self.Cp, self.Ci, self.Cd = 0.9, 0.1, 0.0

        # fixed internally
        self.theta = 1     # magnitude of changes
        self.xi = 0.2      # gamma (the original note says "10%" although the value is 0.2)
        self.minIntvl = 1  # make sure the interval is greater than 1

        self.windowPID = 5  # I(integration) window
        self.ratioM = 0.2   # sampling rate

        self.isSampling = False
Example #14
0
class KTree(object):
    """Generic tree template.

    Base class for fanout-4 spatial decompositions whose nodes carry
    (optionally noisy) counts.  Subclasses provide the budget schedules and
    the splitting strategy; this class provides index construction, range
    counting and the post-processing passes.
    """

    def __init__(self, data, param):
        """Create the root node owning all of ``data`` over the full domain."""
        self.param = param
        self.differ = Differential(param.Seed)
        # ## initialize the root
        root = KNode()
        root.n_data = data
        root.n_box = np.array([Params.LOW, Params.HIGH])
        root.n_budget = Params.maxHeight
        self.root = root

    def getSplitBudget(self):
        """Return the list of h budget values used for splitting (abstract)."""
        raise NotImplementedError

    def getCountBudget(self):
        """Return the list of (h+1) budget values used for noisy counts (abstract)."""
        raise NotImplementedError

    def getNoisyMedian(self, array, left, right, epsilon):
        """Return the split value of an array (abstract)."""
        raise NotImplementedError

    def getCoordinates(self, curr):
        """
        Split ``curr`` into four sub-nodes (abstract).

        Implementations return
        (x_nw, y_nw), (x_se, y_se), nw_data, ne_data, sw_data, se_data
        where (x_nw, y_nw) is the lower-right corner of the NW sub-node and
        (x_se, y_se) the upper-left corner of the SE sub-node, as consumed
        by buildIndex.
        """
        raise NotImplementedError

    def getSplit(self, array, left, right, epsilon):
        """
        Return the split point for ``array`` (abstract); it may be
        data-independent, the true median or a noisy median, depending on
        the type of the tree.
        """
        raise NotImplementedError

    def getCount(self, curr, epsilon):
        """Return the true count of a node, or a noisy one when epsilon >= 1e-6."""
        exact = 0 if curr.n_data is None else curr.n_data.shape[1]
        if epsilon < 10 ** (-6):
            return exact
        return exact + self.differ.getNoise(1, epsilon)

    def testLeaf(self, curr):
        """Tell whether ``curr`` must stop splitting and become a leaf."""
        if curr.n_depth == Params.maxHeight:
            return True
        if curr.n_budget <= 0:
            return True
        if curr.n_data is None or curr.n_data.shape[1] == 0:
            return True
        if curr.n_count <= self.param.minPartSize:
            return True
        return False

    def cell_setLeaf(self, curr):
        """Hook called for every new leaf; will be overridden in kd_cell."""
        return None

    def buildIndex(self):
        """Build the tree structure breadth-first, fanout = 4 by default for spatial (2D) data."""
        count_budget = self.getCountBudget()
        # ## add noisy count to root
        self.root.n_count = self.getCount(self.root, count_budget[0])
        pending = deque([self.root])
        leaf_total = 0  # ## leaf counter
        deepest = -1
        # ## main loop
        while len(pending) > 0:
            node = pending.popleft()
            if node.n_depth > deepest:
                deepest = node.n_depth

            if self.testLeaf(node) is True:  # ## node is a leaf
                if node.n_depth < Params.maxHeight:
                    # ## a node that stops above maxHeight may spend the
                    # ## count budget of all the levels below it
                    leftover_eps = sum(count_budget[node.n_depth + 1:])
                    node.n_count = self.getCount(node, leftover_eps)
                leaf_total += 1
                node.n_isLeaf = True
                self.cell_setLeaf(node)
                continue

            # ## node needs to split; budget is consumed whether or not
            # ## the split turns out to be useful
            node.n_budget -= 1
            split = self.getCoordinates(node)
            nw_kid, ne_kid, sw_kid, se_kid = KNode(), KNode(), KNode(), KNode()
            first_pt, second_pt, nw_pts, ne_pts, sw_pts, se_pts = split
            nw_kid.n_data = nw_pts
            ne_kid.n_data = ne_pts
            sw_kid.n_data = sw_pts
            se_kid.n_data = se_pts
            x_nw, y_nw = first_pt
            x_se, y_se = second_pt

            # ## bounding boxes of the four sub-nodes
            parent = node.n_box
            nw_kid.n_box = np.array([[parent[0, 0], y_nw], [x_nw, parent[1, 1]]])
            ne_kid.n_box = np.array([[x_nw, y_se], [parent[1, 0], parent[1, 1]]])
            sw_kid.n_box = np.array([[parent[0, 0], parent[0, 1]], [x_se, y_nw]])
            se_kid.n_box = np.array([[x_se, parent[0, 1]], [parent[1, 0], y_se]])

            # ## depth, count and budget; the order nw, ne, sw, se is kept
            # ## so noise is drawn in the same sequence
            for kid in (nw_kid, ne_kid, sw_kid, se_kid):
                kid.n_depth = node.n_depth + 1
                kid.n_count = self.getCount(kid, count_budget[kid.n_depth])
                kid.n_budget = node.n_budget
                pending.append(kid)

            node.n_data = None  # ## the point coordinates are no longer needed here
            node.nw, node.ne, node.sw, node.se = nw_kid, ne_kid, sw_kid, se_kid
        # end of while

        logging.debug("number of leaves: %d" % leaf_total)
        logging.debug("max depth: %d" % deepest)

    def rect_intersect(self, hrect, query):
        """
        Return True when ``hrect`` and ``query`` overlap in every dimension.

        Both are indexed as [0, :] = lower corner and [1, :] = upper corner;
        rectangles that merely touch do not count as intersecting.
        """
        separated = np.logical_or(query[0, :] >= hrect[1, :],
                                  query[1, :] <= hrect[0, :])
        return not np.any(separated)

    def rangeCount(self, query):
        """
        Query answering function. Estimate the number of data points within
        a query rectangle.
        """
        pending = deque([self.root])
        total = 0.0
        # ## Counters for 1) whole leaves 2) partial leaves 3) whole internal
        # ## nodes that contribute to the answer. For debug purpose only.
        whole_leaves, partial_leaves, whole_inner = 0, 0, 0

        while len(pending) > 0:
            node = pending.popleft()
            box = node.n_box
            if node.n_isLeaf is True:
                frac = 1
                if not self.rect_intersect(box, query):
                    continue
                for dim in range(box.shape[1]):
                    # ## degenerate boxes and worst-case mode count in full
                    if not (box[1, dim] == box[0, dim] or Params.WorstCase == True):
                        frac *= (min(query[1, dim], box[1, dim]) - max(query[0, dim], box[0, dim])) / (
                            box[1, dim] - box[0, dim])
                total += node.n_count * frac
                if 1.0 - frac < 10 ** (-6):
                    whole_leaves += 1
                else:
                    partial_leaves += 1
                continue

            # ## internal node
            covered = np.zeros((2, query.shape[1]))
            covered[0, :] = query[0, :] <= box[0, :]
            covered[1, :] = query[1, :] >= box[1, :]

            if np.all(covered) and self.param.useLeafOnly is False:
                # ## the query range contains the whole node range
                total += node.n_count
                whole_inner += 1
            else:
                for child in (node.nw, node.ne, node.sw, node.se):
                    if self.rect_intersect(child.n_box, query):
                        pending.append(child)

        return float(total)  # , whole_inner, whole_leaves, partial_leaves

    def adjustConsistency(self):
        """
        Post processing for uniform noise across levels. Due to
        Michael Hay, Vibhor Rastogi, Gerome Miklau, Dan Suciu,
        Boosting the Accuracy of Differentially-Private Histograms Through Consistency,
        VLDB 2010
        """
        logging.debug('adjusting consistency...')
        # ## upward pass
        self.root.get_z()
        # ## downward pass: spread each parent/children mismatch evenly
        pending = deque([self.root])
        while len(pending) > 0:
            node = pending.popleft()
            if node.n_isLeaf is False:
                share = (node.n_count - node.nw.n_count - node.ne.n_count - node.sw.n_count - node.se.n_count) / 4.0
                for child in (node.nw, node.ne, node.sw, node.se):
                    child.n_count += share
                    pending.append(child)

    def postProcessing(self):
        """
        Post processing for general noise distribution across levels. Due to
        G. Cormode, M. Procopiuc, E. Shen, D. Srivastava and T. Yu,
        Differentially Private Spatial Decompositions, ICDE 2012.
        """
        logging.debug("post-processing...")
        budget = self.getCountBudget()  # ## count budget for h+1 levels
        H = Params.maxHeight

        # ## Phase 1 (top-down)
        self.root.n_count *= budget[self.root.n_depth] ** 2
        pending = deque([self.root])
        while len(pending) > 0:
            node = pending.popleft()
            if node.n_isLeaf is False:
                for child in (node.nw, node.ne, node.sw, node.se):
                    child.n_count = node.n_count + child.n_count * (budget[child.n_depth] ** 2)
                    pending.append(child)

        # ## Phase 2 (bottom-up)
        self.root.update_count()

        # ## Phase 3 (top-down)
        # ## explicit accumulation loops keep the floating-point order fixed
        E_root = 0
        for level in range(H + 1):
            E_root += 4 ** level * budget[H - level] * budget[H - level]
        self.root.n_count /= E_root
        self.root.n_F = 0
        pending = deque([self.root])
        while len(pending) > 0:
            node = pending.popleft()
            if node.n_isLeaf is False:
                h = H - node.n_depth - 1  # ## height of node's children
                E_h = 0
                for level in range(h + 1):
                    E_h += 4 ** level * budget[H - level] * budget[H - level]
                for child in (node.nw, node.ne, node.sw, node.se):
                    child.n_F = node.n_F + node.n_count * (budget[node.n_depth] ** 2)
                    child.n_count = (child.n_count - 4 ** h * child.n_F) / E_h
                    pending.append(child)

    def pruning(self):
        """
        If the tree is grown without the stopping condition of minLeafSize,
        prune it here after post processing: an internal node whose count is
        at most ``param.minPartSize`` is turned into a leaf.
        """
        logging.debug("pruning...")
        pending = deque([self.root])
        while len(pending) > 0:
            node = pending.popleft()
            if node.n_isLeaf is not False:
                continue
            if node.n_count <= self.param.minPartSize:
                node.n_isLeaf = True
            else:
                pending.extend([node.nw, node.ne, node.sw, node.se])
class Hilbert(Kd_standard):
    """ Hilbert R-tree

    Points are mapped to positions on a Hilbert curve, the sorted
    one-dimensional values are split recursively into four parts, and the
    bounding boxes of the resulting nodes are reconstructed afterwards from
    the decoded points.
    """
    def __init__(self, data, param):
        """Keep the raw data on the instance and create a bare root node."""
        self.param = param
        self.differ = Differential(self.param.Seed)
        self.root = KNode()
        self.realData = data
        self.root.n_budget = Params.maxHeight

    def h_encode(self, x, y, r):
        """ (x,y) -> value h in Hilbert space, r is the resolution of the Hilbert curve """
        mask = (1 << r) - 1
        heven = x ^ y
        notx = ~x & mask
        noty = ~y & mask
        temp = notx ^ y
        v0, v1 = 0, 0
        for k in range(r - 1):
            v1 = ((v1 & heven) | ((v0 ^ noty) & temp)) >> 1
            v0 = ((v0 & (v1 ^ notx)) | (~v0 & (v1 ^ noty))) >> 1
        hodd = (~v0 & (v1 ^ x)) | (v0 & (v1 ^ noty))
        return self.interleaveBits(hodd, heven)

    def h_decode(self, h, r):
        """ h -> (x,y), r is the resolution of the Hilbert curve """
        heven, hodd = self.deleaveBits(h)
        mask = (1 << r) - 1
        v0, v1 = 0, 0
        temp1 = ~(heven | hodd) & mask
        temp0 = ~(heven ^ hodd) & mask
        for k in range(r - 1):
            v1 = (v1 ^ temp1) >> 1
            v0 = (v0 ^ temp0) >> 1
        return (v0 & ~heven) ^ v1 ^ hodd, (v0 | heven) ^ v1 ^ hodd

    def interleaveBits(self, hodd, heven):
        """Interleave two integers: bit i of heven goes to bit 2i, bit i of hodd to bit 2i+1."""
        val = 0
        maxx = max(hodd, heven)
        # number of significant bits in the larger operand
        n = 0
        while maxx > 0:
            n += 1
            maxx >>= 1
        for i in range(n):
            bitMask = 1 << i
            a = 1 << (2 * i) if (heven & bitMask) else 0
            b = 1 << (2 * i + 1) if (hodd & bitMask) else 0
            val += a + b
        return val

    def deleaveBitsOdd(self, x):
        """Gather the bits at even positions (0, 2, 4, ...) of x into the low bits of the result."""
        x &= 0x5555555555555555
        x = (x | (x >> 1)) & 0x3333333333333333
        x = (x | (x >> 2)) & 0x0F0F0F0F0F0F0F0F
        x = (x | (x >> 4)) & 0x00FF00FF00FF00FF
        x = (x | (x >> 8)) & 0x0000FFFF0000FFFF
        x = (x | (x >> 16)) & 0x00000000FFFFFFFF
        return x

    def deleaveBits(self, x):
        """Inverse of interleaveBits: return (heven, hodd) extracted from x."""
        return self.deleaveBitsOdd(x), self.deleaveBitsOdd(x >> 1)

    def get_Hcoord(self, x, y, R):
        """Map a real coordinate to an integer cell of a 2**R x 2**R grid over the domain."""
        # the 10**(-8) slack keeps the upper domain bound inside the last cell
        hx = int((x - Params.LOW[0]) /
                 (Params.HIGH[0] - Params.LOW[0] + 10**(-8)) * (2**R))
        hy = int((y - Params.LOW[1]) /
                 (Params.HIGH[1] - Params.LOW[1] + 10**(-8)) * (2**R))
        return hx, hy

    def get_Rcoord(self, hx, hy, R):
        """Map an integer grid cell back to the real coordinate of its lower corner."""
        x = float(hx) / (2**
                         R) * (Params.HIGH[0] - Params.LOW[0]) + Params.LOW[0]
        y = float(hy) / (2**
                         R) * (Params.HIGH[1] - Params.LOW[1]) + Params.LOW[1]
        return x, y

    def getCount(self, curr, epsilon):
        """Return the true count of a node, or a noisy one when epsilon >= 1e-6.

        Node data here is a 1-D sequence of Hilbert values, hence ``len``.
        """
        count = len(curr.n_data)
        if epsilon < 10**(-6):
            return count
        else:
            return count + self.differ.getNoise(1, epsilon)

    def testLeaf(self, curr):
        """ test whether a node should be a leaf node """
        if (curr.n_depth == Params.maxHeight) or \
                (curr.n_budget <= 0) or \
                (curr.n_count <= self.param.minPartSize):
            return True
        return False

    def buildIndex(self):
        """Build the tree over the Hilbert values, then restore 2-D bounding boxes.

        Steps: encode every point of ``realData`` to its Hilbert value and
        sort; split breadth-first into four children per node (a failed
        split is retried until the node's budget is exhausted); decode the
        values held by each leaf to compute its real bounding box; finally
        derive every internal node's box from its children, deepest nodes
        first.
        """
        # local import keeps the file's import block untouched
        from functools import cmp_to_key

        budget_c = self.getCountBudget()
        logging.debug('encoding coordinates...')
        RES = self.param.Res  # order of Hilbert curve
        ndata = self.realData.shape[1]
        hidx = np.zeros(ndata)
        for i in range(ndata):
            hx, hy = self.get_Hcoord(self.realData[0, i], self.realData[1, i],
                                     RES)
            hidx[i] = self.h_encode(hx, hy, RES)
        hidx = np.sort(hidx)

        logging.debug('building index...')
        self.root.n_data = hidx
        # during construction n_box is a 1-D interval in Hilbert space
        self.root.n_box = (0, 2**(2 * RES) - 1)
        self.root.n_count = self.getCount(self.root, budget_c[0])

        stack = deque()
        stack.append(self.root)
        tree = [self.root]
        leaf_li = []  # storage of all leaves
        nleaf = 0  # leaf counter
        max_depth = -1

        while len(stack) > 0:
            curr = stack.popleft()
            if curr.n_depth > max_depth:
                max_depth = curr.n_depth
            if self.testLeaf(curr) is True:  # curr is a leaf node
                if curr.n_depth < Params.maxHeight:
                    # a leaf above maxHeight may spend the remaining count budget
                    remainingEps = sum(budget_c[curr.n_depth + 1:])
                    curr.n_count = self.getCount(curr, remainingEps)
                nleaf += 1
                curr.n_isLeaf = True
                leaf_li.append(curr)

            else:  # curr needs to split
                curr.n_budget -= 1
                tmp = self.getCoordinates(curr)
                if tmp is False:  # if split fails
                    # retry later; the decremented budget bounds the retries
                    stack.append(curr)
                    continue
                nw_node, ne_node, sw_node, se_node = KNode(), KNode(), KNode(
                ), KNode()  # create sub-nodes
                split_prm, split_sec1, split_sec2, nw_node.n_data, ne_node.n_data, sw_node.n_data, se_node.n_data = tmp

                nw_node.n_box = (curr.n_box[0], split_sec1)
                ne_node.n_box = (split_sec1, split_prm)
                sw_node.n_box = (split_prm, split_sec2)
                se_node.n_box = (split_sec2, curr.n_box[1])

                for sub_node in [nw_node, ne_node, sw_node, se_node]:
                    sub_node.n_depth = curr.n_depth + 1
                    sub_node.n_count = self.getCount(
                        sub_node, budget_c[sub_node.n_depth])
                    sub_node.n_budget = curr.n_budget
                    stack.append(sub_node)
                    tree.append(sub_node)
                curr.n_data = None
                curr.nw, curr.ne, curr.sw, curr.se = nw_node, ne_node, sw_node, se_node

        # end of while
        logging.debug("number of leaves: %d" % nleaf)
        logging.debug("max depth: %d" % max_depth)

        # # convert hilbert values in leaf nodes to real coordinates and update bounding box
        logging.debug('decoding and updating bounding box...')
        for leaf in leaf_li:
            # sentinel box; presumably every real coordinate lies strictly
            # inside (-1000, 1000) - TODO confirm against the datasets used
            bbox = np.array([[1000.0, 1000.0], [-1000.0, -1000.0]],
                            dtype='float64')
            for hvalue in leaf.n_data:
                hx, hy = self.h_decode(int(hvalue), RES)
                x, y = self.get_Rcoord(hx, hy, RES)
                bbox[0, 0] = x if x < bbox[0, 0] else bbox[0, 0]
                bbox[1, 0] = x if x > bbox[1, 0] else bbox[1, 0]
                bbox[0, 1] = y if y < bbox[0, 1] else bbox[0, 1]
                bbox[1, 1] = y if y > bbox[1, 1] else bbox[1, 1]
            leaf.n_box = bbox

        # # update bounding box bottom-up
        # sorted() lost its ``cmp`` argument in Python 3; cmp_to_key gives
        # the same (stable) ordering on both 2.7 and 3 and still honours an
        # overridden cmp_node.
        tree = sorted(tree, key=cmp_to_key(self.cmp_node))
        logging.debug('updating box for each node in the tree...')
        for node in tree:
            if node.n_data is None:  # internal node: its data was released at split time
                node.n_box = np.zeros((2, 2))
                node.n_box[0,
                           0] = min(node.ne.n_box[0, 0], node.nw.n_box[0, 0],
                                    node.se.n_box[0, 0], node.sw.n_box[0, 0])
                node.n_box[0,
                           1] = min(node.ne.n_box[0, 1], node.nw.n_box[0, 1],
                                    node.se.n_box[0, 1], node.sw.n_box[0, 1])
                node.n_box[1,
                           0] = max(node.ne.n_box[1, 0], node.nw.n_box[1, 0],
                                    node.se.n_box[1, 0], node.sw.n_box[1, 0])
                node.n_box[1,
                           1] = max(node.ne.n_box[1, 1], node.nw.n_box[1, 1],
                                    node.se.n_box[1, 1], node.sw.n_box[1, 1])

    def cmp_node(self, node1, node2):
        """Old-style comparison that orders nodes by decreasing depth."""
        # reverse order
        return int(node2.n_depth - node1.n_depth)

    def getCoordinates(self, curr):
        """Split the sorted Hilbert values of ``curr`` into four consecutive runs.

        A primary split divides the data in two and each half is split
        again; every call to getSplit receives half of the node's split
        budget.

        Returns False when any split would leave a part empty, otherwise
        (split_1, split_sec1, split_sec2, nw_data, ne_data, sw_data, se_data).
        """
        budget_s = self.getSplitBudget()
        _data = curr.n_data
        _ndata = len(_data)
        split_1 = self.getSplit(_data, curr.n_box[0], curr.n_box[1],
                                budget_s[curr.n_depth] / 2)
        pos_1 = np.searchsorted(_data, split_1)
        if pos_1 == 0 or pos_1 == _ndata:
            return False
        data_1 = _data[:pos_1]
        data_2 = _data[pos_1:]
        split_sec1 = self.getSplit(data_1, curr.n_box[0], split_1,
                                   budget_s[curr.n_depth] / 2)
        split_sec2 = self.getSplit(data_2, split_1, curr.n_box[1],
                                   budget_s[curr.n_depth] / 2)
        pos_sec1 = np.searchsorted(data_1, split_sec1)
        pos_sec2 = np.searchsorted(data_2, split_sec2)

        if pos_sec1 == 0 or pos_sec1 == len(
                data_1) or pos_sec2 == 0 or pos_sec2 == len(data_2):
            return False
        nw_data, ne_data, sw_data, se_data = data_1[:pos_sec1], data_1[
            pos_sec1:], data_2[:pos_sec2], data_2[pos_sec2:]
        return split_1, split_sec1, split_sec2, nw_data, ne_data, sw_data, se_data
class KalmanFilterPID(Parser):
    """ generated source for class KalmanFilterPID """

    # sampling rate
    def __init__(self, param):
        """Initialise the filter with default Kalman and PID settings.

        ``param`` is stored as ``self.param``; its ``Seed`` attribute seeds the
        ``Differential`` object used by ``getCount`` to draw noise.
        """
        Parser.__init__(self)

        self.param = param
        self.differ = Differential(self.param.Seed)

        self.predict = []
        self.interval = None

        # Kalman filter state
        self.P = 100  # estimation error covariance (over all time instances)
        self.Q = 1000  # process noise (synthetic data)
        self.R = 1000000  # measurement noise, optimal for alpha = 1 (synthetic data)
        self.K = 0  # Kalman gain

        # PID control gains - defaults:
        #   Cp: proportional gain, keeps the output proportional to the current error
        #   Ci: integral gain, eliminates offset
        #   Cd: derivative gain, ensures stability / prevents large future error
        self.Cp, self.Ci, self.Cd = 0.9, 0.1, 0.0

        # fixed internally
        self.theta = 1  # magnitude of changes
        self.xi = 0.2  # gamma (10%)
        self.minIntvl = 1  # make sure the interval is greater than 1

        self.windowPID = 5  # I (integration) window
        self.ratioM = 0.2  # sampling rate

        self.isSampling = False


    def adjustParams(self):
        """Pick ``theta`` from the sampling ratio ``ratioM``.

        The ratio is bucketed at 0.1, 0.2, 0.3, 0.4 and 0.5; smaller ratios
        get a larger ``theta``.  ``theta`` is left untouched when no bucket
        matches (e.g. the ratio is NaN).
        """
        ratio = self.ratioM
        if ratio < 0.1:
            self.theta = 20
        elif ratio < 0.2:
            self.theta = 14
        elif ratio < 0.3:
            self.theta = 2
        elif ratio < 0.4:
            self.theta = 0.5
        elif ratio < 0.5:
            self.theta = 0.3
        elif ratio >= 0.5:
            self.theta = 0.1

    # test
    @classmethod
    def main(cls, args):
        """Command-line driver: filter one data series and write the result.

        ``args`` is an argv-style list:
        ``[prog, input, output, privacy-budget, process-variance, Cp, Ci, Cd]``
        where the three control gains are optional.  The gains are clamped so
        that ``Cp <= 1``, ``Cp + Ci <= 1`` and ``Cp + Ci + Cd <= 1``; omitted
        gains take whatever is left of 1.

        Prints a usage message and exits when arguments are missing or when
        the budget / process variance is not positive.
        """
        if len(args) < 5:
            print("Usage: python KalmanFilterPID.py input output privacy-budget process-variance Cp(optional) Ci(optional) Cd(optional)")
            sys.exit()

        # NOTE(review): eval() on a command-line argument executes arbitrary
        # code.  It is kept so that expressions such as "1.0/2" stay accepted;
        # switch to float() if that is not required.
        budget = eval(args[3])
        Q = float(args[4])
        if budget <= 0 or Q <= 0:
            print("Usage: privacy-budget AND process-variance are positive values")
            sys.exit()

        p = Params(1000)
        kfPID = KalmanFilterPID(p)
        kfPID.setTotalBudget(budget)
        kfPID.setQ(Q)

        kfPID.orig = Parser.getData(args[1])
        kfPID.publish = [None] * len(kfPID.orig)

        # adjust R based on T and alpha
        kfPID.setR(len(kfPID.orig) * len(kfPID.orig) / (0.0 + budget * budget))

        # Optional control gains.  The command-line values are strings and
        # must be converted before being compared or added to other gains.
        if len(args) >= 6:
            kfPID.setCp(min(float(args[5]), 1))

        if len(args) >= 7:
            d = float(args[6])
            if d + kfPID.Cp > 1:
                d = 1 - kfPID.Cp
            kfPID.setCi(d)
        else:
            kfPID.setCi(1 - kfPID.Cp)

        if len(args) >= 8:
            d = float(args[7])
            if d + kfPID.Cp + kfPID.Ci > 1:
                d = 1 - kfPID.Cp - kfPID.Ci
            kfPID.setCd(d)
        else:
            kfPID.setCd(1 - kfPID.Cp - kfPID.Ci)

        # kfPID.adjustParams()

        start = time.time()
        kfPID.publishCounts()
        end = time.time()

        # Open the output only once there is something to write, and make
        # sure the handle is closed (and flushed) afterwards.
        with open(args[2], "w") as output:
            Parser.outputData(output, kfPID.publish)

        print("Method:\tKalman Filter with Adaptive Sampling")
        print("Data Series Length:\t" + str(len(kfPID.orig)))
        print("Queries Issued:\t" + str(kfPID.query.count(1)))
        print("Privacy Budget Used:\t" + str(kfPID.query.count(1) * kfPID.epsilon))
        print("Average Relative Error:\t" + str(kfPID.getRelError()))
        print("Time Used (in second):\t" + str(end - start))

    def kalmanFilter(self, orig, budget, samplingRate=None):
        """Filter the series ``orig`` under a total privacy budget.

        Adaptive sampling is switched on only when ``samplingRate`` is given,
        in which case it replaces ``ratioM``.  The measurement noise ``R`` is
        reset to ``len(orig)^2 / budget^2`` before publishing.

        Returns the published series (``self.publish``).
        """
        self.totalBudget = budget
        self.orig = orig
        self.isSampling = samplingRate is not None
        if self.isSampling:
            self.ratioM = samplingRate

        # self.adjustParams()

        length = len(self.orig)
        self.publish = [None] * length

        # adjust R based on T and alpha
        self.setR(length * length / (0.0 + budget * budget))

        self.publishCounts()
        return self.publish

    def getCount(self, value, epsilon):
        """Return ``value`` either exactly or with noise added.

        For an epsilon below 1e-8 the value is returned unchanged; otherwise
        noise from ``self.differ.getNoise`` with sensitivity 1 is added, so
        the result may be negative.
        """
        if epsilon < 10 ** (-8):
            return value
        noise = self.differ.getNoise(1, epsilon)  # sensitivity is 1
        return value + noise


    # data publication procedure
    def publishCounts(self):
        """Fill ``self.publish`` with one released value per time point.

        Reads ``self.orig``, ``self.totalBudget``, ``self.isSampling`` and
        ``self.ratioM``; (re)creates ``self.query`` (1 marks the time points
        that were actually queried), ``self.predict`` and ``self.epsilon``.
        ``self.publish`` must already be a list as long as ``self.orig``.

        Queried points get a noisy count that is then corrected by the Kalman
        filter; all other points are released as the filter's prediction.
        """

        self.query = BitArray(len(self.orig))
        self.predict = [None] * len(self.orig)

        # M is the maximum number of queries; the total budget is split
        # evenly over them.
        if (self.isSampling):
            M = int(self.ratioM * (len(self.orig)))  # 0.25 optimal percentile
        else:
            M = len(self.orig)

        if M <= 0:
            M = 1
        self.epsilon = (self.totalBudget + 0.0) / M

        # error = 0
        self.interval = 1
        # first index handled by the adaptive branch below
        nextQuery = max(1, self.windowPID) + self.interval - 1

        for i in range(len(self.orig)):
            if i == 0:
                # the first time instance: always queried; correctKF leaves
                # publish[0] as the raw noisy count and only updates K and P
                self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                self.query[i] = 1
                self.correctKF(i, 0)
            else:
                predct = self.predictKF(i)
                self.predict[i] = predct
                if self.query.count(1) < self.windowPID and self.query.count(1) < M:
                    # warm-up: query every point until windowPID queries
                    # have been issued (or the query budget M is used up)

                    self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                    self.query[i] = 1

                    # update count using observation
                    self.correctKF(i, predct)
                elif i == nextQuery and self.query.count(1) < M:
                    # i is the scheduled sampling point and budget remains

                    # query
                    self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                    self.query[i] = 1

                    # update count using observation
                    self.correctKF(i, predct)

                    # update the sampling interval from the PID feedback error
                    if (self.isSampling):
                        ratio = self.PID(i)
                        # cap the exponent so math.exp cannot overflow
                        frac = min(20, (ratio - self.xi) / self.xi)
                        deltaI = self.theta * (1 - math.exp(frac))
                        # randomized rounding of deltaI to an integer
                        deltaI = int(deltaI) + (random.random() < deltaI - int(deltaI))
                        self.interval += deltaI
                    else:
                        self.interval = 1

                    if self.interval < self.minIntvl:
                        self.interval = self.minIntvl
                    nextQuery += self.interval  # nextQuery is ns in the paper
                else:
                    # not queried: release the prediction
                    self.publish[i] = predct

    # def postProcessing(self):
    # print len(self.samples), self.samples
    # remainedEps = self.totalBudget - len(self.samples) * self.epsilon
    # self.epsilon = self.epsilon + remainedEps/len(self.samples)
    #
    # # recompute noisy counts
    #     prev = 0
    #     for i in self.samples:
    #         self.publish[i] = self.getCount(self.orig[i], self.epsilon)
    #         if i > prev + 1:
    #             self.publish[prev + 1 : i] = [self.publish[prev]] * (i - prev - 1)
    #         prev = i

    def setR(self, r):
        """Set ``R``, the measurement noise used when computing the Kalman gain."""
        self.R = r

    def setQ(self, q):
        """Set ``Q``, the process noise added to ``P`` at every prediction step."""
        self.Q = q

    def setCp(self, cp):
        """Set ``Cp``, the proportional gain of the PID feedback error."""
        self.Cp = cp

    def setCi(self, ci):
        """Set ``Ci``, the integral gain of the PID feedback error."""
        self.Ci = ci

    def setCd(self, cd):
        """Set ``Cd``, the derivative gain of the PID feedback error."""
        self.Cd = cd

    # prediction step
    def predictKF(self, curr):
        """Prediction step of the Kalman filter for time point ``curr``.

        The prediction is the value published at the most recent queried
        point before ``curr``.  As a side effect the estimation error ``P``
        grows by the process noise ``Q``.
        """
        prediction = self.getLastQuery(curr)
        # project the estimation error forward (Q is gaussian noise)
        self.P = self.P + self.Q
        return prediction

    # correction step
    def correctKF(self, curr, predict):
        """Correction step of the Kalman filter for time point ``curr``.

        Recomputes the gain ``K = P / (P + R)``, blends the prediction with
        the observation stored in ``publish[curr]`` and shrinks ``P`` by the
        factor ``1 - K``.  The blended value is written back only for
        ``curr > 0``; the first point keeps its raw observation.
        """
        gain = (self.P + 0.0) / (self.P + self.R)
        self.K = gain

        observed = self.publish[curr]
        blended = predict + gain * (observed - predict)

        # only correct from the 2nd value onwards
        if curr > 0:
            self.publish[curr] = blended

        # update estimation error variance
        self.P *= (1 - gain)

    def getLastQuery(self, curr):
        """Return the value published at the latest queried point before ``curr``.

        Scans indices ``curr - 1`` down to 0 and stops at the first one marked
        in ``self.query``; if none is marked, index 0 is used.
        """
        for i in range(curr - 1, -1, -1):
            if self.query[i]:
                break
        return self.publish[i]

    # adaptive sampling - return feedback error
    def PID(self, curr):
        """Adaptive sampling: return the PID feedback error at time ``curr``.

        Walks backwards from ``curr`` over the last ``windowPID`` queried
        points (those marked in ``self.query``) and measures, for each, the
        relative error ``|publish - predict| / max(publish, 1)``.

        The result is ``Cp * e_last + Ci * sum(e) + Cd * (e_last - e_prev) / dt``
        where ``e_last`` is the error at the most recent queried point,
        ``e_prev`` the error at the one before it and ``dt`` the number of time
        steps between the two.

        The previous loop ran over ``range(windowPID - 1)``, so the branch for
        the most recent sample never executed: the proportional term was
        always 0 and the derivative term was computed from a single sample.
        The window now covers ``windowPID`` samples.  The walk stops early if
        fewer samples exist, and the derivative term is 0 when no earlier
        sample is available.
        """

        def relative_error(idx):
            return abs(self.publish[idx] - self.predict[idx]) / (0.0 + max(self.publish[idx], 1))

        window = self.windowPID
        error_sum = 0
        last_error = 0
        last_index = None
        change = 0
        time_diff = 0
        cursor = curr
        for j in reversed(range(window)):
            # locate the most recent queried point at or before ``cursor``
            index = cursor
            while index >= 0 and not self.query[index]:
                index -= 1
            if index < 0:
                break  # fewer than ``window`` queried points exist
            cursor = index - 1  # continue the search before this sample

            error = relative_error(index)
            if j == window - 1:
                # most recent sample: proportional term
                last_error = error
                last_index = index
            elif j == window - 2:
                # previous sample: derivative term
                change = last_error - error
                time_diff = last_index - index
            error_sum += error

        derivative = change / (0.0 + time_diff) if time_diff else 0.0
        ratio = self.Cp * last_error + self.Ci * error_sum + self.Cd * derivative
        return ratio
Example #17
0
class Generic(object):
    """
    Generic data structure, used for both htree and grid
    """

    def __init__(self, data, param):
        """Create the index with a single root node holding all of ``data``.

        The root's bounding box is built from ``param.LOW`` and ``param.HIGH``.
        """
        self.param = param
        self.differ = Differential(self.param.Seed)

        # initialize the root
        root = Node()
        root.n_data = data
        root.n_box = np.array([param.LOW, param.HIGH])
        self.root = root

    def getEqualSplit(self, partitions, min, max):
        """Return ``partitions + 1`` equally spaced split points from ``min`` to ``max``.

        Both ends are included.  With one partition or fewer the result is
        simply ``[min, max]``.  ``min > max`` is only logged, not rejected.
        """
        if min > max:
            logging.debug("getEqualSplit: Error: min > max")
        if partitions <= 1:
            return [min, max]

        span = max - min
        points = []
        for step in range(partitions + 1):
            points.append(min + span * step / partitions)
        return points

    def getCountBudget(self):
        """return noisy count budget for different levels of the indices"""
        raise NotImplementedError

    def getCoordinates(self, curr):
        """return the split dimension, the split points and the data points in each subnodes"""
        raise NotImplementedError

    def getCount(self, curr, epsilon):
        """
	return true count or noisy count of a node, depending on epsilon. 
	Note that the noisy count can be negative
	"""
        if curr.n_data is None:
            count = 0
        else:
            count = curr.n_data.shape[1]

        if epsilon < 10 ** (-6):
            return count
        else:
            return count + self.differ.getNoise(1, epsilon)

    def testLeaf(self, curr):
        """test whether a node is a leaf node"""
        raise NotImplementedError

    def intersect(self, hrect, query):
        """Tell whether ``hrect`` and ``query`` overlap in every dimension.

        Both arguments are 2-row arrays: row 0 holds the lower corner and
        row 1 the upper corner.  Rectangles that merely touch on a boundary
        are reported as not intersecting.
        """
        starts_beyond = query[0, :] >= hrect[1, :]
        ends_before = query[1, :] <= hrect[0, :]
        separated = np.logical_or(starts_beyond, ends_before)
        return not np.any(separated)

    def buildIndex(self):
        """Build the htree / grid structure breadth-first from the root.

        htree is a high fanout and low level tree.  Every node taken from the
        queue is either finalised as a leaf or split into the sub-nodes
        described by ``getCoordinates``; each new sub-node gets a count
        computed with the count budget of its depth.
        """
        budget_c = self.getCountBudget()  # count budget, indexed by node depth
        # epsilon 0 is below getCount's threshold, so the root keeps its exact count
        self.root.n_count = self.getCount(self.root, 0)
        queue = deque()
        queue.append(self.root)
        nleaf = 0  # number of leaf node, for debug only
        # ## main loop
        while len(queue) > 0:
            curr = queue.popleft()

            if self.testLeaf(curr) is True:  # if curr is a leaf node
                if curr.n_depth < self.param.maxHeightHTree:
                    # a leaf above the maximum height is re-counted with the
                    # budget of its own level plus that of all deeper levels
                    remainingEps = sum(budget_c[curr.n_depth:])
                    curr.n_count = self.getCount(curr, remainingEps)
                    curr.eps = remainingEps
                nleaf += 1
                curr.n_isLeaf = True

            else:  # curr needs to split
                split_arr, n_data_arr = self.getCoordinates(curr)
                if split_arr is None:
                    # no split points were produced: treat curr as a leaf
                    if curr.n_depth < self.param.maxHeightHTree:
                        remainingEps = sum(budget_c[curr.n_depth:])
                        curr.n_count = self.getCount(curr, remainingEps)
                        curr.eps = remainingEps
                    nleaf += 1
                    curr.n_isLeaf = True
                    curr.children = []
                    continue  # if the first level cell is leaf node
                for i in range(len(n_data_arr)):
                    node = Node()
                    # sub-node i spans split_arr[i]..split_arr[i + 1] along the
                    # split dimension and inherits curr's extent in the other
                    if curr.n_depth % Params.NDIM == 0:  # split by x coord
                        node.n_box = np.array([[split_arr[i], curr.n_box[0, 1]], [split_arr[i + 1], curr.n_box[1, 1]]])
                    else:  # split by y coord
                        node.n_box = np.array([[curr.n_box[0, 0], split_arr[i]], [curr.n_box[1, 0], split_arr[i + 1]]])

                    node.index = i
                    node.parent = curr
                    node.n_depth = curr.n_depth + 1
                    node.n_data = n_data_arr[i]
                    node.n_count = self.getCount(node, budget_c[node.n_depth])
                    node.eps = budget_c[node.n_depth]
                    if curr.n_depth == 2:
                        node.secondLevelPartitions = curr.secondLevelPartitions
                    curr.children.append(node)
                    queue.append(node)

                # if curr.n_depth == 2:
                # self.children.append(curr)

                curr.n_data = None  # ## do not need the data points coordinates now
        # end of while      
        logging.debug("Generic: number of leaves: %d" % nleaf)


        # canonical range query does apply

    def rangeCount(self, query):
        """
        Query answering function. Find the number of data points within a query rectangle.
        This function assume that the tree is contructed with noisy count for every node
        """
        queue = deque()
        queue.append(self.root)
        count = 0.0
        while len(queue) > 0:
            curr = queue.popleft()
            _box = curr.n_box
            if curr.n_isLeaf is True:
                frac = 1
                if self.intersect(_box, query):
                    for i in range(_box.shape[1]):
                        if _box[1, i] == _box[0, i] or Params.WorstCase == True:
                            frac *= 1
                        else:
                            frac *= (min(query[1, i], _box[1, i]) - max(query[0, i], _box[0, i])) / (
                                _box[1, i] - _box[0, i])
                    count += curr.n_count * frac
            else:  # if not leaf
                for node in curr.children:
                    bool_matrix = np.zeros((2, query.shape[1]))
                    bool_matrix[0, :] = query[0, :] <= _box[0, :]
                    bool_matrix[1, :] = query[1, :] >= _box[1, :]

                    if np.all(bool_matrix):  # if query range contains node range
                        count += node.n_count
                    elif self.intersect(_box, query):
                        queue.append(node)
        return float(count)


    def leafCover(self, loc):
        """
        find a leaf node that cover the location
        """
        queue = deque()
        queue.append(self.root)
        while len(queue) > 0:
            curr = queue.popleft()
            _box = curr.n_box
            if curr.n_isLeaf is True:
                if is_rect_cover(_box, loc):
                    return curr
            else:  # if not leaf
                queue.extend(curr.children)


    def checkCorrectness(self, node, nodePoints=None):
        """
        Total number of data points of all leaf nodes should equal to the total data points
        """
        totalPoints = 0
        if node is None:
            return 0
        if node.n_isLeaf and node.n_data is not None:
            return node.n_data.shape[1]
        for child in node.children:
            totalPoints += self.checkCorrectness(child)

        if nodePoints is None:
            return totalPoints

        if totalPoints == nodePoints:
            return True
        return False
Example #18
0
def main():
    """Compare two geometries and write the differential plots to a PDF.

    Command-line options select the baseline and comparison geometry, the
    top-level volume, the statistic to display and, optionally, directories
    from which the geometry ``.root`` files are linked into the working
    directory for the duration of the run.

    The cleanup step used to run ``rm`` on the files inside ``--basepath`` /
    ``--geompath`` (i.e. the originals) instead of on the temporary links.
    Only links created by this run are removed now, and they are removed even
    when the comparison raises.  Links are created with ``os.symlink`` rather
    than through shell strings built from the options.
    """

    #
    # Need to parse command line arguments first, because PyROOT is going
    # to muck up the usage as soon as a root class is loaded.
    #
    parser = OptionParser()
    parser.add_option("-b",
                      "--base",
                      dest="baseline",
                      help="Set the baseline geometry [required]",
                      metavar="BASE",
                      default="NONE")
    parser.add_option("-g",
                      "--geom",
                      dest="geometry",
                      help="Set the comparison geometry [required]",
                      metavar="GEOM",
                      default="NONE")

    parser.add_option(
        "--basename",
        dest="basename",
        help=
        "Set the name of the baseline geometry if different from base [optional]",
        metavar="BASENAME",
        default="same")
    parser.add_option(
        "--geomname",
        dest="geomname",
        help=
        "Set the name of the comparsion geometry if different from geom [optional]",
        metavar="GEOMNAME",
        default="same")

    parser.add_option("-v",
                      "--volume",
                      dest="volume",
                      help="Set the top level volume [required]",
                      metavar="VOLUME",
                      default="CAVE")
    parser.add_option("--basepath", dest="basepath", default="NONE")
    parser.add_option("--geompath", dest="geompath", default="NONE")

    parser.add_option("--stat",
                      dest="stat",
                      default="radlen",
                      help="Statistic to display")

    parser.add_option(
        "--thumbnail",
        dest="thumbnail",
        default=False,
        action="store_true",
        help="Creates thumbnails of the front page of the PDF file.")
    parser.add_option("--size",
                      dest="size",
                      default=False,
                      help="Sets the size of the thumbnail, e.g. 850x1100")

    (opts, args) = parser.parse_args()

    # Both geometries are mandatory; show the usage from this parser instead
    # of re-launching the script through the shell.
    for value, label in ((opts.baseline, "baseline"),
                         (opts.geometry, "comparison")):
        if value == "NONE":
            print("")
            print("Must specify a " + label + " geometry.")
            print("")
            parser.print_help()
            return

    from Differential import Differential, _file_path
    from Differential import get_geom_file
    from Canvas import CanvasPDF

    from ROOT import TFile
    from ROOT import TGeoManager
    from ROOT import TGeoVolume
    from ROOT import TGeoNode

    from ROOT import kWhite
    from ROOT import gStyle

    gStyle.SetHistMinimumZero()
    gStyle.SetCanvasColor(kWhite)

    # Setup temporary symbolic links to the root files.  Only links that were
    # actually created here are remembered for removal, so a pre-existing
    # file of the same name in the working directory is never deleted.
    created_links = []
    for directory, stem in ((opts.basepath, opts.baseline),
                            (opts.geompath, opts.geometry)):
        if directory == "NONE":
            continue
        link_name = stem + ".root"
        try:
            os.symlink(os.path.join(directory, link_name), link_name)
        except OSError as err:
            print("Could not link " + link_name + ": " + str(err))
        else:
            created_links.append(link_name)

    try:
        canvas = CanvasPDF(name="differential-" + opts.baseline + "-vs-" +
                           opts.geometry + "-" + opts.volume,
                           title="Geometry differential for volume=" +
                           opts.volume + " " + opts.baseline + " vs " +
                           opts.geometry,
                           nx=1,
                           ny=1,
                           thumbnail=opts.thumbnail)

        # Constructing the object performs the comparison; the binding keeps
        # it alive until the links have been cleaned up.
        differ = Differential(base=opts.baseline,
                              comp=opts.geometry,
                              top=opts.volume,
                              basegeo=opts.basename,
                              compgeo=opts.geomname,
                              canvas=canvas,
                              stat=opts.stat)
    finally:
        # Remove temporary symbolic links to the root files
        for link_name in created_links:
            if os.path.islink(link_name):
                os.remove(link_name)
Example #19
0
class KTree(object):
    """Generic tree template"""

    def __init__(self, data, param):
        """Create the tree with a single root node holding all of ``data``.

        The root spans ``Params.LOW`` .. ``Params.HIGH`` and starts with a
        split budget of ``Params.maxHeight``.
        """
        self.param = param
        self.differ = Differential(self.param.Seed)
        # ## initialize the root
        root = KNode()
        root.n_data = data
        root.n_box = np.array([Params.LOW, Params.HIGH])
        root.n_budget = Params.maxHeight
        self.root = root

    def getSplitBudget(self):
        """return a list of h budget values for split"""
        raise NotImplementedError

    def getCountBudget(self):
        """return a list of (h+1) budget values for noisy count"""
        raise NotImplementedError

    def getNoisyMedian(self, array, left, right, epsilon):
        """return the split value of an array"""
        raise NotImplementedError

    def getCoordinates(self, curr):
        """
        return the coordinate of lower-right point of the NW sub-node
        and the upper-left point of the SW sub-node and the data points
        in the four subnodes, i.e.
        return (x_nw,y_nw),(x_se,y_se), nw_data, ne_data, sw_data, se_data
        """
        raise NotImplementedError

    def getSplit(self, array, left, right, epsilon):
        """
        return the split point given an array, may be data-independent or
        true median or noisy median, depending on the type of the tree
        """
        raise NotImplementedError

    def getCount(self, curr, epsilon):
        """ return true count or noisy count of a node, depending on epsilon"""
        if curr.n_data is None:
            count = 0
        else:
            count = curr.n_data.shape[1]
        if epsilon < 10 ** (-6):
            return count
        else:
            return count + self.differ.getNoise(1, epsilon)

    def testLeaf(self, curr):
        """Tell whether ``curr`` should become a leaf node.

        A node is a leaf when it sits at the maximum height, has no split
        budget left, holds no data points, or its count does not exceed
        ``param.minPartSize``.  Always returns the literal ``True`` or
        ``False`` because callers compare the result with ``is True``.
        """
        if curr.n_depth == Params.maxHeight:
            return True
        if curr.n_budget <= 0:
            return True
        points = curr.n_data
        if points is None or points.shape[1] == 0:
            return True
        if curr.n_count <= self.param.minPartSize:
            return True
        return False

    def cell_setLeaf(self, curr):
        """ will be overrided in kd_cell """
        return

    def buildIndex(self):
        """Build the tree structure, fanout = 4 by default for spatial (2D) data.

        Nodes are processed first-in first-out (``stack`` is a deque consumed
        with ``popleft``).  Each node is either finalised as a leaf or split
        into NW / NE / SW / SE sub-nodes using ``getCoordinates``; every
        sub-node receives a count computed with the count budget of its depth.
        """
        budget_c = self.getCountBudget()
        self.root.n_count = self.getCount(self.root, budget_c[0])  # ## add noisy count to root
        stack = deque()
        stack.append(self.root)
        nleaf = 0  # ## leaf counter
        max_depth = -1
        # ## main loop
        while len(stack) > 0:
            curr = stack.popleft()
            if curr.n_depth > max_depth:
                max_depth = curr.n_depth

            if self.testLeaf(curr) is True:  # ## curr is a leaf node
                if curr.n_depth < Params.maxHeight:  # ## if a node ends up earlier than maxHeight, it should be able to use the remaining count budget
                    # the count is recomputed with the budget of all deeper levels
                    remainingEps = sum(budget_c[curr.n_depth + 1:])
                    curr.n_count = self.getCount(curr, remainingEps)
                nleaf += 1
                curr.n_isLeaf = True
                self.cell_setLeaf(curr)

            else:  # ## curr needs to split
                curr.n_budget -= 1  # ## some budget will be used regardless the split is successful or not
                tmp = self.getCoordinates(curr)
                nw_node, ne_node, sw_node, se_node = KNode(), KNode(), KNode(), KNode()  # create sub-nodes
                # tmp holds two split coordinates followed by the data of the four sub-nodes
                nw_coord, ne_coord, nw_node.n_data, ne_node.n_data, sw_node.n_data, se_node.n_data = tmp
                x_nw, y_nw = nw_coord
                x_se, y_se = ne_coord
                # ## update bounding box, depth, count, budget for the four subnodes
                nw_node.n_box = np.array([[curr.n_box[0, 0], y_nw], [x_nw, curr.n_box[1, 1]]])
                ne_node.n_box = np.array([[x_nw, y_se], [curr.n_box[1, 0], curr.n_box[1, 1]]])
                sw_node.n_box = np.array([[curr.n_box[0, 0], curr.n_box[0, 1]], [x_se, y_nw]])
                se_node.n_box = np.array([[x_se, curr.n_box[0, 1]], [curr.n_box[1, 0], y_se]])

                for sub_node in [nw_node, ne_node, sw_node, se_node]:
                    sub_node.n_depth = curr.n_depth + 1
                    # if (sub_node.n_depth == Params.maxHeight and sub_node.n_data is not None):
                    # print len(sub_node.n_data[0])
                    sub_node.n_count = self.getCount(sub_node, budget_c[sub_node.n_depth])
                    sub_node.n_budget = curr.n_budget
                    stack.append(sub_node)

                curr.n_data = None  # ## do not need the data points coordinates now
                curr.nw, curr.ne, curr.sw, curr.se = nw_node, ne_node, sw_node, se_node
        # end of while

        logging.debug("number of leaves: %d" % nleaf)
        logging.debug("max depth: %d" % max_depth)

    def rect_intersect(self, hrect, query):
        """Tell whether ``hrect`` and ``query`` overlap in every dimension.

        Both arguments are 2-row arrays: row 0 holds the lower corner and
        row 1 the upper corner.  Rectangles that only share a boundary are
        reported as not intersecting.
        """
        starts_beyond = query[0, :] >= hrect[1, :]
        ends_before = query[1, :] <= hrect[0, :]
        separated = np.logical_or(starts_beyond, ends_before)
        return not np.any(separated)

    def rangeCount(self, query):
        """
        Query answering function. Find the number of data points within a query rectangle.
        """
        stack = deque()
        stack.append(self.root)
        count = 0.0
        # ## Below are three variables recording the number of 1) whole leaf 2) partial leaf 3) whole internal node,
        # ## respectively, which contribute to the query answer. For debug purpose only.
        l_whole, l_part, i_whole = 0, 0, 0

        while len(stack) > 0:
            curr = stack.popleft()
            _box = curr.n_box
            if curr.n_isLeaf is True:
                frac = 1
                if self.rect_intersect(_box, query):
                    for i in range(_box.shape[1]):
                        if _box[1, i] == _box[0, i] or Params.WorstCase == True:
                            frac *= 1
                        else:
                            frac *= (min(query[1, i], _box[1, i]) - max(query[0, i], _box[0, i])) / (
                                _box[1, i] - _box[0, i])
                    count += curr.n_count * frac
                    if 1.0 - frac < 10 ** (-6):
                        l_whole += 1
                    else:
                        l_part += 1

            else:  # ## if not leaf
                bool_matrix = np.zeros((2, query.shape[1]))
                bool_matrix[0, :] = query[0, :] <= _box[0, :]
                bool_matrix[1, :] = query[1, :] >= _box[1, :]

                if np.all(bool_matrix) and self.param.useLeafOnly is False:  # ## if query range contains node range
                    count += curr.n_count
                    i_whole += 1
                else:
                    if self.rect_intersect(curr.nw.n_box, query):
                        stack.append(curr.nw)
                    if self.rect_intersect(curr.ne.n_box, query):
                        stack.append(curr.ne)
                    if self.rect_intersect(curr.sw.n_box, query):
                        stack.append(curr.sw)
                    if self.rect_intersect(curr.se.n_box, query):
                        stack.append(curr.se)

        return float(count)  # , i_whole, l_whole, l_part

    def adjustConsistency(self):
        """ 
        Post processing for uniform noise across levels. Due to 
        Michael Hay, Vibhor Rastogi, Gerome Miklau, Dan Suciu, 
        Boosting the Accuracy of Differentially-Private Histograms Through Consistency,
        VLDB 2010
        """
        logging.debug('adjusting consistency...')
        # ## upward pass
        self.root.get_z()
        # ## downward pass
        queue = deque()
        queue.append(self.root)
        while len(queue) > 0:
            curr = queue.popleft()
            if curr.n_isLeaf is False:
                adjust = (curr.n_count - curr.nw.n_count - curr.ne.n_count - curr.sw.n_count - curr.se.n_count) / 4.0
                for subnode in [curr.nw, curr.ne, curr.sw, curr.se]:
                    subnode.n_count += adjust
                    queue.append(subnode)

    def postProcessing(self):
        """ 
        Post processing for general noise distribution across levels. Due to
        G. Cormode, M. Procopiuc, E. Shen, D. Srivastava and T. Yu, 
        Differentially Private Spatial Decompositions, ICDE 2012.

        Rewrites ``n_count`` on every node in three passes over the tree and
        sets the helper attribute ``n_F`` on every node.  The middle pass is
        delegated to ``root.update_count()``.
        """
        logging.debug("post-processing...")
        budget = self.getCountBudget()  # ## count budget for h+1 levels
        H = Params.maxHeight
        # ## Phase 1 (top-down)
        # each count is weighted by the squared budget of its level and
        # accumulated along the path from the root
        queue = deque()
        self.root.n_count *= budget[self.root.n_depth] ** 2
        queue.append(self.root)
        while len(queue) > 0:
            curr = queue.popleft()
            if curr.n_isLeaf is False:
                for subnode in [curr.nw, curr.ne, curr.sw, curr.se]:
                    subnode.n_count = curr.n_count + subnode.n_count * (budget[subnode.n_depth] ** 2)
                    queue.append(subnode)
        # ## Phase 2 (bottom-up)
        self.root.update_count()
        # ## Phase 3 (top-down)
        queue = deque()
        # normaliser for the root: sum over i = 0..H of 4^i * budget[H - i]^2
        E_root = 0
        for i in range(H + 1):
            E_root += 4 ** i * budget[H - i] * budget[H - i]
        self.root.n_count /= E_root
        self.root.n_F = 0
        queue.append(self.root)
        while len(queue) > 0:
            curr = queue.popleft()
            if curr.n_isLeaf is False:
                h = H - curr.n_depth - 1  # ## height of curr's children
                # same normaliser, restricted to the h + 1 levels below curr
                E_h = 0
                for i in range(h + 1):
                    E_h += 4 ** i * budget[H - i] * budget[H - i]
                for subnode in [curr.nw, curr.ne, curr.sw, curr.se]:
                    # n_F accumulates the budget-weighted final counts of the ancestors
                    subnode.n_F = curr.n_F + curr.n_count * (budget[curr.n_depth] ** 2)
                    subnode.n_count = (subnode.n_count - 4 ** h * subnode.n_F) / E_h
                    queue.append(subnode)

    def pruning(self):
        """
        If the tree is grown without the stopping condition of minLeafSize, prune it here after post processing
        """
        logging.debug("pruning...")
        queue = deque()
        queue.append(self.root)
        while len(queue) > 0:
            curr = queue.popleft()
            if curr.n_isLeaf is False:
                if curr.n_count <= self.param.minPartSize:
                    curr.n_isLeaf = True
                else:
                    queue.append(curr.nw)
                    queue.append(curr.ne)
                    queue.append(curr.sw)
                    queue.append(curr.se)
Example #20
0
class Kd_cell(Kd_pure):
    """ Kd tree based on syntatic data generation and a grid structure. See
    Y. Xiao, L. Xiong, and C. Yuan, Differentially private data release
    through multidimensional partitioning, in SDM Workshop, VLDB, 2010
    """

    def __init__(self, data, param):
        """Store the raw data and parameters and create an empty root node.

        The grid map (``self.mapp``) and the root bounding box are left as
        ``None`` here; ``synthetic_gen`` fills both in.
        """
        self.param = param
        self.realData = data
        self.mapp = None
        self.differ = Differential(param.Seed)

        root = KNode()
        root.n_box = None
        root.n_budget = Params.maxHeight
        self.root = root

    def getCountBudget(self):
        """Return the per-level count budget as a list of ``Params.maxHeight + 1`` values.

        Half of ``self.param.Eps`` is distributed over the levels according to
        ``self.param.geoBudget``: ``'none'`` gives every level the same share,
        the other schemes give level ``i`` a share proportional to
        ``2 ** (g * i)`` with ``g`` = 1 (aggressive), 1/2 (quadratic),
        1/3 (optimal) or 1/4 (quartic).  An unknown scheme is logged and the
        process exits with status 1.
        """
        count_eps = self.param.Eps * 0.5
        H = Params.maxHeight
        scheme = self.param.geoBudget

        if scheme == 'none':
            return [count_eps / (H + 1) for _ in range(H + 1)]

        if scheme == 'aggressive':
            growth = 1
            unit = count_eps / (2 ** (H + 1) - 1)
        elif scheme == 'quadratic':
            growth = 0.5
            unit = count_eps * (np.sqrt(2) - 1) / (2 ** (0.5 * (H + 1)) - 1)
        elif scheme == 'optimal':
            growth = 1.0 / 3
            unit = count_eps * ((2 ** (1.0 / 3)) - 1) / (2 ** ((1.0 / 3) * (H + 1)) - 1)
        elif scheme == 'quartic':
            growth = 1.0 / 4
            unit = count_eps * ((2 ** (1.0 / 4)) - 1) / (2 ** ((1.0 / 4) * (H + 1)) - 1)
        else:
            logging.error('No such geoBudget scheme')
            sys.exit(1)

        return [unit * 2 ** (growth * i) for i in range(H + 1)]

    def synthetic_gen(self):
        """Build the noisy grid map ``self.mapp`` and set the root bounding box.

        The domain ``[Params.LOW, Params.HIGH]`` is snapped outwards to
        multiples of ``Params.unitGrid`` and becomes ``self.root.n_box``.  The
        first ``Params.NDATA`` columns of ``self.realData`` are histogrammed
        on that grid, then every cell receives rounded noise drawn with half
        of the privacy budget (``0.5 * self.param.Eps``).  Negative noisy
        counts are reset to 0.
        """
        logging.debug('generating synthetic map...')
        data = self.realData
        unit = Params.unitGrid
        x_min = np.floor(Params.LOW[0] / unit) * unit
        x_max = np.ceil(Params.HIGH[0] / unit) * unit
        y_min = np.floor(Params.LOW[1] / unit) * unit
        y_max = np.ceil(Params.HIGH[1] / unit) * unit

        x_CELL = int(np.rint((x_max - x_min) / unit))
        y_CELL = int(np.rint((y_max - y_min) / unit))

        self.root.n_box = np.array([[x_min, y_min], [x_max, y_max]])

        # Plain zero-initialised histogram.  (An "empty" marker is unnecessary:
        # an empty cell ends up as 0 + noise either way.)
        self.mapp = np.zeros((x_CELL, y_CELL))
        for i in range(Params.NDATA):  # ## populate the map
            point = data[:, i]
            # A point lying exactly on the upper domain edge would index one
            # past the last cell; count it in the last cell instead.
            cell_x = min(int(np.floor((point[0] - x_min) / unit)), x_CELL - 1)
            cell_y = min(int(np.floor((point[1] - y_min) / unit)), y_CELL - 1)
            self.mapp[cell_x, cell_y] += 1

        # Noise is drawn cell by cell in row-major order so that a seeded
        # generator produces the same map on every run.
        for i in range(x_CELL):  # ## perturb the counts
            for j in range(y_CELL):
                self.mapp[i, j] += np.rint(self.differ.getNoise(1, 0.5 * self.param.Eps))
                # if noisy count is negative, ignore the noise and generate no points
                if self.mapp[i, j] < 0:
                    self.mapp[i, j] = 0

    def cell_setLeaf(self, curr):
        """Reset ``curr.n_count`` to 0, discarding the count derived from the synthetic map."""
        curr.n_count = 0

    def testLeaf(self, curr):
        if (curr.n_count <= self.param.minPartSize) or (curr.n_depth == Params.maxHeight) or (
                self.uniform_test(curr, self.param.cellDistance)):
            return True
        return False

    def uniform_test(self, curr, distance):
        """Stopping condition: is the grid window under ``curr`` uniform enough?

        The node box is converted to grid indices relative to the root box,
        and the summed absolute deviation of the covered cells from their mean
        is compared with ``distance``.  Returns True when the deviation does
        not exceed ``distance``.
        """
        cell = Params.unitGrid
        origin_x = self.root.n_box[0, 0]
        origin_y = self.root.n_box[0, 1]
        col_lo = int(np.rint((curr.n_box[0, 0] - origin_x) / cell))
        col_hi = int(np.rint((curr.n_box[1, 0] - origin_x) / cell))
        row_lo = int(np.rint((curr.n_box[0, 1] - origin_y) / cell))
        row_hi = int(np.rint((curr.n_box[1, 1] - origin_y) / cell))

        window = self.mapp[col_lo:col_hi, row_lo:row_hi]
        mean = np.sum(window) / ((col_hi - col_lo) * (row_hi - row_lo))
        deviation = np.sum(np.abs(window - mean))
        return False if deviation > distance else True

    def buildIndex(self):
        """Grow the tree breadth-first from the noisy grid map.

        The root count is the sum of ``self.mapp``.  Each dequeued node is
        either turned into a leaf (``testLeaf``) and has its count reset via
        ``cell_setLeaf``, or is split into four children whose boxes and
        counts come from ``getCoordinates``.
        """
        pending = deque([self.root])
        leaf_total = 0  # leaf counter
        deepest = -1
        self.root.n_count = np.sum(self.mapp)

        while pending:
            node = pending.popleft()
            if node.n_depth > deepest:
                deepest = node.n_depth

            if self.testLeaf(node) is True:  # node is a leaf
                leaf_total += 1
                node.n_isLeaf = True
                self.cell_setLeaf(node)
                continue

            # node needs to split
            node.n_budget -= 1
            first_pt, second_pt, quad_counts = self.getCoordinates(node)
            ax, ay = first_pt
            bx, by = second_pt
            x0, y0 = node.n_box[0, 0], node.n_box[0, 1]
            x1, y1 = node.n_box[1, 0], node.n_box[1, 1]

            # child boxes in the order nw, ne, sw, se (same order as quad_counts)
            boxes = (
                np.array([[x0, ay], [ax, y1]]),
                np.array([[ax, by], [x1, y1]]),
                np.array([[x0, y0], [bx, ay]]),
                np.array([[bx, y0], [x1, by]]),
            )
            children = [KNode(), KNode(), KNode(), KNode()]
            for slot, child in enumerate(children):
                child.n_box = boxes[slot]
                child.n_depth = node.n_depth + 1
                child.n_count = quad_counts[slot]
                child.n_budget = node.n_budget
                pending.append(child)
            node.nw, node.ne, node.sw, node.se = children

        logging.debug("number of leaves: %d" % leaf_total)
        logging.debug("max depth: %d" % deepest)

    def getCoordinates(self, curr):
        """Choose the split points for ``curr`` from the noisy grid map.

        The primary split dimension alternates with depth
        (``curr.n_depth % Params.NDIM``).  The primary split is placed at the
        first grid line where the cumulative cell sum reaches at least half of
        the node's total; each of the two halves is then split the same way
        along the other dimension.

        Returns a 3-tuple ``(p1, p2, counts)`` where ``p1`` and ``p2`` are
        ``(x, y)`` split coordinates consumed by ``buildIndex`` and ``counts``
        is ``(n_nw, n_ne, n_sw, n_se)``, the map sums of the four quadrants.

        NOTE(review): each search loop leaves its index variable (``i``, ``j``,
        ``k``) to be used after the loop; if a range is empty the variable is
        never bound and the following line raises NameError.  Confirm callers
        never pass a node spanning zero grid cells.
        """
        dim_1 = curr.n_depth % Params.NDIM  # primary split dimension
        UNIT = Params.unitGrid
        # node box expressed as grid-cell indices relative to the root box
        x_min = int(np.rint((curr.n_box[0, 0] - self.root.n_box[0, 0]) / UNIT))
        x_max = int(np.rint((curr.n_box[1, 0] - self.root.n_box[0, 0]) / UNIT))
        y_min = int(np.rint((curr.n_box[0, 1] - self.root.n_box[0, 1]) / UNIT))
        y_max = int(np.rint((curr.n_box[1, 1] - self.root.n_box[0, 1]) / UNIT))

        total = np.sum(self.mapp[x_min:x_max, y_min:y_max])
        if dim_1 == 0:
            # primary split along x: grow the west strip column by column
            for i in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_max]) >= total / 2:
                    break
            split_prm = (x_min + i + 1) * UNIT + self.root.n_box[0, 0]

            half_1 = np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_max])  # west half
            half_2 = np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_max])  # east half
            # secondary split of the west half along y
            for j in range(y_max - y_min):
                if np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_min + j + 1]) >= half_1 / 2:
                    break
            split_sec1 = self.root.n_box[0, 1] + (y_min + j + 1) * UNIT
            n_sw = np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_min + j + 1])
            n_nw = np.sum(self.mapp[x_min:x_min + i + 1, y_min + j + 1:y_max])
            # secondary split of the east half along y
            for k in range(y_max - y_min):
                if np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_min + k + 1]) >= half_2 / 2:
                    break
            split_sec2 = self.root.n_box[0, 1] + (y_min + k + 1) * UNIT
            n_se = np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_min + k + 1])
            n_ne = np.sum(self.mapp[x_min + i + 1:x_max, y_min + k + 1:y_max])
            return (split_prm, split_sec1), (split_prm, split_sec2), (n_nw, n_ne, n_sw, n_se)

        else:
            # primary split along y: grow the south strip row by row
            for i in range(y_max - y_min):
                if np.sum(self.mapp[x_min:x_max, y_min:y_min + i + 1]) >= total / 2:
                    break
            split_prm = self.root.n_box[0, 1] + (y_min + i + 1) * UNIT

            half_1 = np.sum(self.mapp[x_min:x_max, y_min:y_min + i + 1])  # south half
            half_2 = np.sum(self.mapp[x_min:x_max, y_min + i + 1:y_max])  # north half
            # secondary split of the south half along x
            for j in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + j + 1, y_min:y_min + i + 1]) >= half_1 / 2:
                    break
            split_sec1 = (x_min + j + 1) * UNIT + self.root.n_box[0, 0]
            n_sw = np.sum(self.mapp[x_min:x_min + j + 1, y_min:y_min + i + 1])
            n_se = np.sum(self.mapp[x_min + j + 1:x_max, y_min:y_min + i + 1])
            # secondary split of the north half along x
            for k in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + k + 1, y_min + i + 1:y_max]) >= half_2 / 2:
                    break
            split_sec2 = (x_min + k + 1) * UNIT + self.root.n_box[0, 0]
            n_nw = np.sum(self.mapp[x_min:x_min + k + 1, y_min + i + 1:y_max])
            n_ne = np.sum(self.mapp[x_min + k + 1:x_max, y_min + i + 1:y_max])
            return (split_sec2, split_prm), (split_sec1, split_prm), (n_nw, n_ne, n_sw, n_se)


    def populate_synthetic_tree(self):
        """Count the real data points into the leaves, then perturb each leaf.

        Every column of ``self.realData`` is routed to a leaf with
        ``self.root.find_subnode`` and increments that leaf's count.  The tree
        is then walked breadth-first (children in nw, ne, sw, se order) and
        each leaf count receives noise drawn with ``0.5 * self.param.Eps``.
        """
        logging.debug('populating synthetic tree...')
        points = self.realData
        for col in range(points.shape[1]):
            target = self.root.find_subnode(points[0, col], points[1, col])
            target.n_count += 1

        # traverse the tree and update leaf counts; the visiting order fixes
        # which noise draw each leaf receives
        pending = deque([self.root])
        while pending:
            node = pending.popleft()
            if node.n_isLeaf is True:
                node.n_count += self.differ.getNoise(1, 0.5 * self.param.Eps)
                continue
            for child in (node.nw, node.ne, node.sw, node.se):
                pending.append(child)
Example #21
0
def test_privateMedian():
    """Benchmark six private-median mechanisms and write accuracy/time tables.

    For each level ``i`` in 0..9 the sorted data is cut into ``2 ** i``
    contiguous partitions.  For every seed in ``seed_list`` and every
    partition, six split methods of ``Differential`` are run and timed.
    Two files under ``res_dir`` get one line per level with six
    space-separated averages: ``privateMedian`` (normalised rank error,
    1.0 when the split falls on or outside the partition's end values) and
    ``privateMedian-time`` (seconds per call).
    """
    # time.clock() no longer exists on Python >= 3.8; use perf_counter there.
    timer = getattr(time, 'perf_counter', None) or time.clock

    # ``with`` guarantees both result files are closed even if a run fails.
    with open(res_dir + 'privateMedian', 'w') as f, \
            open(res_dir + 'privateMedian-time', 'w') as f_t:
        data = np.sort(dataGen.data_gen(dist, NDIM, LO, HI, NDATA)).flatten()
        n = len(data)

        for i in range(10):
            print('level ' + repr(i))
            n_parts = 2 ** i
            container = np.zeros(6)
            container_t = np.zeros(6)
            for seed in seed_list:
                perturber = Differential(seed)
                # (method, trailing arguments), in output-column order:
                # exponential mechanism, smooth sensitivity (2-approx),
                # exponential mechanism sampling, smooth sensitivity sampling,
                # noisy mean approximation, noisy grid approximation
                splitters = (
                    (perturber.getSplit_exp, (eps, 1)),
                    (perturber.getSplit_smooth, (eps, 1)),
                    (perturber.getSplit_exp, (eps, srt)),
                    (perturber.getSplit_smooth, (eps, srt)),
                    (perturber.getSplit_noisyMean, (eps,)),
                    (perturber.getSplit_grid, (eps, unit)),
                )
                for j in range(n_parts):
                    # floor division keeps the slice bounds integral
                    c_data = data[n * j // n_parts:n * (j + 1) // n_parts]
                    c_len = len(c_data)
                    lo, hi = c_data[0], c_data[-1]

                    res = []
                    for k, (split, extra) in enumerate(splitters):
                        start = timer()
                        res.append(split(c_data, lo, hi, *extra))
                        end = timer()
                        container_t[k] += end - start

                    r_m = float(c_len) / 2
                    for k in range(6):
                        if res[k] >= hi or res[k] <= lo:
                            container[k] += 1.0
                        else:
                            r_k = np.searchsorted(c_data, res[k])
                            container[k] += abs(r_m - r_k) / r_m

            runs = n_parts * len(seed_list)
            # float() keeps the written text a plain number regardless of how
            # the installed NumPy formats its scalar repr.
            for k in range(6):
                f.write(repr(float(container[k] / runs)) + ' ')
            f.write('\n')
            for k in range(6):
                f_t.write(repr(float(container_t[k] / runs)) + ' ')
            f_t.write('\n')
Example #22
0
class Hilbert(Kd_standard):
    """ Hilbert R-tree """

    def __init__(self, data, param):
        """Keep the raw data and parameters and create the root node with a full split budget."""
        self.param = param
        self.realData = data
        self.differ = Differential(param.Seed)

        root = KNode()
        root.n_budget = Params.maxHeight
        self.root = root

    def h_encode(self, x, y, r):
        """ (x,y) -> value h in Hilbert space, r is the resolution of the Hilbert curve """
        mask = (1 << r) - 1
        heven = x ^ y
        notx = ~x & mask
        noty = ~y & mask
        temp = notx ^ y
        v0, v1 = 0, 0
        for k in range(r - 1):
            v1 = ((v1 & heven) | ((v0 ^ noty) & temp)) >> 1
            v0 = ((v0 & (v1 ^ notx)) | (~v0 & (v1 ^ noty))) >> 1
        hodd = (~v0 & (v1 ^ x)) | (v0 & (v1 ^ noty))
        return self.interleaveBits(hodd, heven)

    def h_decode(self, h, r):
        """ h -> (x,y) """
        heven, hodd = self.deleaveBits(h)
        mask = (1 << r) - 1
        v0, v1 = 0, 0
        temp1 = ~(heven | hodd) & mask
        temp0 = ~(heven ^ hodd) & mask
        for k in range(r - 1):
            v1 = (v1 ^ temp1) >> 1
            v0 = (v0 ^ temp0) >> 1
        return (v0 & ~heven) ^ v1 ^ hodd, (v0 | heven) ^ v1 ^ hodd

    def interleaveBits(self, hodd, heven):
        """Interleave two integers bitwise: ``heven`` fills the even bit positions, ``hodd`` the odd ones."""
        # number of significant bits in the larger operand
        widest = max(hodd, heven)
        nbits = 0
        while widest > 0:
            widest >>= 1
            nbits += 1

        code = 0
        for pos in range(nbits):
            probe = 1 << pos
            if heven & probe:
                code += 1 << (2 * pos)
            if hodd & probe:
                code += 1 << (2 * pos + 1)
        return code

    def deleaveBitsOdd(self, x):
        """Compact the bits found at even positions of ``x`` (0, 2, 4, ...) into a contiguous integer."""
        x &= 0x5555555555555555
        # each pass folds neighbouring groups together, doubling the group width
        folding_passes = (
            (1, 0x3333333333333333),
            (2, 0x0F0F0F0F0F0F0F0F),
            (4, 0x00FF00FF00FF00FF),
            (8, 0x0000FFFF0000FFFF),
            (16, 0x00000000FFFFFFFF),
        )
        for shift, keep in folding_passes:
            x = (x | (x >> shift)) & keep
        return x

    def deleaveBits(self, x):
        return self.deleaveBitsOdd(x), self.deleaveBitsOdd(x >> 1)

    def get_Hcoord(self, x, y, R):
        """Scale a real-world point into integer cell coordinates on a ``2**R`` by ``2**R`` grid.

        The domain extent is padded by ``10 ** (-8)`` before dividing.
        """
        cells = 2 ** R
        extent_x = Params.HIGH[0] - Params.LOW[0] + 10 ** (-8)
        extent_y = Params.HIGH[1] - Params.LOW[1] + 10 ** (-8)
        hx = int((x - Params.LOW[0]) / extent_x * cells)
        hy = int((y - Params.LOW[1]) / extent_y * cells)
        return hx, hy

    def get_Rcoord(self, hx, hy, R):
        """Convert integer cell coordinates on the ``2**R`` grid back to real-world coordinates."""
        cells = 2 ** R
        width = Params.HIGH[0] - Params.LOW[0]
        height = Params.HIGH[1] - Params.LOW[1]
        x = float(hx) / cells * width + Params.LOW[0]
        y = float(hy) / cells * height + Params.LOW[1]
        return x, y

    def getCount(self, curr, epsilon):
        """Return the number of values in ``curr.n_data``, noisy unless ``epsilon`` is below 1e-6."""
        exact = len(curr.n_data)
        if epsilon < 10 ** (-6):
            return exact  # budget too small to be meaningful: report the true count
        return exact + self.differ.getNoise(1, epsilon)

    def testLeaf(self, curr):
        """Return True when ``curr`` should become a leaf.

        That is the case at depth ``Params.maxHeight``, when the node has no
        split budget left, or when its count is at most
        ``self.param.minPartSize``.
        """
        if curr.n_depth == Params.maxHeight:
            return True
        if curr.n_budget <= 0:
            return True
        if curr.n_count <= self.param.minPartSize:
            return True
        return False

    def buildIndex(self):
        """Build the Hilbert R-tree over ``self.realData``.

        Steps:
          1. encode every data point as a Hilbert value (resolution
             ``self.param.Res``) and sort the values;
          2. split nodes breadth-first on the sorted values, giving each node
             a noisy count from the per-level count budget; a leaf above the
             maximum depth is re-counted with the budget of the remaining
             levels;
          3. decode the values held by each leaf and set the leaf's bounding
             box in real coordinates;
          4. derive every internal node's box from its four children,
             deepest nodes first.
        """
        budget_c = self.getCountBudget()
        logging.debug('encoding coordinates...')
        RES = self.param.Res  # order of Hilbert curve
        ndata = self.realData.shape[1]
        hidx = np.zeros(ndata)
        for i in range(ndata):
            hx, hy = self.get_Hcoord(self.realData[0, i], self.realData[1, i], RES)
            hidx[i] = self.h_encode(hx, hy, RES)
        hidx = np.sort(hidx)

        logging.debug('building index...')
        self.root.n_data = hidx
        self.root.n_box = (0, 2 ** (2 * RES) - 1)
        self.root.n_count = self.getCount(self.root, budget_c[0])

        stack = deque([self.root])
        tree = [self.root]
        leaf_li = []  # storage of all leaves
        nleaf = 0  # leaf counter
        max_depth = -1

        while stack:
            curr = stack.popleft()
            if curr.n_depth > max_depth:
                max_depth = curr.n_depth
            if self.testLeaf(curr) is True:  # curr is a leaf node
                if curr.n_depth < Params.maxHeight:
                    # spend the budget of the levels that will never be built
                    remainingEps = sum(budget_c[curr.n_depth + 1:])
                    curr.n_count = self.getCount(curr, remainingEps)
                nleaf += 1
                curr.n_isLeaf = True
                leaf_li.append(curr)
                continue

            # curr needs to split
            curr.n_budget -= 1
            tmp = self.getCoordinates(curr)
            if tmp is False:
                # split failed: retry later; the decremented budget bounds the
                # number of retries because testLeaf stops at n_budget <= 0
                stack.append(curr)
                continue
            nw_node, ne_node, sw_node, se_node = KNode(), KNode(), KNode(), KNode()  # create sub-nodes
            split_prm, split_sec1, split_sec2, nw_node.n_data, ne_node.n_data, sw_node.n_data, se_node.n_data = tmp

            nw_node.n_box = (curr.n_box[0], split_sec1)
            ne_node.n_box = (split_sec1, split_prm)
            sw_node.n_box = (split_prm, split_sec2)
            se_node.n_box = (split_sec2, curr.n_box[1])

            for sub_node in [nw_node, ne_node, sw_node, se_node]:
                sub_node.n_depth = curr.n_depth + 1
                sub_node.n_count = self.getCount(sub_node, budget_c[sub_node.n_depth])
                sub_node.n_budget = curr.n_budget
                stack.append(sub_node)
                tree.append(sub_node)
            curr.n_data = None  # marks curr as an internal node for the box pass below
            curr.nw, curr.ne, curr.sw, curr.se = nw_node, ne_node, sw_node, se_node

        logging.debug("number of leaves: %d" % nleaf)
        logging.debug("max depth: %d" % max_depth)

        # # convert hilbert values in leaf nodes to real coordinates and update bounding box
        logging.debug('decoding and updating bounding box...')
        for leaf in leaf_li:
            # +/-inf sentinels work for any coordinate range (fixed +/-1000
            # sentinels silently corrupt boxes of data outside that range)
            bbox = np.array([[np.inf, np.inf], [-np.inf, -np.inf]], dtype='float64')
            for hvalue in leaf.n_data:
                hx, hy = self.h_decode(int(hvalue), RES)
                x, y = self.get_Rcoord(hx, hy, RES)
                bbox[0, 0] = x if x < bbox[0, 0] else bbox[0, 0]
                bbox[1, 0] = x if x > bbox[1, 0] else bbox[1, 0]
                bbox[0, 1] = y if y < bbox[0, 1] else bbox[0, 1]
                bbox[1, 1] = y if y > bbox[1, 1] else bbox[1, 1]
            leaf.n_box = bbox

        # # update bounding box bottom-up
        # Deepest nodes first so children are finished before their parent.
        # A stable key sort with reverse=True orders nodes exactly like the
        # cmp_node comparator and also runs on Python 3, where sorted() has
        # no ``cmp`` argument.
        tree = sorted(tree, key=lambda node: node.n_depth, reverse=True)
        logging.debug('updating box for each node in the tree...')
        for node in tree:
            if node.n_data is None:
                kids = (node.ne, node.nw, node.se, node.sw)
                node.n_box = np.zeros((2, 2))
                node.n_box[0, 0] = min(kid.n_box[0, 0] for kid in kids)
                node.n_box[0, 1] = min(kid.n_box[0, 1] for kid in kids)
                node.n_box[1, 0] = max(kid.n_box[1, 0] for kid in kids)
                node.n_box[1, 1] = max(kid.n_box[1, 1] for kid in kids)


    def cmp_node(self, node1, node2):
        """Comparator ordering nodes by decreasing depth (positive when ``node2`` is deeper)."""
        depth_gap = node2.n_depth - node1.n_depth
        return int(depth_gap)

    def getCoordinates(self, curr):
        """Split the sorted Hilbert values of ``curr`` into four consecutive runs.

        One split divides the node's values in two; each half is split again.
        Every split uses half of this level's split budget.

        Returns False when any of the three splits would leave an empty part,
        otherwise ``(split_1, split_sec1, split_sec2, nw_data, ne_data,
        sw_data, se_data)``.
        """
        half_budget = self.getSplitBudget()[curr.n_depth] / 2
        values = curr.n_data
        lower_bound, upper_bound = curr.n_box[0], curr.n_box[1]

        middle = self.getSplit(values, lower_bound, upper_bound, half_budget)
        cut = np.searchsorted(values, middle)
        if cut == 0 or cut == len(values):
            return False

        first_half = values[:cut]
        second_half = values[cut:]
        middle_first = self.getSplit(first_half, lower_bound, middle, half_budget)
        middle_second = self.getSplit(second_half, middle, upper_bound, half_budget)
        cut_first = np.searchsorted(first_half, middle_first)
        cut_second = np.searchsorted(second_half, middle_second)

        if cut_first == 0 or cut_first == len(first_half):
            return False
        if cut_second == 0 or cut_second == len(second_half):
            return False

        return (middle, middle_first, middle_second,
                first_half[:cut_first], first_half[cut_first:],
                second_half[:cut_second], second_half[cut_second:])
Example #23
0
class GenericT(object):
    """
    Generic data structure, used for grid
    """
    def __init__(self, data, param):
        """Store the parameters and create the root node holding all data over the whole domain."""
        self.param = param
        self.differ = Differential(param.Seed)

        # initialize the root
        root = NodeT()
        root.n_data = data
        root.n_box = np.array([param.LOW, param.HIGH])
        self.root = root

    def getEqualSplit(self, partitions, min, max):
        """Return ``partitions + 1`` equally spaced split points from ``min`` to ``max`` inclusive.

        With ``partitions <= 1`` only the two end points are returned.  An
        inverted range is merely logged at debug level.
        """
        lo, hi = min, max
        if lo > hi:
            logging.debug("getEqualSplit: Error: min > max")
        if partitions <= 1:
            return [lo, hi]

        width = hi - lo
        cuts = []
        for step in range(partitions + 1):
            cuts.append(lo + width * step / partitions)
        return cuts

    def getCountBudget(self):
        """Abstract hook: subclasses return the noisy-count budget for each index level."""
        raise NotImplementedError()

    def getCoordinates(self, curr):
        """Abstract hook: subclasses return the split granularity, the split points and the data of each sub-node."""
        raise NotImplementedError()

    def getCount(self, curr, epsilon):
        """
        Return the number of points held by ``curr``.

        The true count is returned when ``epsilon`` is below 1e-8; otherwise
        noise drawn with ``epsilon`` is added, so the result may be negative.
        A node without data counts as 0.
        """
        points = curr.n_data
        count = 0 if points is None else points.shape[1]

        if epsilon < 10**(-8):
            return count
        return count + self.differ.getNoise(1, epsilon)

    def intersect(self, hrect, query):
        """
        Return True when ``hrect`` and ``query`` overlap in every dimension.

        Both are arrays whose row 0 holds the lower corner and row 1 the upper
        corner.  Rectangles that merely touch along an edge do not intersect.
        """
        query_starts_after = query[0, :] >= hrect[1, :]
        query_ends_before = query[1, :] <= hrect[0, :]
        separated = np.logical_or(query_starts_after, query_ends_before)
        return not np.any(separated)

    def testLeaf(self, curr):
        """Return True when ``curr`` should be a leaf.

        That is the case at depth ``Params.maxHeightAdaptiveGrid``, when the
        node holds no data points, or when its count is at most
        ``self.param.minPartSize``.
        """
        if curr.n_depth == Params.maxHeightAdaptiveGrid:
            return True
        points = curr.n_data
        if points is None or points.shape[1] == 0:
            return True
        if curr.n_count <= self.param.minPartSize:
            return True
        return False

    def buildIndex(self):
        """Build the grid structure breadth-first, starting from the root.

        Every visited node first records its true point count in ``a_count``.
        Leaves (by ``testLeaf``, or because ``getCoordinates`` reports a
        granularity of 1) are counted with all the budget left from their
        depth downwards.  Other nodes are cut into ``gran x gran`` children,
        each counted with its own level's budget and queued.
        """
        budget_c = self.getCountBudget()  # an array with two elements

        def seal_leaf(cell):
            # spend everything left from this depth on the final count
            eps_left = sum(budget_c[cell.n_depth:])
            noisy = self.getCount(cell, eps_left)
            cell.n_count = noisy
            cell.eps = eps_left
            cell.n_isLeaf = True

        self.root.n_count = self.getCount(self.root, 0)  # epsilon 0: true count at the root
        pending = deque([self.root])
        # ## main loop
        while pending:
            cell = pending.popleft()
            cell.a_count.append(0 if cell.n_data is None else cell.n_data.shape[1])

            if self.testLeaf(cell) is True:
                seal_leaf(cell)
                cell.l_count.append(cell.n_count)
                continue

            # cell needs to split --> find splitting granularity
            gran, split_arr_x, split_arr_y, n_data_matrix = self.getCoordinates(cell)
            if gran == 1:
                # a single partition means the cell stays a leaf
                seal_leaf(cell)
                cell.children = None
                cell.l_count.append(cell.n_count)
                continue

            # add all nodes to queue
            for x in range(gran):
                for y in range(gran):
                    child = NodeT()
                    child.n_box = np.array(
                        [[split_arr_x[x], split_arr_y[y]],
                         [split_arr_x[x + 1], split_arr_y[y + 1]]])
                    child.index = x * gran + y
                    child.parent = cell
                    child.n_depth = cell.n_depth + 1

                    part = n_data_matrix[x][y]
                    child.n_data = None if part is None else np.transpose(part)

                    child.n_count = self.getCount(child, budget_c[child.n_depth])
                    child.eps = budget_c[child.n_depth]
                    if child.n_depth == 2:
                        child.n_isLeaf = True

                    if cell.children is None:
                        cell.children = np.ndarray(shape=(gran, gran), dtype=NodeT)
                    cell.children[x][y] = child
                    pending.append(child)

            cell.n_data = None  # ## do not need the data points coordinates now

    # canonical range query does apply
    def rangeCount(self, query):
        """
        Query answering function. Find the number of data points within a query rectangle.
        This function assume that the tree is constructed with noisy count for every node
        """
        queue = deque()
        queue.append(self.root)
        count = 0.0
        while len(queue) > 0:
            curr = queue.popleft()
            _box = curr.n_box
            if curr.n_isLeaf is True:
                frac = 1
                if self.intersect(_box, query):
                    for i in range(_box.shape[1]):
                        if _box[1, i] == _box[0,
                                              i] or Params.WorstCase == True:
                            frac *= 1
                        else:
                            frac *= (min(query[1, i], _box[1, i]) -
                                     max(query[0, i], _box[0, i])) / (
                                         _box[1, i] - _box[0, i])
                    count += curr.n_count * frac
            else:  # if not leaf

                for (_, _), node in np.ndenumerate(curr.children):
                    bool_matrix = np.zeros((2, query.shape[1]))
                    bool_matrix[0, :] = query[0, :] <= _box[0, :]
                    bool_matrix[1, :] = query[1, :] >= _box[1, :]

                    if np.all(
                            bool_matrix):  # if query range contains node range
                        count += node.n_count
                    elif self.intersect(_box, query):
                        queue.append(node)
        return float(count)

    def leafCover(self, loc):
        """
        find a leaf node that cover the location
        """
        gran_1st = len(self.root.children)
        x1 = min(gran_1st - 1, (loc[0] - self.root.n_box[0, 0]) * gran_1st /
                 (self.root.n_box[1, 0] - self.root.n_box[0, 0]))
        y1 = min(gran_1st - 1, (loc[1] - self.root.n_box[0, 1]) * gran_1st /
                 (self.root.n_box[1, 1] - self.root.n_box[0, 1]))

        node_1st = self.root.children[x1][y1]
        """
        Note that there are cases when the actual count of first level cell is zero but the noisy count is > 0, 
        thus the cell may be splited into a number of empty cells
        """
        if node_1st.n_isLeaf or node_1st.children is None:
            return node_1st
        else:
            gran_2st = len(node_1st.children)
            x2 = min(gran_2st - 1, (loc[0] - node_1st.n_box[0, 0]) * gran_2st /
                     (node_1st.n_box[1, 0] - node_1st.n_box[0, 0]))
            y2 = min(gran_2st - 1, (loc[1] - node_1st.n_box[0, 1]) * gran_2st /
                     (node_1st.n_box[1, 1] - node_1st.n_box[0, 1]))
            return node_1st.children[x2][y2]

    def checkCorrectness(self, node, nodePoints=None):
        """
        Total number of data points of all leaf nodes should equal to the total data points
        only check the FIRST time instance
        """
        totalPoints = 0
        if node is None:
            return 0
        if (node.n_isLeaf
                and node.n_data is not None) or node.children is None:
            return node.a_count[0]

        for (_, _), child in np.ndenumerate(node.children):
            totalPoints += self.checkCorrectness(child)

        if nodePoints is None:
            return totalPoints

        if totalPoints == nodePoints:
            return True
        return False
Example #24
0
def test_privateMedian():
    """
    Compare six private-median mechanisms on progressively smaller slices.

    For each level ``i`` in 0..9 the sorted data is cut into ``2**i``
    equal-sized slices.  Every mechanism is run on every slice for every seed
    in ``seed_list``; the normalised rank error of the returned split point
    and the elapsed time are averaged and written, one line per level, to
    ``res_dir + 'privateMedian'`` and ``res_dir + 'privateMedian-time'``.

    A split that falls on or outside the slice bounds counts as error 1.0.
    """
    # time.clock() no longer exists on Python >= 3.8; use perf_counter when
    # it is available and fall back to clock() on older interpreters.
    timer = getattr(time, 'perf_counter', None) or time.clock

    def timed(func, *args):
        # run one mechanism and report (result, elapsed seconds)
        start = timer()
        value = func(*args)
        return value, timer() - start

    # 'with' guarantees both result files are closed even if a mechanism fails
    with open(res_dir + 'privateMedian', 'w') as f, \
            open(res_dir + 'privateMedian-time', 'w') as f_t:
        data = np.sort(dataGen.data_gen(dist, NDIM, LO, HI, NDATA)).flatten()
        n = len(data)

        for i in range(10):
            print('level ' + repr(i))
            parts = 2**i
            container = np.zeros(6)
            container_t = np.zeros(6)
            for seed in seed_list:
                perturber = Differential(seed)
                for j in range(parts):
                    # floor division keeps the slice bounds integral
                    c_data = data[n * j // parts:n * (j + 1) // parts]
                    c_len = len(c_data)
                    lo, hi = c_data[0], c_data[-1]

                    # order matters: the mechanisms share perturber's state
                    mechanisms = [
                        # exponential mechanism
                        (perturber.getSplit_exp, (c_data, lo, hi, eps, 1)),
                        # smooth sensitivity (2-approx)
                        (perturber.getSplit_smooth, (c_data, lo, hi, eps, 1)),
                        # exponential mechanism sampling
                        (perturber.getSplit_exp, (c_data, lo, hi, eps, srt)),
                        # smooth sensitivity sampling
                        (perturber.getSplit_smooth,
                         (c_data, lo, hi, eps, srt)),
                        # noisy mean approximation
                        (perturber.getSplit_noisyMean, (c_data, lo, hi, eps)),
                        # noisy grid approximation
                        (perturber.getSplit_grid, (c_data, lo, hi, eps, unit)),
                    ]
                    res = []
                    for k, (mechanism, args) in enumerate(mechanisms):
                        split, elapsed = timed(mechanism, *args)
                        container_t[k] += elapsed
                        res.append(split)

                    for k in range(6):
                        if res[k] >= hi or res[k] <= lo:
                            container[k] += 1.0
                        else:
                            r_k = np.searchsorted(c_data, res[k])
                            r_m = float(c_len) / 2
                            container[k] += abs(r_m - r_k) / r_m

            runs = parts * len(seed_list)
            for k in range(6):
                f.write(repr(container[k] / runs) + ' ')
            f.write('\n')
            for k in range(6):
                f_t.write(repr(container_t[k] / runs) + ' ')
            f_t.write('\n')
Example #25
0
# Sampling script fragment: prints square roots of Gaussian samples, then sets
# up parameters and a bounding box for a location-sampling loop.
# norm.stats(loc=mu, scale=sigma, moments="mv")

# mean and standard deviation (sqrt of the variance) of the sampled quantity
mu, sigma = 4.26246819779, math.sqrt(3.68211892668)

# NOTE(review): random.gauss can return a negative value, for which math.sqrt
# raises ValueError -- confirm whether the samples should be clamped.
for i in range(200):
    print math.sqrt(random.gauss(mu, sigma))
    # print norm.pdf(loc=mu, scale=sigma)

# for rd in np.arange(1000, 3001, 50):
#     x2 = (rd/1000.0)**2
#     print "%.1f\t%s" % (x2, norm.pdf(x2, loc=mu, scale=sigma))

p = Params(1000)
p.select_dataset()
dp = Differential(p.seed)

# Randomly picking location in a small MBR of tdrive dataset.
minLat, maxLat = 39.1232147, 40.7225952
minLon, maxLon = 115.3879166, 117.3795395

# (disabled) shrink the MBR to 5% of its extent in each dimension
# diffLat = maxLat - minLat
# diffLon = maxLon - minLon
#
# maxLat = maxLat - 0.95*diffLat
# maxLon = maxLon - 0.95*diffLon

samples = 200000  # sample size
d2_list = []
# NOTE(review): the body of this loop is missing from the excerpt.
for i in range(samples):
    # First point
Example #26
0
class Kd_cell(Kd_pure):
    """ Kd tree based on syntatic data generation and a grid structure. See
    Y. Xiao, L. Xiong, and C. Yuan, Differentially private data release
    through multidimensional partitioning, in SDM Workshop, VLDB, 2010
    """
    def __init__(self, data, param):
        """
        Store the real data and parameters and create an empty root node.

        The parent constructor is not called here.  The root's bounding box
        and the noisy count grid ``mapp`` start as ``None``.

        :param data: the real data points
        :param param: parameter object; its ``Seed`` seeds the noise generator
        """
        self.param = param
        self.realData = data
        self.mapp = None
        self.differ = Differential(param.Seed)

        root = KNode()
        root.n_box = None
        root.n_budget = Params.maxHeight
        self.root = root

    def getCountBudget(self):
        """
        Split half of the privacy budget over the ``H + 1`` tree levels.

        ``param.geoBudget`` selects the scheme: ``'none'`` gives every level
        an equal share; ``'aggressive'``, ``'quadratic'``, ``'optimal'`` and
        ``'quartic'`` grow the share geometrically per level with ratio 2,
        2**(1/2), 2**(1/3) and 2**(1/4) respectively.  An unknown scheme is
        logged and terminates the process with exit status 1.

        :return: list of ``H + 1`` per-level budgets
        """
        count_eps = self.param.Eps * 0.5
        H = Params.maxHeight
        scheme = self.param.geoBudget
        n_levels = H + 1

        if scheme == 'none':
            return [count_eps / (H + 1) for _ in range(n_levels)]

        if scheme == 'aggressive':
            unit = count_eps / (2**(H + 1) - 1)
            return [unit * 2**i for i in range(n_levels)]

        if scheme == 'quadratic':
            unit = count_eps * (np.sqrt(2) - 1) / (2**(0.5 * (H + 1)) - 1)
            return [unit * 2**(0.5 * i) for i in range(n_levels)]

        if scheme == 'optimal':
            unit = count_eps * ((2**(1.0 / 3)) - 1) / (2**((1.0 / 3) *
                                                           (H + 1)) - 1)
            return [unit * 2**((1.0 / 3) * i) for i in range(n_levels)]

        if scheme == 'quartic':
            unit = count_eps * ((2**(1.0 / 4)) - 1) / (2**((1.0 / 4) *
                                                           (H + 1)) - 1)
            return [unit * 2**((1.0 / 4) * i) for i in range(n_levels)]

        logging.error('No such geoBudget scheme')
        sys.exit(1)

    def synthetic_gen(self):
        """
        Lay a uniform grid over the domain and build a noisy count map.

        The domain bounds are snapped outwards to multiples of
        ``Params.unitGrid``; the snapped box becomes the root's ``n_box``.
        Each of the first ``Params.NDATA`` points is counted in its grid
        cell, noise drawn with half of the privacy budget (rounded to the
        nearest integer) is added to every cell, and negative noisy counts
        are replaced by 0.  The result is stored in ``self.mapp``.
        """
        logging.debug('generating synthetic map...')
        data = self.realData
        unit = Params.unitGrid
        x_min = np.floor(Params.LOW[0] / unit) * unit
        x_max = np.ceil(Params.HIGH[0] / unit) * unit
        y_min = np.floor(Params.LOW[1] / unit) * unit
        y_max = np.ceil(Params.HIGH[1] / unit) * unit

        x_CELL = int(np.rint((x_max - x_min) / unit))
        y_CELL = int(np.rint((y_max - y_min) / unit))

        self.root.n_box = np.array([[x_min, y_min], [x_max, y_max]])

        self.mapp = np.zeros((x_CELL, y_CELL))
        for i in range(Params.NDATA):  # populate the map
            point = data[:, i]
            cell_x = int(np.floor((point[0] - x_min) / unit))
            cell_y = int(np.floor((point[1] - y_min) / unit))
            # A point lying exactly on the upper border would index one past
            # the grid, and one below the lower border would wrap around to
            # the opposite side; clamp both to the nearest border cell.
            cell_x = min(max(cell_x, 0), x_CELL - 1)
            cell_y = min(max(cell_y, 0), y_CELL - 1)
            self.mapp[cell_x, cell_y] += 1

        # Perturb the counts; noise is drawn cell by cell in row-major order
        # so the sequence of draws from self.differ is unchanged.
        for i in range(x_CELL):
            for j in range(y_CELL):
                self.mapp[i, j] += np.rint(
                    self.differ.getNoise(1, 0.5 * self.param.Eps))

        # if a noisy count is negative, ignore the noise and generate no points
        self.mapp[self.mapp < 0] = 0

    def cell_setLeaf(self, curr):
        """Reset a leaf's count to 0, discarding the synthetic-data count."""
        curr.n_count = 0

    def testLeaf(self, curr):
        """
        Decide whether ``curr`` should stop splitting.

        A node is a leaf when its count is at most ``param.minPartSize``,
        when it sits at ``Params.maxHeight``, or when ``uniform_test`` says
        its cells are uniform within ``param.cellDistance``.

        :return: ``True`` for a leaf, ``False`` otherwise
        """
        if curr.n_count <= self.param.minPartSize:
            return True
        if curr.n_depth == Params.maxHeight:
            return True
        if self.uniform_test(curr, self.param.cellDistance):
            return True
        return False

    def uniform_test(self, curr, distance):
        """
        Stopping condition: is the node uniform within ``distance``?

        The node's box is mapped to grid-cell indices relative to the root
        box, and the summed absolute deviation of the covered cells from
        their mean is compared with the threshold.

        :param curr: node whose ``n_box`` is examined
        :param distance: maximum allowed total absolute deviation
        :return: ``False`` when the deviation exceeds ``distance``, else ``True``
        """
        unit = Params.unitGrid
        origin = self.root.n_box
        box = curr.n_box

        col_lo = int(np.rint((box[0, 0] - origin[0, 0]) / unit))
        col_hi = int(np.rint((box[1, 0] - origin[0, 0]) / unit))
        row_lo = int(np.rint((box[0, 1] - origin[0, 1]) / unit))
        row_hi = int(np.rint((box[1, 1] - origin[0, 1]) / unit))

        cells = self.mapp[col_lo:col_hi, row_lo:row_hi]
        mean = np.sum(cells) / ((col_hi - col_lo) * (row_hi - row_lo))
        deviation = np.sum(np.abs(cells - mean))
        return not deviation > distance

    def buildIndex(self):
        """
        Build the tree breadth-first from the noisy count map.

        Starting from the root (whose count is the sum of ``self.mapp``),
        every node that fails ``testLeaf`` is split into four children
        (nw, ne, sw, se) using the split points and counts returned by
        ``getCoordinates``.  Leaves are passed to ``cell_setLeaf``.  The
        number of leaves and the maximum depth are logged at debug level.
        """
        pending = deque([self.root])
        leaf_total = 0
        deepest = -1
        self.root.n_count = np.sum(self.mapp)

        while pending:
            curr = pending.popleft()
            deepest = max(deepest, curr.n_depth)

            if self.testLeaf(curr) is True:  # curr is a leaf node
                leaf_total += 1
                curr.n_isLeaf = True
                self.cell_setLeaf(curr)
                continue

            # curr needs to split
            curr.n_budget -= 1
            nw_coord, ne_coord, counts = self.getCoordinates(curr)
            x_nw, y_nw = nw_coord
            x_se, y_se = ne_coord

            box = curr.n_box
            x_lo, y_lo = box[0, 0], box[0, 1]
            x_hi, y_hi = box[1, 0], box[1, 1]
            # child boxes in the same order as the counts: nw, ne, sw, se
            child_boxes = (
                [[x_lo, y_nw], [x_nw, y_hi]],
                [[x_nw, y_se], [x_hi, y_hi]],
                [[x_lo, y_lo], [x_se, y_nw]],
                [[x_se, y_lo], [x_hi, y_se]],
            )

            children = []
            for idx, child_box in enumerate(child_boxes):
                child = KNode()
                child.n_box = np.array(child_box)
                child.n_depth = curr.n_depth + 1
                child.n_count = counts[idx]
                child.n_budget = curr.n_budget
                pending.append(child)
                children.append(child)
            curr.nw, curr.ne, curr.sw, curr.se = children

        logging.debug("number of leaves: %d" % leaf_total)
        logging.debug("max depth: %d" % deepest)

    def getCoordinates(self, curr):
        """
        Compute the split points and child counts for ``curr``.

        The node's box is mapped to grid-cell indices relative to the root
        box.  The node is first cut along the primary dimension
        (``n_depth % Params.NDIM``) at the first grid line where the
        cumulative noisy count reaches half of the total; each of the two
        halves is then cut the same way along the other dimension.

        :return: ``(nw_coord, ne_coord, (n_nw, n_ne, n_sw, n_se))`` where the
            two coordinate pairs are ``(x, y)`` split points

        NOTE(review): if the box spans zero grid cells along an axis, the
        corresponding loop never runs and its loop variable is unbound when
        it is used afterwards -- confirm callers never pass such a box.
        """
        dim_1 = curr.n_depth % Params.NDIM  # primary split dimension
        UNIT = Params.unitGrid
        # grid-cell index range covered by curr's box
        x_min = int(np.rint((curr.n_box[0, 0] - self.root.n_box[0, 0]) / UNIT))
        x_max = int(np.rint((curr.n_box[1, 0] - self.root.n_box[0, 0]) / UNIT))
        y_min = int(np.rint((curr.n_box[0, 1] - self.root.n_box[0, 1]) / UNIT))
        y_max = int(np.rint((curr.n_box[1, 1] - self.root.n_box[0, 1]) / UNIT))

        total = np.sum(self.mapp[x_min:x_max, y_min:y_max])
        if dim_1 == 0:
            # primary split along x: smallest i whose left part holds >= half
            for i in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + i + 1,
                                    y_min:y_max]) >= total / 2:
                    break
            split_prm = (x_min + i + 1) * UNIT + self.root.n_box[0, 0]

            half_1 = np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_max])
            half_2 = np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_max])
            # secondary split of the left half along y
            for j in range(y_max - y_min):
                if np.sum(self.mapp[x_min:x_min + i + 1,
                                    y_min:y_min + j + 1]) >= half_1 / 2:
                    break
            split_sec1 = self.root.n_box[0, 1] + (y_min + j + 1) * UNIT
            n_sw = np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_min + j + 1])
            n_nw = np.sum(self.mapp[x_min:x_min + i + 1, y_min + j + 1:y_max])
            # secondary split of the right half along y
            for k in range(y_max - y_min):
                if np.sum(self.mapp[x_min + i + 1:x_max,
                                    y_min:y_min + k + 1]) >= half_2 / 2:
                    break
            split_sec2 = self.root.n_box[0, 1] + (y_min + k + 1) * UNIT
            n_se = np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_min + k + 1])
            n_ne = np.sum(self.mapp[x_min + i + 1:x_max, y_min + k + 1:y_max])
            return (split_prm, split_sec1), (split_prm,
                                             split_sec2), (n_nw, n_ne, n_sw,
                                                           n_se)

        else:
            # primary split along y: smallest i whose lower part holds >= half
            for i in range(y_max - y_min):
                if np.sum(self.mapp[x_min:x_max,
                                    y_min:y_min + i + 1]) >= total / 2:
                    break
            split_prm = self.root.n_box[0, 1] + (y_min + i + 1) * UNIT

            half_1 = np.sum(self.mapp[x_min:x_max, y_min:y_min + i + 1])
            half_2 = np.sum(self.mapp[x_min:x_max, y_min + i + 1:y_max])
            # secondary split of the lower half along x
            for j in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + j + 1,
                                    y_min:y_min + i + 1]) >= half_1 / 2:
                    break
            split_sec1 = (x_min + j + 1) * UNIT + self.root.n_box[0, 0]
            n_sw = np.sum(self.mapp[x_min:x_min + j + 1, y_min:y_min + i + 1])
            n_se = np.sum(self.mapp[x_min + j + 1:x_max, y_min:y_min + i + 1])
            # secondary split of the upper half along x
            for k in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + k + 1,
                                    y_min + i + 1:y_max]) >= half_2 / 2:
                    break
            split_sec2 = (x_min + k + 1) * UNIT + self.root.n_box[0, 0]
            n_nw = np.sum(self.mapp[x_min:x_min + k + 1, y_min + i + 1:y_max])
            n_ne = np.sum(self.mapp[x_min + k + 1:x_max, y_min + i + 1:y_max])
            return (split_sec2, split_prm), (split_sec1,
                                             split_prm), (n_nw, n_ne, n_sw,
                                                          n_se)

    def populate_synthetic_tree(self):
        """
        Count the real data points in the leaves, then perturb the counts.

        Each real point increments the count of the leaf returned by
        ``root.find_subnode``; afterwards the tree is walked breadth-first
        and noise drawn with half of the privacy budget is added to every
        leaf count.
        """
        logging.debug('populating synthetic tree...')
        points = self.realData
        for col in range(points.shape[1]):
            leaf = self.root.find_subnode(points[0, col], points[1, col])
            leaf.n_count += 1

        # traverse the tree and add noise to the leaf counts
        pending = deque([self.root])
        while pending:
            node = pending.popleft()
            if node.n_isLeaf is True:
                node.n_count += self.differ.getNoise(1, 0.5 * self.param.Eps)
            else:
                pending.extend((node.nw, node.ne, node.sw, node.se))
Example #27
0
class GenericT(object):
    """
    Generic data structure, used for grid
    """

    def __init__(self, data, param):
        """
        Create the structure with a root node holding all the data.

        :param data: data points stored on the root node
        :param param: parameter object providing ``Seed``, ``LOW`` and ``HIGH``
        """
        self.param = param
        self.differ = Differential(param.Seed)

        # the root covers the whole domain [LOW, HIGH] and owns every point
        root = NodeT()
        root.n_data = data
        root.n_box = np.array([param.LOW, param.HIGH])
        self.root = root

    def getEqualSplit(self, partitions, min, max):
        """
        Return ``partitions + 1`` equally spaced split points, ends included.

        With ``partitions <= 1`` only the two end points are returned.  A
        reversed range (``min > max``) is logged at debug level but still
        processed.
        """
        if min > max:
            logging.debug("getEqualSplit: Error: min > max")
        if partitions <= 1:
            return [min, max]

        span = max - min
        points = []
        for step in range(partitions + 1):
            points.append(min + span * step / partitions)
        return points

    def getCountBudget(self):
        """
        Return the noisy-count budget for each level of the index.

        Abstract hook: this base implementation always raises
        ``NotImplementedError``.
        """
        raise NotImplementedError()

    def getCoordinates(self, curr):
        """
        Return the split granularity, split points and per-cell data.

        Abstract hook: this base implementation always raises
        ``NotImplementedError``.
        """
        raise NotImplementedError()

    def getCount(self, curr, epsilon):
        """
        return true count or noisy count of a node, depending on epsilon.
        Note that the noisy count can be negative
        """
        if curr.n_data is None:
            count = 0
        else:
            count = curr.n_data.shape[1]

        if epsilon < 10 ** (-8):
            return count
        else:
            return count + self.differ.getNoise(1, epsilon)


    def intersect(self, hrect, query):
        """
        Tell whether two hyper-rectangles overlap in every dimension.

        Both arguments are 2 x d arrays (row 0 = lower corner, row 1 = upper
        corner).  Rectangles that merely touch along a border do not count
        as intersecting.

        :return: ``True`` if they overlap, ``False`` otherwise
        """
        query_past_upper = query[0, :] >= hrect[1, :]
        query_before_lower = query[1, :] <= hrect[0, :]
        separated = np.logical_or(query_past_upper, query_before_lower)
        return not np.any(separated)

    def testLeaf(self, curr):
        """
        Test whether a node should be a leaf node.

        A node is a leaf when it sits at ``Params.maxHeightAdaptiveGrid``,
        holds no data points, or has a count of at most
        ``param.minPartSize``.
        """
        if curr.n_depth == Params.maxHeightAdaptiveGrid:
            return True
        if curr.n_data is None or curr.n_data.shape[1] == 0:
            return True
        if curr.n_count <= self.param.minPartSize:
            return True
        return False

    def buildIndex(self):
        """
        Build the grid structure breadth-first.

        Every visited node records its actual point count in ``a_count``.
        A node that passes ``testLeaf``, or whose split granularity is 1, is
        sealed as a leaf and receives a count perturbed with all the budget
        remaining from its level downwards.  Any other node is split into a
        ``gran x gran`` grid of children, each given a count perturbed with
        its level's budget, and then drops its own data points.
        """
        budget_c = self.getCountBudget()  # per-level count budgets

        def seal_leaf(node, drop_children=False):
            # spend every budget share from this level down on the leaf count
            remaining = sum(budget_c[node.n_depth:])
            noisy = self.getCount(node, remaining)
            node.n_count, node.eps, node.n_isLeaf = noisy, remaining, True
            if drop_children:
                node.children = None
            node.l_count.append(node.n_count)

        # epsilon 0 --> the root keeps its true count
        self.root.n_count = self.getCount(self.root, 0)
        queue = deque([self.root])

        while queue:
            curr = queue.popleft()
            actual = 0 if curr.n_data is None else curr.n_data.shape[1]
            curr.a_count.append(actual)

            if self.testLeaf(curr) is True:
                seal_leaf(curr)
                continue

            # curr needs to split --> find splitting granularity
            gran, split_arr_x, split_arr_y, n_data_matrix = self.getCoordinates(curr)
            if gran == 1:
                # a 1 x 1 split changes nothing: the cell stays a leaf
                seal_leaf(curr, drop_children=True)
                continue

            for x in range(gran):
                for y in range(gran):
                    child = NodeT()
                    child.n_box = np.array(
                        [[split_arr_x[x], split_arr_y[y]], [split_arr_x[x + 1], split_arr_y[y + 1]]])
                    child.index, child.parent, child.n_depth = x * gran + y, curr, curr.n_depth + 1

                    cell_data = n_data_matrix[x][y]
                    child.n_data = None if cell_data is None else np.transpose(cell_data)

                    child.n_count = self.getCount(child, budget_c[child.n_depth])
                    child.eps = budget_c[child.n_depth]
                    if child.n_depth == 2:
                        child.n_isLeaf = True

                    if curr.children is None:
                        curr.children = np.ndarray(shape=(gran, gran), dtype=NodeT)
                    curr.children[x][y] = child
                    queue.append(child)

            curr.n_data = None  # the point coordinates are not needed any more


    # canonical range query does apply
    def rangeCount(self, query):
        """
        Query answering function. Find the number of data points within a query rectangle.
        This function assume that the tree is constructed with noisy count for every node
        """
        queue = deque()
        queue.append(self.root)
        count = 0.0
        while len(queue) > 0:
            curr = queue.popleft()
            _box = curr.n_box
            if curr.n_isLeaf is True:
                frac = 1
                if self.intersect(_box, query):
                    for i in range(_box.shape[1]):
                        if _box[1, i] == _box[0, i] or Params.WorstCase == True:
                            frac *= 1
                        else:
                            frac *= (min(query[1, i], _box[1, i]) - max(query[0, i], _box[0, i])) / (
                                _box[1, i] - _box[0, i])
                    count += curr.n_count * frac
            else:  # if not leaf

                for (_, _), node in np.ndenumerate(curr.children):
                    bool_matrix = np.zeros((2, query.shape[1]))
                    bool_matrix[0, :] = query[0, :] <= _box[0, :]
                    bool_matrix[1, :] = query[1, :] >= _box[1, :]

                    if np.all(bool_matrix):  # if query range contains node range
                        count += node.n_count
                    elif self.intersect(_box, query):
                        queue.append(node)
        return float(count)


    def leafCover(self, loc):
        """
        find a leaf node that cover the location
        """
        gran_1st = len(self.root.children)
        x1 = min(gran_1st - 1,
                 (loc[0] - self.root.n_box[0, 0]) * gran_1st / (self.root.n_box[1, 0] - self.root.n_box[0, 0]))
        y1 = min(gran_1st - 1,
                 (loc[1] - self.root.n_box[0, 1]) * gran_1st / (self.root.n_box[1, 1] - self.root.n_box[0, 1]))

        node_1st = self.root.children[x1][y1]
        """
        Note that there are cases when the actual count of first level cell is zero but the noisy count is > 0, 
        thus the cell may be splited into a number of empty cells
        """
        if node_1st.n_isLeaf or node_1st.children is None:
            return node_1st
        else:
            gran_2st = len(node_1st.children)
            x2 = min(gran_2st - 1,
                     (loc[0] - node_1st.n_box[0, 0]) * gran_2st / (node_1st.n_box[1, 0] - node_1st.n_box[0, 0]))
            y2 = min(gran_2st - 1,
                     (loc[1] - node_1st.n_box[0, 1]) * gran_2st / (node_1st.n_box[1, 1] - node_1st.n_box[0, 1]))
            return node_1st.children[x2][y2]


    def checkCorrectness(self, node, nodePoints=None):
        """
        Total number of data points of all leaf nodes should equal to the total data points
        only check the FIRST time instance
        """
        totalPoints = 0
        if node is None:
            return 0
        if (node.n_isLeaf and node.n_data is not None) or node.children is None:
            return node.a_count[0]

        for (_, _), child in np.ndenumerate(node.children):
            totalPoints += self.checkCorrectness(child)

        if nodePoints is None:
            return totalPoints

        if totalPoints == nodePoints:
            return True
        return False
Example #28
0
        r, theta = [
            math.sqrt(random.uniform(0, 1)) * math.sqrt(1),
            2 * math.pi * random.uniform(0, 1)
        ]
        y = [math.cos(theta) * r, math.sin(theta) * r]
        d = dist(x, y)
        total += d

    print("Expected dist: ", total / N)
"""
Simulated dataset
"""
# Experiment set-up: parameters, reachable distance and noise generator.
p = Params(1000)
p.select_dataset()
reachable_range = Utils.reachableDistance()
dp = Differential(p.seed)

# Randomly picking location in a small MBR of tdrive dataset.
minLat, maxLat = 39.1232147, 40.7225952
minLon, maxLon = 115.3879166, 117.3795395

diffLat = maxLat - minLat
diffLon = maxLon - minLon

# shrink the MBR to 5% of its original extent, anchored at the minimum corner
maxLat = maxLat - 0.95 * diffLat
maxLon = maxLon - 0.95 * diffLon

# print ("diagonal dist: ", Utils.distance(minLat, minLon, maxLat, maxLon))


def probs_from_sampling(samples, step, d_prime_values, d_matches_values):
class KalmanFilterPID(Parser):
    """ generated source for class KalmanFilterPID """

    # sampling rate
    def __init__(self, param):
        """
        Set up the filter with its default Kalman and PID parameters.

        :param param: parameter object; its ``Seed`` seeds the noise generator
        """
        Parser.__init__(self)

        self.param = param
        self.differ = Differential(param.Seed)

        self.predict, self.interval = [], None

        # Kalman filter parameters
        self.P = 100      # estimation error covariance (over all time instances)
        self.Q = 1000     # process noise (synthetic data)
        self.R = 1000000  # measurement noise, optimal for alpha = 1 (synthetic data)
        self.K = 0        # Kalman gain

        # PID control parameters - defaults
        self.Cp = 0.9  # proportional gain, keeps output proportional to current error
        self.Ci = 0.1  # integral gain, eliminates offset
        self.Cd = 0.0  # derivative gain, ensures stability (prevents large future error)

        # fixed internally
        self.theta = 1     # magnitude of changes
        self.xi = 0.2      # gamma (10%)
        self.minIntvl = 1  # make sure the interval is greater than 1

        self.windowPID = 5  # I (integration) window
        self.ratioM = 0.2   # sampling rate

        self.isSampling = False

    def adjustParams(self):
        """
        Pick ``theta`` from the sampling rate ``ratioM``.

        ``theta`` is 20, 14, 2, 0.5 and 0.3 for rates below 0.1, 0.2, 0.3,
        0.4 and 0.5 respectively, and 0.1 for rates of 0.5 and above.  A
        rate that matches no band leaves ``theta`` untouched.
        """
        ratio = self.ratioM
        # (exclusive upper bound of the band, theta for that band)
        bands = ((0.1, 20), (0.2, 14), (0.3, 2), (0.4, 0.5), (0.5, 0.3))
        for upper, theta in bands:
            if ratio < upper:
                self.theta = theta
                return
        if 0.5 <= ratio:
            self.theta = 0.1

    # test
    @classmethod
    def main(self, args):
        """
        Command-line entry point.

        ``args`` follows the ``sys.argv`` layout: script name, input file,
        output file, privacy budget, process variance and, optionally, the
        PID gains Cp, Ci and Cd.  The gains are capped so that
        ``Cp + Ci + Cd`` never exceeds 1; missing gains take the remainder.
        The published series is written to the output file and a short
        summary is printed.

        Exits through ``sys.exit()`` on too few arguments or a non-positive
        budget / process variance.
        """
        if len(args) < 5:
            print("Usage: python KalmanFilterPID.py input output privacy-budget process-variance Cp(optional) Ci(optional) Cd(optional)")
            sys.exit()

        output = open(args[2], "w")
        try:
            # NOTE(review): eval() on a command-line argument executes
            # arbitrary code; kept because callers may pass an expression,
            # but consider float() if plain numbers are the only valid input.
            budget = eval(args[3])
            Q = float(args[4])
            if budget <= 0 or Q <= 0:
                print("Usage: privacy-budget AND process-variance are positive values")
                sys.exit()

            p = Params(1000)
            kfPID = KalmanFilterPID(p)
            kfPID.setTotalBudget(budget)
            kfPID.setQ(Q)

            kfPID.orig = Parser.getData(args[1])

            kfPID.publish = [None] * len(kfPID.orig)

            # adjust R based on T and alpha
            kfPID.setR(len(kfPID.orig) * len(kfPID.orig) / (0.0 + budget * budget))

            # Optional control gains.  The arguments are strings and must be
            # converted before they are compared with or subtracted from
            # numbers.
            if len(args) >= 6:
                d = float(args[5])
                if d > 1:
                    d = 1
                kfPID.setCp(d)

            if len(args) >= 7:
                d = float(args[6])
                if d + kfPID.Cp > 1:
                    d = 1 - kfPID.Cp
                kfPID.setCi(d)
            else:
                kfPID.setCi(1 - kfPID.Cp)

            if len(args) >= 8:
                d = float(args[7])
                if d + kfPID.Cp + kfPID.Ci > 1:
                    d = 1 - kfPID.Cp - kfPID.Ci
                kfPID.setCd(d)
            else:
                kfPID.setCd(1 - kfPID.Cp - kfPID.Ci)

            # kfPID.adjustParams()

            start = time.time()
            kfPID.publishCounts()
            end = time.time()

            Parser.outputData(output, kfPID.publish)
        finally:
            # release the output file on every path, including sys.exit()
            output.close()

        print("Method:\tKalman Filter with Adaptive Sampling")
        print("Data Series Length:\t" + str(len(kfPID.orig)))
        print("Queries Issued:\t" + str(kfPID.query.count(1)))
        print("Privacy Budget Used:\t" + str(
            kfPID.query.count(1) * kfPID.epsilon))
        print("Average Relative Error:\t" + str(kfPID.getRelError()))
        print("Time Used (in second):\t" + str(end - start))

    def kalmanFilter(self, orig, budget, samplingRate=None):
        """
        Publish a filtered version of a series under a privacy budget.

        :param orig: the original series
        :param budget: total privacy budget
        :param samplingRate: if given, enables sampling and is stored as
            ``ratioM``; otherwise sampling is disabled
        :return: the published series (``self.publish``)
        """
        self.totalBudget = budget
        self.orig = orig

        self.isSampling = samplingRate is not None
        if self.isSampling:
            self.ratioM = samplingRate

        # self.adjustParams()

        length = len(self.orig)
        self.publish = [None] * length

        # adjust R based on T and alpha
        self.setR(length * length / (0.0 + budget * budget))

        self.publishCounts()
        return self.publish

    def getCount(self, value, epsilon):
        """
        Return ``value`` itself or a noisy version of it.

        When ``epsilon`` is below 1e-8 the value is returned unchanged;
        otherwise noise drawn with sensitivity 1 is added, so the result can
        be negative.
        """
        noise_free = epsilon < 10**(-8)
        if noise_free:
            return value
        return value + self.differ.getNoise(1, epsilon)  # sensitivity is 1

    # data publication procedure
    def publishCounts(self):
        """
        Fill ``self.publish`` with a noisy/predicted value per time instance.

        The total budget is divided evenly over ``M`` queries (``M`` is
        ``ratioM * len(orig)`` when sampling, otherwise every instance).
        Instance 0 is always queried.  Afterwards an instance is queried
        while fewer than ``windowPID`` queries have been issued, or when it
        is the scheduled ``nextQuery``; in both cases only while fewer than
        ``M`` queries have been spent.  Queried values are corrected with
        the Kalman filter; all other instances publish the prediction.
        When sampling, the gap to the next query is adapted from the PID
        feedback error.
        """

        # query[i] == 1 marks the instances that were actually queried
        self.query = BitArray(len(self.orig))
        self.predict = [None] * len(self.orig)

        # recalculate individual budget based on M
        if (self.isSampling):
            M = int(self.ratioM * (len(self.orig)))  # 0.25 optimal percentile
        else:
            M = len(self.orig)

        # guard against a zero query count before dividing the budget
        if M <= 0:
            M = 1
        self.epsilon = (self.totalBudget + 0.0) / M

        # error = 0
        self.interval = 1
        # first scheduled sampling point after the initial window
        nextQuery = max(1, self.windowPID) + self.interval - 1

        for i in range(len(self.orig)):
            if i == 0:
                # the first time instance
                self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                self.query[i] = 1
                self.correctKF(i, 0)
            else:
                predct = self.predictKF(i)
                self.predict[i] = predct
                if self.query.count(1) < self.windowPID and self.query.count(
                        1) < M:
                    # still inside the initial window: query every instance

                    self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                    self.query[i] = 1

                    # update count using observation
                    self.correctKF(i, predct)
                elif i == nextQuery and self.query.count(1) < M:
                    # if i is the sampling point

                    # query
                    self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                    self.query[i] = 1

                    # update count using observation
                    self.correctKF(i, predct)

                    # update freq
                    if (self.isSampling):
                        ratio = self.PID(i)
                        # cap the exponent so math.exp stays bounded
                        frac = min(20, (ratio - self.xi) / self.xi)
                        deltaI = self.theta * (1 - math.exp(frac))
                        # randomized rounding: add 1 with probability equal
                        # to the fractional part left after int() truncation
                        deltaI = int(deltaI) + (random.random() <
                                                deltaI - int(deltaI))
                        self.interval += deltaI
                    else:
                        self.interval = 1

                    if self.interval < self.minIntvl:
                        self.interval = self.minIntvl
                    nextQuery += self.interval  # nextQuery is ns in the paper
                else:
                    # --> predict
                    self.publish[i] = predct

                    # del self.orig
                    # del self.predict
                    # del self.query

                    # if self.isPostProcessing:
                    # self.postProcessing()

    # def postProcessing(self):
    # print len(self.samples), self.samples
    # remainedEps = self.totalBudget - len(self.samples) * self.epsilon
    # self.epsilon = self.epsilon + remainedEps/len(self.samples)
    #
    # # recompute noisy counts
    #     prev = 0
    #     for i in self.samples:
    #         self.publish[i] = self.getCount(self.orig[i], self.epsilon)
    #         if i > prev + 1:
    #             self.publish[prev + 1 : i] = [self.publish[prev]] * (i - prev - 1)
    #         prev = i

    def setR(self, r):
        """Set ``self.R``, the term added to ``P`` in the Kalman gain ``P / (P + R)``."""
        self.R = r

    def setQ(self, q):
        """Set ``self.Q``, the amount added to ``P`` at every prediction step."""
        self.Q = q

    def setCp(self, cp):
        """Set ``self.Cp``, the proportional gain of the PID controller."""
        self.Cp = cp

    def setCi(self, ci):
        """Set ``Ci``, the coefficient ``PID`` applies to the relative
        errors accumulated over its window (the integral term).
        """
        self.Ci = ci

    def setCd(self, cd):
        """Set ``Cd``, the coefficient ``PID`` applies to the change in
        relative error divided by the elapsed time (the derivative term).
        """
        self.Cd = cd

    # prediction step
    def predictKF(self, curr):
        """Kalman prediction step for time step ``curr``.

        The prediction is whatever ``getLastQuery(curr)`` returns, i.e. the
        value published at an earlier queried step. As a side effect the
        estimation-error variance ``P`` is increased by ``Q``.

        Returns the predicted value.
        """
        # Look the prediction up first so that P is left untouched if the
        # lookup raises.
        prior = self.getLastQuery(curr)

        # Project the estimation error forward (Q is the Gaussian noise term).
        self.P += self.Q

        return prior

    # correction step
    def correctKF(self, curr, predict):
        """Kalman correction step for time step ``curr``.

        Blends the prediction with the observation currently stored in
        ``publish[curr]`` using the gain ``K = P / (P + R)``, then shrinks
        the estimation-error variance ``P`` by the factor ``(1 - K)``.

        ``publish[curr]`` is overwritten with the blended value only when
        ``curr > 0``; the very first observation is published as-is. ``K``
        and ``P`` are updated in either case.
        """
        # "+ 0.0" forces float division even when P and R are integers.
        gain = (self.P + 0.0) / (self.P + self.R)
        self.K = gain

        observed = self.publish[curr]
        blended = predict + gain * (observed - predict)

        # Only correct from the second value onwards.
        if curr > 0:
            self.publish[curr] = blended

        # Update the estimation-error variance.
        self.P *= (1 - gain)

    def getLastQuery(self, curr):
        """Return the value published at the latest queried step before ``curr``.

        Scans ``query`` backwards from ``curr - 1`` and returns the
        ``publish`` entry of the first step whose ``query`` flag is truthy.
        If no earlier step is flagged, ``publish[0]`` is returned.

        Raises:
            ValueError: if ``curr <= 0``, because no earlier step exists.
                (Previously this surfaced as an ``UnboundLocalError`` on the
                never-assigned loop variable.)
        """
        if curr <= 0:
            raise ValueError(
                "getLastQuery needs curr >= 1, got %r" % (curr,))

        for i in reversed(range(curr)):
            if self.query[i]:
                return self.publish[i]

        # No queried step found: fall back to the first time step, which is
        # where the backwards scan ends.
        return self.publish[0]

    # adaptive sampling - return feedback error
    def PID(self, curr):
        """Return the PID feedback error at queried time step ``curr``.

        Walks backwards from ``curr`` over the ``windowPID`` most recent
        steps whose ``query`` flag is set. For each one the relative error
        ``|publish - predict| / max(publish, 1)`` is computed, and then:

        * proportional term: ``Cp`` times the error at the newest step;
        * integral term: ``Ci`` times the sum of the errors in the window;
        * derivative term: ``Cd`` times the error difference between the
          two newest steps divided by their distance in time steps.

        If fewer than ``windowPID`` queried steps exist, only the available
        ones are used. With a single available step the derivative term is
        that step's error divided by its index.

        Raises:
            ZeroDivisionError: if the time difference is 0, e.g. when no
                queried step is found at all or the only one is step 0.
        """
        def relError(idx):
            # Relative prediction error at step idx; "0.0 +" forces float
            # division and max(..., 1) avoids dividing by small counts.
            return abs(self.publish[idx] - self.predict[idx]) / (
                0.0 + max(self.publish[idx], 1))

        errSum = 0
        lastValue = 0
        change = 0
        timeDiff = 0
        cursor = curr  # where the backwards search for a queried step starts

        # j runs from windowPID - 1 down to 0 so that the newest queried
        # step is seen with j == windowPID - 1 and the one before it with
        # j == windowPID - 2.
        for j in reversed(range(self.windowPID)):
            index = cursor
            while index >= 0 and not self.query[index]:
                index -= 1
            if index < 0:
                # History exhausted: a negative index would silently wrap
                # around to the end of publish/predict.
                break
            cursor = index - 1  # continue before the step just found

            err = relError(index)
            if j == self.windowPID - 1:
                lastValue = err
                change = err
                timeDiff = index
            elif j == self.windowPID - 2:
                change -= err
                timeDiff -= index
            errSum += err

        return (self.Cp * lastValue + self.Ci * errSum +
                self.Cd * change / (0.0 + timeDiff))