def perturbeCount(p):
    """Perturb every location with polar Laplace noise and aggregate the
    noisy locations into per-cell counts.

    Args:
        p: parameter object providing seed, eps, radius, locDict
           (lid -> location) and the grid info used by coord2CellId.

    Returns:
        dict-like mapping cell id -> number of noisy locations in it.
    """
    # defaultdict(int) so missing cells start at 0; the original used a
    # bare defaultdict() (i.e. a plain dict) plus .get(cellId, 0)
    C_noisy = defaultdict(int)
    differ = Differential(p.seed)
    for lid, loc in p.locDict.iteritems():
        noisyLoc = differ.addPolarNoise(p.eps, loc, p.radius)  # perturbed noisy location
        cellId = coord2CellId(noisyLoc, p)  # obtain cell id from noisy location
        C_noisy[cellId] += 1
    return C_noisy
def __init__(self, data, param):
    """Keep the raw data, seed the differentially-private noise source,
    and create an empty root node carrying the full height budget."""
    self.param = param
    self.differ = Differential(self.param.Seed)
    self.realData = data
    self.mapp = None
    # root starts without a bounding box; it is filled in during build
    self.root = KNode()
    self.root.n_box = None
    self.root.n_budget = Params.maxHeight
def __init__(self, data, param):
    """Store parameters, seed the noise generator, and build a root node
    that owns all data over the full domain [Params.LOW, Params.HIGH]."""
    self.param = param
    self.differ = Differential(self.param.Seed)
    # the root covers the whole domain and holds every data point
    root = KNode()
    root.n_data = data
    root.n_box = np.array([Params.LOW, Params.HIGH])
    root.n_budget = Params.maxHeight
    self.root = root
def testDifferential(self):
    """Smoke test: draw 100 polar-noise offsets around a fixed anchor
    point and print each noisy coordinate as 'lat,lon'."""
    differ = Differential(1000)
    anchor = (34.020412, -118.289936)  # RTH
    radius = 500.0  # default unit is meters
    eps = np.log(2)
    for _ in range(100):
        x, y = differ.getPolarNoise(radius, eps)
        lat = anchor[0] + x * Params.ONE_KM * 0.001
        lon = anchor[1] + y * Params.ONE_KM * 1.2833 * 0.001
        print (str(lat) + ',' + str(lon))
def __init__(self, data, param):
    """Store parameters, seed the noise generator, and create a root
    node covering [param.LOW, param.HIGH] that holds all the data."""
    self.param = param
    self.differ = Differential(self.param.Seed)
    # build the root over the full domain
    node = Node()
    node.n_data = data
    node.n_box = np.array([param.LOW, param.HIGH])
    self.root = node
def __init__(self, param):
    """Initialize the publisher: base parser state, the DP noise source,
    Kalman-filter state, and PID sampling-controller gains."""
    Parser.__init__(self)
    self.param = param
    self.differ = Differential(self.param.Seed)
    self.predict = []
    self.interval = None
    # --- Kalman filter state ---
    self.P = 100        # estimation error covariance (over all time instances)
    self.Q = 1000       # process noise, tuned for synthetic data
    self.R = 1000000    # measurement noise, optimal for alpha = 1, synthetic data
    self.K = 0          # Kalman gain
    # --- PID controller gains (defaults) ---
    self.Cp = 0.9       # proportional: output tracks the current error
    self.Ci = 0.1       # integral: eliminates steady-state offset
    self.Cd = 0.0       # derivative: damps large future errors
    # --- fixed internal knobs ---
    self.theta = 1      # magnitude of interval changes
    self.xi = 0.2       # target error ratio (gamma, 10%)
    self.minIntvl = 1   # sampling interval never drops below 1
    self.windowPID = 5  # I (integration) window
    self.ratioM = 0.2   # sampling rate
def evalDivGeoI(p, D_actual):
    """Evaluate diversity metrics under Geo-Indistinguishability.

    For each (seed, eps) pair: perturb every location with polar Laplace
    noise (budget divided by the sensitivity), snap the noisy point to
    the GEOI grid, accumulate per-cell visitor counters, then compare
    actual vs. noisy diversity with each metric in divMetricList.
    Results are averaged over seeds and written to a text file.

    Args:
        p: experiment parameter object (locDict, locs, domain bounds, ...).
        D_actual: dict of cell id -> actual diversity value.
    """
    exp_name = sys._getframe().f_code.co_name
    logging.info(exp_name)
    res_cube = np.zeros((len(eps_list), len(seed_list), len(divMetricList)))
    sensitivity = p.M  # diversitySensitivity(p.M)
    # cell size (in radians) of the GEOI grid
    u = distance(p.x_min, p.y_min, p.x_max, p.y_min) * 1000.0 / Params.GEOI_GRID_SIZE
    v = distance(p.x_min, p.y_min, p.x_min, p.y_max) * 1000.0 / Params.GEOI_GRID_SIZE
    rad = euclideanToRadian((u, v))
    cell_size = np.array([rad[0], rad[1]])
    for j in range(len(seed_list)):
        p.seed = seed_list[j]
        # FIX: re-seed the noise generator per seed; previously it was
        # built once before the loop so seed_list never affected results
        differ = Differential(p.seed)
        for i in range(len(eps_list)):
            p.eps = eps_list[i]
            eps = p.eps / sensitivity  # per-location budget (loop invariant)
            D_noisy = defaultdict(Counter)
            for lid, loc in p.locDict.iteritems():
                noisyLoc = differ.addPolarNoise(eps, loc, p.radius)  # perturbed noisy location
                # rounded to grid
                roundedPoint = round2Grid(noisyLoc, cell_size, p.x_min, p.y_min)
                cellId = coord2CellId(roundedPoint, p)  # obtain cell id from noisy location
                D_noisy[cellId].update(p.locs[lid])  # update count(userid/freq)
            actual, noisy = [], []
            for cellId, d in D_actual.iteritems():
                actual.append(d)
                # cells with no noisy visitors default to entropy 0
                noisy.append(normalizeDiversity(randomEntropy(len(D_noisy.get(cellId, Counter([]))))))
            for k in range(len(divMetricList)):
                res_cube[i, j, k] = divMetricList[k](actual, noisy)
    res_summary = np.average(res_cube, axis=1)
    np.savetxt(p.resdir + Params.DATASET + "_" + exp_name + '_M' + str(p.M) + '_C' + str(p.C),
               res_summary,
               header="\t".join([f.__name__ for f in divMetricList]),
               fmt='%.4f\t')
def evalCountGeoI(p, C_actual):
    """Evaluate count (frequency) metrics under Geo-Indistinguishability.

    For each (seed, eps) pair: perturb every location with polar Laplace
    noise, snap the noisy point to the GEOI grid, count noisy points per
    cell, then compare actual vs. noisy counts with each metric in
    freqMetricList. Results are averaged over seeds and written to file.

    Args:
        p: experiment parameter object (locDict, domain bounds, ...).
        C_actual: dict of cell id -> actual count.
    """
    exp_name = sys._getframe().f_code.co_name
    logging.info(exp_name)
    res_cube = np.zeros((len(eps_list), len(seed_list), len(freqMetricList)))
    # cell size (in radians) of the GEOI grid
    u = distance(p.x_min, p.y_min, p.x_max, p.y_min) * 1000.0 / Params.GEOI_GRID_SIZE
    v = distance(p.x_min, p.y_min, p.x_min, p.y_max) * 1000.0 / Params.GEOI_GRID_SIZE
    rad = euclideanToRadian((u, v))
    cell_size = np.array([rad[0], rad[1]])
    for j in range(len(seed_list)):
        p.seed = seed_list[j]
        # FIX: re-seed the noise generator per seed; previously it was
        # built once before the loop so seed_list never affected results
        differ = Differential(p.seed)
        for i in range(len(eps_list)):
            p.eps = eps_list[i]
            C_noisy = defaultdict(int)  # int factory: missing cells start at 0
            for lid, loc in p.locDict.iteritems():
                noisyLoc = differ.addPolarNoise(p.eps, loc, p.radius)  # perturbed noisy location
                # rounded to grid
                roundedPoint = round2Grid(noisyLoc, cell_size, p.x_min, p.y_min)
                cellId = coord2CellId(roundedPoint, p)  # obtain cell id from noisy location
                C_noisy[cellId] += 1
            actual, noisy = [], []
            for cellId, c in C_actual.iteritems():
                if c > 0:
                    actual.append(c)
                    noisy.append(C_noisy.get(cellId, Params.DEFAULT_ENTROPY))  # default entropy = 0
            for k in range(len(freqMetricList)):
                res_cube[i, j, k] = freqMetricList[k](actual, noisy)
    res_summary = np.average(res_cube, axis=1)
    # FIX: header must list freqMetricList (the metrics actually used),
    # not divMetricList
    np.savetxt(p.resdir + Params.DATASET + "_" + exp_name + "_m" + str(p.m) + '_r' + str(p.radius),
               res_summary,
               header="\t".join([f.__name__ for f in freqMetricList]),
               fmt='%.4f\t')
def testDifferential():
    """Smoke test: perturb a fixed point (Times Square) 100 times with
    polar Laplace noise and print each noisy coordinate as 'lat,lon'."""
    p = Params(1000)
    p.select_dataset()
    differ = Differential(1000)
    times_square = (40.758890, -73.985100)
    for _ in range(100):
        noisy = differ.addPolarNoise(1.0, times_square, 100)
        print (str(noisy[0]) + ',' + str(noisy[1]))
def __init__(self, data, eps, param, firstGrid=None, use_domain_knowledge=None):
    """Two-level grid.

    When no firstGrid is given, the first-level grid is constructed from
    scratch via the adaptive-grid base class; otherwise the existing
    first-level structure is deep-copied and the new data swapped in.
    """
    self.eps = eps
    self.DOMAIN_KNOWLEDGE = use_domain_knowledge
    if firstGrid is None:
        # this is the first grid --> construct it
        self.first = True
        Grid_adaptiveM.__init__(self, data, eps, param, self.DOMAIN_KNOWLEDGE)
        return
    self.first = False
    self.param = param
    self.differ = Differential(self.param.Seed)
    # reuse the structure of the existing first grid, replacing its data
    self.root = copy.deepcopy(firstGrid.root)
    self.root.n_data = data
class KTree(object):
    """Generic tree template.

    Fanout-4 spatial decomposition over 2D data with differentially
    private (noisy) per-node counts. Concrete subclasses supply the
    budget schedules and the split strategy. Data is assumed to be a
    2-row array (n_data.shape[1] is used as the point count) — layout
    is (dim, npoints); TODO confirm against callers.
    """

    def __init__(self, data, param):
        """Store parameters, seed the noise source, and create a root
        node covering the full domain with the full height budget."""
        self.param = param
        self.differ = Differential(self.param.Seed)
        # ## initialize the root
        self.root = KNode()
        self.root.n_data = data
        self.root.n_box = np.array([Params.LOW, Params.HIGH])
        self.root.n_budget = Params.maxHeight

    def getSplitBudget(self):
        """return a list of h budget values for split"""
        raise NotImplementedError

    def getCountBudget(self):
        """return a list of (h+1) budget values for noisy count"""
        raise NotImplementedError

    def getNoisyMedian(self, array, left, right, epsilon):
        """return the split value of an array"""
        raise NotImplementedError

    def getCoordinates(self, curr):
        """ return the coordinate of lower-right point of the NW sub-node
        and the upper-left point of the SW sub-node and the data points
        in the four subnodes, i.e.
        return (x_nw,y_nw),(x_se,y_se), nw_data, ne_data, sw_data, se_data """
        raise NotImplementedError

    def getSplit(self, array, left, right, epsilon):
        """ return the split point given an array, may be data-independent
        or true median or noisy median, depending on the type of the tree """
        raise NotImplementedError

    def getCount(self, curr, epsilon):
        """ return true count or noisy count of a node, depending on epsilon"""
        if curr.n_data is None:
            count = 0
        else:
            count = curr.n_data.shape[1]  # points are columns
        # near-zero epsilon means "spend no budget": return the true count
        if epsilon < 10 ** (-6):
            return count
        else:
            return count + self.differ.getNoise(1, epsilon)

    def testLeaf(self, curr):
        """ test whether a node should be a leaf node """
        # leaf when: max depth reached, split budget exhausted, node is
        # empty, or its (noisy) count is below the partition threshold
        if (curr.n_depth == Params.maxHeight) or \
                (curr.n_budget <= 0) or \
                (curr.n_data is None or curr.n_data.shape[1] == 0) or \
                (curr.n_count <= self.param.minPartSize):
            return True
        return False

    def cell_setLeaf(self, curr):
        """ will be overrided in kd_cell """
        return

    def buildIndex(self):
        """ Function to build the tree structure, fanout = 4 by default
        for spatial (2D) data """
        budget_c = self.getCountBudget()
        self.root.n_count = self.getCount(self.root, budget_c[0])  # ## add noisy count to root
        stack = deque()
        stack.append(self.root)
        nleaf = 0  # ## leaf counter
        max_depth = -1
        # ## main loop (breadth-first: popleft on a deque)
        while len(stack) > 0:
            curr = stack.popleft()
            if curr.n_depth > max_depth:
                max_depth = curr.n_depth
            if self.testLeaf(curr) is True:  # ## curr is a leaf node
                if curr.n_depth < Params.maxHeight:
                    # ## if a node ends up earlier than maxHeight, it should be
                    # ## able to use the remaining count budget
                    remainingEps = sum(budget_c[curr.n_depth + 1:])
                    curr.n_count = self.getCount(curr, remainingEps)
                nleaf += 1
                curr.n_isLeaf = True
                self.cell_setLeaf(curr)
            else:  # ## curr needs to split
                curr.n_budget -= 1  # ## some budget will be used regardless the split is successful or not
                tmp = self.getCoordinates(curr)
                nw_node, ne_node, sw_node, se_node = KNode(), KNode(), KNode(), KNode()  # create sub-nodes
                nw_coord, ne_coord, nw_node.n_data, ne_node.n_data, sw_node.n_data, se_node.n_data = tmp
                x_nw, y_nw = nw_coord
                x_se, y_se = ne_coord
                # ## update bounding box, depth, count, budget for the four subnodes
                nw_node.n_box = np.array([[curr.n_box[0, 0], y_nw], [x_nw, curr.n_box[1, 1]]])
                ne_node.n_box = np.array([[x_nw, y_se], [curr.n_box[1, 0], curr.n_box[1, 1]]])
                sw_node.n_box = np.array([[curr.n_box[0, 0], curr.n_box[0, 1]], [x_se, y_nw]])
                se_node.n_box = np.array([[x_se, curr.n_box[0, 1]], [curr.n_box[1, 0], y_se]])
                for sub_node in [nw_node, ne_node, sw_node, se_node]:
                    sub_node.n_depth = curr.n_depth + 1
                    # if (sub_node.n_depth == Params.maxHeight and sub_node.n_data is not None):
                    #     print len(sub_node.n_data[0])
                    sub_node.n_count = self.getCount(sub_node, budget_c[sub_node.n_depth])
                    sub_node.n_budget = curr.n_budget
                    stack.append(sub_node)
                curr.n_data = None  # ## do not need the data points coordinates now
                curr.nw, curr.ne, curr.sw, curr.se = nw_node, ne_node, sw_node, se_node
        # end of while
        logging.debug("number of leaves: %d" % nleaf)
        logging.debug("max depth: %d" % max_depth)

    def rect_intersect(self, hrect, query):
        """ checks if the hyper-rectangle intersects with the
        hyper-rectangle defined by the query in every dimension """
        # separated along some axis <=> no intersection
        bool_m1 = query[0, :] >= hrect[1, :]
        bool_m2 = query[1, :] <= hrect[0, :]
        bool_m = np.logical_or(bool_m1, bool_m2)
        if np.any(bool_m):
            return False
        else:
            return True

    def rangeCount(self, query):
        """ Query answering function. Find the number of data points
        within a query rectangle. """
        stack = deque()
        stack.append(self.root)
        count = 0.0
        # ## Below are three variables recording the number of 1) whole leaf
        # ## 2) partial leaf 3) whole internal node, respectively, which
        # ## contribute to the query answer. For debug purpose only.
        l_whole, l_part, i_whole = 0, 0, 0
        while len(stack) > 0:
            curr = stack.popleft()
            _box = curr.n_box
            if curr.n_isLeaf is True:
                frac = 1
                if self.rect_intersect(_box, query):
                    # uniformity assumption: scale leaf count by overlap fraction
                    for i in range(_box.shape[1]):
                        if _box[1, i] == _box[0, i] or Params.WorstCase == True:
                            frac *= 1
                        else:
                            frac *= (min(query[1, i], _box[1, i]) - max(query[0, i], _box[0, i])) / (
                                _box[1, i] - _box[0, i])
                    count += curr.n_count * frac
                    if 1.0 - frac < 10 ** (-6):
                        l_whole += 1
                    else:
                        l_part += 1
            else:  # ## if not leaf
                bool_matrix = np.zeros((2, query.shape[1]))
                bool_matrix[0, :] = query[0, :] <= _box[0, :]
                bool_matrix[1, :] = query[1, :] >= _box[1, :]
                if np.all(bool_matrix) and self.param.useLeafOnly is False:  # ## if query range contains node range
                    count += curr.n_count
                    i_whole += 1
                else:
                    # descend only into children that overlap the query
                    if self.rect_intersect(curr.nw.n_box, query):
                        stack.append(curr.nw)
                    if self.rect_intersect(curr.ne.n_box, query):
                        stack.append(curr.ne)
                    if self.rect_intersect(curr.sw.n_box, query):
                        stack.append(curr.sw)
                    if self.rect_intersect(curr.se.n_box, query):
                        stack.append(curr.se)
        return float(count)  # , i_whole, l_whole, l_part

    def adjustConsistency(self):
        """ Post processing for uniform noise across levels.
        Due to Michael Hay, Vibhor Rastogi, Gerome Miklau, Dan Suciu,
        Boosting the Accuracy of Differentially-Private Histograms
        Through Consistency, VLDB 2010 """
        logging.debug('adjusting consistency...')
        # ## upward pass
        self.root.get_z()
        # ## downward pass: spread each parent/children mismatch evenly
        # ## over the four children
        queue = deque()
        queue.append(self.root)
        while len(queue) > 0:
            curr = queue.popleft()
            if curr.n_isLeaf is False:
                adjust = (curr.n_count - curr.nw.n_count - curr.ne.n_count -
                          curr.sw.n_count - curr.se.n_count) / 4.0
                for subnode in [curr.nw, curr.ne, curr.sw, curr.se]:
                    subnode.n_count += adjust
                    queue.append(subnode)

    def postProcessing(self):
        """ Post processing for general noise distribution across levels.
        Due to G. Cormode, M. Procopiuc, E. Shen, D. Srivastava and T. Yu,
        Differentially Private Spatial Decompositions, ICDE 2012. """
        logging.debug("post-processing...")
        budget = self.getCountBudget()  # ## count budget for h+1 levels
        H = Params.maxHeight
        # ## Phase 1 (top-down): weight counts by squared budget
        queue = deque()
        self.root.n_count *= budget[self.root.n_depth] ** 2
        queue.append(self.root)
        while len(queue) > 0:
            curr = queue.popleft()
            if curr.n_isLeaf is False:
                for subnode in [curr.nw, curr.ne, curr.sw, curr.se]:
                    subnode.n_count = curr.n_count + subnode.n_count * (budget[subnode.n_depth] ** 2)
                    queue.append(subnode)
        # ## Phase 2 (bottom-up)
        self.root.update_count()
        # ## Phase 3 (top-down): normalize with level-dependent factors
        queue = deque()
        E_root = 0
        for i in range(H + 1):
            E_root += 4 ** i * budget[H - i] * budget[H - i]
        self.root.n_count /= E_root
        self.root.n_F = 0
        queue.append(self.root)
        while len(queue) > 0:
            curr = queue.popleft()
            if curr.n_isLeaf is False:
                h = H - curr.n_depth - 1  # ## height of curr's children
                E_h = 0
                for i in range(h + 1):
                    E_h += 4 ** i * budget[H - i] * budget[H - i]
                for subnode in [curr.nw, curr.ne, curr.sw, curr.se]:
                    subnode.n_F = curr.n_F + curr.n_count * (budget[curr.n_depth] ** 2)
                    subnode.n_count = (subnode.n_count - 4 ** h * subnode.n_F) / E_h
                    queue.append(subnode)

    def pruning(self):
        """ If the tree is grown without the stopping condition of
        minLeafSize, prune it here after post processing """
        logging.debug("pruning...")
        queue = deque()
        queue.append(self.root)
        while len(queue) > 0:
            curr = queue.popleft()
            if curr.n_isLeaf is False:
                # collapse internal nodes whose (noisy) count is too small
                if curr.n_count <= self.param.minPartSize:
                    curr.n_isLeaf = True
                else:
                    queue.append(curr.nw)
                    queue.append(curr.ne)
                    queue.append(curr.sw)
                    queue.append(curr.se)
class Hilbert(Kd_standard):
    """ Hilbert R-tree.

    Maps 2D points onto a 1D Hilbert curve of order param.Res, builds a
    1D split tree over the sorted Hilbert values, and finally decodes
    leaves back to 2D bounding boxes.
    """

    def __init__(self, data, param):
        # realData keeps the raw (dim, npoints) array; root data is
        # filled with Hilbert values during buildIndex
        self.param = param
        self.differ = Differential(self.param.Seed)
        self.root = KNode()
        self.realData = data
        self.root.n_budget = Params.maxHeight

    def h_encode(self, x, y, r):
        """ (x,y) -> value h in Hilbert space, r is the resolution of the
        Hilbert curve """
        # bitwise Hilbert encoding; operates on r-bit integer coordinates
        mask = (1 << r) - 1
        heven = x ^ y
        notx = ~x & mask
        noty = ~y & mask
        temp = notx ^ y
        v0, v1 = 0, 0
        for k in range(r - 1):
            v1 = ((v1 & heven) | ((v0 ^ noty) & temp)) >> 1
            v0 = ((v0 & (v1 ^ notx)) | (~v0 & (v1 ^ noty))) >> 1
        hodd = (~v0 & (v1 ^ x)) | (v0 & (v1 ^ noty))
        return self.interleaveBits(hodd, heven)

    def h_decode(self, h, r):
        """ h -> (x,y) """
        # inverse of h_encode
        heven, hodd = self.deleaveBits(h)
        mask = (1 << r) - 1
        v0, v1 = 0, 0
        temp1 = ~(heven | hodd) & mask
        temp0 = ~(heven ^ hodd) & mask
        for k in range(r - 1):
            v1 = (v1 ^ temp1) >> 1
            v0 = (v0 ^ temp0) >> 1
        return (v0 & ~heven) ^ v1 ^ hodd, (v0 | heven) ^ v1 ^ hodd

    def interleaveBits(self, hodd, heven):
        # merge two bit streams: hodd supplies odd bit positions,
        # heven supplies even bit positions
        val = 0
        maxx = max(hodd, heven)
        n = 0
        while maxx > 0:
            n += 1
            maxx >>= 1
        for i in range(n):
            bitMask = 1 << i
            a = 1 << (2 * i) if (heven & bitMask) else 0
            b = 1 << (2 * i + 1) if (hodd & bitMask) else 0
            val += a + b
        return val

    def deleaveBitsOdd(self, x):
        # gather the even-position bits of a 64-bit value into the low half
        x &= 0x5555555555555555
        x = (x | (x >> 1)) & 0x3333333333333333
        x = (x | (x >> 2)) & 0x0F0F0F0F0F0F0F0F
        x = (x | (x >> 4)) & 0x00FF00FF00FF00FF
        x = (x | (x >> 8)) & 0x0000FFFF0000FFFF
        x = (x | (x >> 16)) & 0x00000000FFFFFFFF
        return x

    def deleaveBits(self, x):
        # inverse of interleaveBits: returns (even-bit stream, odd-bit stream)
        return self.deleaveBitsOdd(x), self.deleaveBitsOdd(x >> 1)

    def get_Hcoord(self, x, y, R):
        # real coordinates -> integer Hilbert grid coordinates in [0, 2^R)
        # (the tiny 1e-8 pad keeps the domain maximum inside the grid)
        hx = int((x - Params.LOW[0]) / (Params.HIGH[0] - Params.LOW[0] + 10 ** (-8)) * (2 ** R))
        hy = int((y - Params.LOW[1]) / (Params.HIGH[1] - Params.LOW[1] + 10 ** (-8)) * (2 ** R))
        return hx, hy

    def get_Rcoord(self, hx, hy, R):
        # integer Hilbert grid coordinates -> real coordinates (cell corner)
        x = float(hx) / (2 ** R) * (Params.HIGH[0] - Params.LOW[0]) + Params.LOW[0]
        y = float(hy) / (2 ** R) * (Params.HIGH[1] - Params.LOW[1]) + Params.LOW[1]
        return x, y

    def getCount(self, curr, epsilon):
        # node data here is a 1D array of Hilbert values, hence len()
        count = len(curr.n_data)
        if epsilon < 10 ** (-6):
            return count
        else:
            return count + self.differ.getNoise(1, epsilon)

    def testLeaf(self, curr):
        """ test whether a node should be a leaf node """
        if (curr.n_depth == Params.maxHeight) or \
                (curr.n_budget <= 0) or \
                (curr.n_count <= self.param.minPartSize):
            return True
        return False

    def buildIndex(self):
        # Build the tree over Hilbert-encoded, sorted 1D values, then
        # decode leaves back to 2D boxes and propagate boxes bottom-up.
        budget_c = self.getCountBudget()
        logging.debug('encoding coordinates...')
        RES = self.param.Res  # order of Hilbert curve
        ndata = self.realData.shape[1]
        hidx = np.zeros(ndata)
        for i in range(ndata):
            hx, hy = self.get_Hcoord(self.realData[0, i], self.realData[1, i], RES)
            hidx[i] = self.h_encode(hx, hy, RES)
        hidx = np.sort(hidx)
        logging.debug('building index...')
        self.root.n_data = hidx
        self.root.n_box = (0, 2 ** (2 * RES) - 1)  # 1D interval in Hilbert space
        self.root.n_count = self.getCount(self.root, budget_c[0])
        stack = deque()
        stack.append(self.root)
        tree = [self.root]
        leaf_li = []  # storage of all leaves
        nleaf = 0  # leaf counter
        max_depth = -1
        while len(stack) > 0:
            curr = stack.popleft()
            if curr.n_depth > max_depth:
                max_depth = curr.n_depth
            if self.testLeaf(curr) is True:  # curr is a leaf node
                if curr.n_depth < Params.maxHeight:
                    remainingEps = sum(budget_c[curr.n_depth + 1:])
                    curr.n_count = self.getCount(curr, remainingEps)
                nleaf += 1
                curr.n_isLeaf = True
                leaf_li.append(curr)
            else:  # curr needs to split
                curr.n_budget -= 1
                tmp = self.getCoordinates(curr)
                if tmp is False:  # if split fails, retry (budget already charged)
                    stack.append(curr)
                    continue
                nw_node, ne_node, sw_node, se_node = KNode(), KNode(), KNode(), KNode()  # create sub-nodes
                split_prm, split_sec1, split_sec2, nw_node.n_data, ne_node.n_data, sw_node.n_data, se_node.n_data = tmp
                # children partition the parent's 1D Hilbert interval
                nw_node.n_box = (curr.n_box[0], split_sec1)
                ne_node.n_box = (split_sec1, split_prm)
                sw_node.n_box = (split_prm, split_sec2)
                se_node.n_box = (split_sec2, curr.n_box[1])
                for sub_node in [nw_node, ne_node, sw_node, se_node]:
                    sub_node.n_depth = curr.n_depth + 1
                    sub_node.n_count = self.getCount(sub_node, budget_c[sub_node.n_depth])
                    sub_node.n_budget = curr.n_budget
                    stack.append(sub_node)
                    tree.append(sub_node)
                curr.n_data = None
                curr.nw, curr.ne, curr.sw, curr.se = nw_node, ne_node, sw_node, se_node
        # end of while
        logging.debug("number of leaves: %d" % nleaf)
        logging.debug("max depth: %d" % max_depth)
        # # convert hilbert values in leaf nodes to real coordinates and
        # # update bounding box
        logging.debug('decoding and updating bounding box...')
        for leaf in leaf_li:
            # start from an inverted box so the first point initializes it
            bbox = np.array([[1000.0, 1000.0], [-1000.0, -1000.0]], dtype='float64')
            for hvalue in leaf.n_data:
                hx, hy = self.h_decode(int(hvalue), RES)
                x, y = self.get_Rcoord(hx, hy, RES)
                bbox[0, 0] = x if x < bbox[0, 0] else bbox[0, 0]
                bbox[1, 0] = x if x > bbox[1, 0] else bbox[1, 0]
                bbox[0, 1] = y if y < bbox[0, 1] else bbox[0, 1]
                bbox[1, 1] = y if y > bbox[1, 1] else bbox[1, 1]
            leaf.n_box = bbox
        # # update bounding box bottom-up (deepest nodes first; note the
        # # Python-2-only `cmp=` keyword)
        tree = sorted(tree, cmp=self.cmp_node)
        logging.debug('updating box for each node in the tree...')
        for node in tree:
            if node.n_data is None:  # internal node: union of children boxes
                node.n_box = np.zeros((2, 2))
                node.n_box[0, 0] = min(node.ne.n_box[0, 0], node.nw.n_box[0, 0],
                                       node.se.n_box[0, 0], node.sw.n_box[0, 0])
                node.n_box[0, 1] = min(node.ne.n_box[0, 1], node.nw.n_box[0, 1],
                                       node.se.n_box[0, 1], node.sw.n_box[0, 1])
                node.n_box[1, 0] = max(node.ne.n_box[1, 0], node.nw.n_box[1, 0],
                                       node.se.n_box[1, 0], node.sw.n_box[1, 0])
                node.n_box[1, 1] = max(node.ne.n_box[1, 1], node.nw.n_box[1, 1],
                                       node.se.n_box[1, 1], node.sw.n_box[1, 1])

    def cmp_node(self, node1, node2):
        # reverse order (deeper nodes sort first)
        return int(node2.n_depth - node1.n_depth)

    def getCoordinates(self, curr):
        """Split the node's sorted 1D Hilbert data into four parts:
        a primary split, then a secondary split on each half.
        Returns False if any split lands on a boundary (split failed)."""
        budget_s = self.getSplitBudget()
        _data = curr.n_data
        _ndata = len(_data)
        split_prm = self.getSplit(_data, curr.n_box[0], curr.n_box[1], budget_s[curr.n_depth] / 2)
        pos_1 = np.searchsorted(_data, split_prm)
        if pos_1 == 0 or pos_1 == _ndata:
            return False
        data_1 = _data[:pos_1]
        data_2 = _data[pos_1:]
        split_sec1 = self.getSplit(data_1, curr.n_box[0], split_prm, budget_s[curr.n_depth] / 2)
        split_sec2 = self.getSplit(data_2, split_prm, curr.n_box[1], budget_s[curr.n_depth] / 2)
        pos_sec1 = np.searchsorted(data_1, split_sec1)
        pos_sec2 = np.searchsorted(data_2, split_sec2)
        if pos_sec1 == 0 or pos_sec1 == len(data_1) or pos_sec2 == 0 or pos_sec2 == len(data_2):
            return False
        nw_data, ne_data, sw_data, se_data = data_1[:pos_sec1], data_1[pos_sec1:], data_2[:pos_sec2], data_2[pos_sec2:]
        return split_prm, split_sec1, split_sec2, nw_data, ne_data, sw_data, se_data
class KalmanFilterPID(Parser):
    """ generated source for class KalmanFilterPID.

    Publishes a time series of counts under differential privacy using a
    Kalman filter for estimation and a PID controller to adapt the
    sampling rate (which time instances are queried with real budget).
    """
    # sampling rate

    def __init__(self, param):
        """ generated source for method __init__ """
        Parser.__init__(self)
        self.param = param
        self.differ = Differential(self.param.Seed)
        self.predict = []
        self.interval = None
        # Kalman Filter params
        self.P = 100  # estimation error covariance (over all time instance)
        self.Q = 1000  # process noise synthetic data
        self.R = 1000000  # measurement noise optimal for alpha = 1, synthetic data
        self.K = 0  # kalman gain
        # PID control params - default
        self.Cp = 0.9  # proportional gain, to keep output proportional to current error
        self.Ci = 0.1  # integral gain, to eliminate offset
        self.Cd = 0.0  # derivative gain, to ensure stability - prevent large error in future
        # fixed internally
        self.theta = 1  # magnitude of changes
        self.xi = 0.2  # gamma (10%)
        self.minIntvl = 1  # make sure the interval is greater than 1
        self.windowPID = 5  # I(integration) window
        self.ratioM = 0.2  # sampling rate
        # self.isSampling = False

    def adjustParams(self):
        # adjust params: theta shrinks as the sampling ratio grows
        if self.ratioM < 0.1:
            self.theta = 20
        if 0.1 <= self.ratioM < 0.2:
            self.theta = 14
        if 0.2 <= self.ratioM < 0.3:
            self.theta = 2
        if 0.3 <= self.ratioM < 0.4:
            self.theta = 0.5
        if 0.4 <= self.ratioM < 0.5:
            self.theta = 0.3
        if 0.5 <= self.ratioM:
            self.theta = 0.1

    # test
    @classmethod
    def main(self, args):
        """ generated source for method main.

        CLI driver: reads a series, publishes it with the given budget
        and process variance, writes the result and prints statistics.
        NOTE(review): `eval(args[3])` runs arbitrary input — parse with
        float() instead if the argument can be untrusted.
        """
        if len(args) < 5:
            print "Usage: python KalmanFilterPID.py input output privacy-budget process-variance Cp(optional) Ci(optional) Cd(optional)"
            sys.exit()
        output = open(args[2], "w")
        budget = eval(args[3])
        Q = float(args[4])
        if budget <= 0 or Q <= 0:
            print "Usage: privacy-budget AND process-variance are positive values"
            sys.exit()
        p = Params(1000)
        kfPID = KalmanFilterPID(p)
        kfPID.setTotalBudget(budget)
        kfPID.setQ(Q)
        kfPID.orig = Parser.getData(args[1])
        kfPID.publish = [None] * len(kfPID.orig)
        # adjust R based on T and alpha
        kfPID.setR(len(kfPID.orig) * len(kfPID.orig) / (0.0 + budget * budget))
        # set optional control gains; Cp + Ci + Cd is clamped to sum to 1
        # NOTE(review): args are strings here — comparisons like d > 1
        # rely on Python-2 mixed-type comparison; confirm intended.
        if len(args) >= 6:
            d = args[5]
            if d > 1:
                d = 1
            kfPID.setCp(d)
        if len(args) >= 7:
            d = args[6]
            if d + kfPID.Cp > 1:
                d = 1 - kfPID.Cp
            kfPID.setCi(d)
        else:
            kfPID.setCi(1 - kfPID.Cp)
        if len(args) >= 8:
            d = args[7]
            if d + kfPID.Cp + kfPID.Ci > 1:
                d = 1 - kfPID.Cp - kfPID.Ci
            kfPID.setCd(d)
        else:
            kfPID.setCd(1 - kfPID.Cp - kfPID.Ci)
        # kfPID.adjustParams()
        start = time.time()
        kfPID.publishCounts()
        end = time.time()
        Parser.outputData(output, kfPID.publish)
        print "Method:\tKalman Filter with Adaptive Sampling"
        print "Data Series Length:\t" + str(len(kfPID.orig))
        print "Queries Issued:\t" + str(kfPID.query.count(1))
        print "Privacy Budget Used:\t" + str(kfPID.query.count(1) * kfPID.epsilon)
        print "Average Relative Error:\t" + str(kfPID.getRelError())
        print "Time Used (in second):\t" + str(end - start)

    def kalmanFilter(self, orig, budget, samplingRate=None):
        """Publish the series `orig` under total `budget`. If a
        samplingRate is given, adaptive sampling is enabled with that
        ratio; otherwise every instance is queried."""
        self.totalBudget = budget
        self.orig = orig
        if samplingRate is not None:
            self.isSampling = True
            self.ratioM = samplingRate
        else:
            self.isSampling = False
        # self.adjustParams()
        self.publish = [None] * len(self.orig)
        # adjust R based on T and alpha
        self.setR(len(self.orig) * len(self.orig) / (0.0 + budget * budget))
        self.publishCounts()
        return self.publish

    def getCount(self, value, epsilon):
        """ return true count or noisy count of a node, depending on
        epsilon. Note that the noisy count can be negative """
        if epsilon < 10 ** (-8):
            return value
        else:
            return value + self.differ.getNoise(1, epsilon)  # sensitivity is 1

    # data publication procedure
    def publishCounts(self):
        """ generated source for method publish.

        For each time instance: either query the true (noised) count and
        correct the Kalman estimate, or publish the Kalman prediction.
        When sampling is on, the PID feedback error adapts the interval
        between queries.
        """
        self.query = BitArray(len(self.orig))  # 1 marks queried instances
        self.predict = [None] * len(self.orig)
        # recalculate individual budget based on M
        if (self.isSampling):
            M = int(self.ratioM * (len(self.orig)))  # 0.25 optimal percentile
        else:
            M = len(self.orig)
        if M <= 0:
            M = 1
        self.epsilon = (self.totalBudget + 0.0) / M  # per-query budget
        # error = 0
        self.interval = 1
        nextQuery = max(1, self.windowPID) + self.interval - 1
        for i in range(len(self.orig)):
            if i == 0:
                # the first time instance is always queried
                self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                self.query[i] = 1
                self.correctKF(i, 0)
            else:
                predct = self.predictKF(i)
                self.predict[i] = predct
                if self.query.count(1) < self.windowPID and self.query.count(1) < M:
                    # i is NOT the sampling point: still warming up the
                    # PID window, so query anyway
                    self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                    self.query[i] = 1
                    # update count using observation
                    self.correctKF(i, predct)
                elif i == nextQuery and self.query.count(1) < M:
                    # if i is the sampling point
                    # query
                    self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                    self.query[i] = 1
                    # update count using observation
                    self.correctKF(i, predct)
                    # update freq
                    if (self.isSampling):
                        ratio = self.PID(i)
                        frac = min(20, (ratio - self.xi) / self.xi)
                        deltaI = self.theta * (1 - math.exp(frac))
                        # probabilistic rounding of the fractional part
                        deltaI = int(deltaI) + (random.random() < deltaI - int(deltaI))
                        self.interval += deltaI
                    else:
                        self.interval = 1
                    if self.interval < self.minIntvl:
                        self.interval = self.minIntvl
                    nextQuery += self.interval  # nextQuery is ns in the paper
                else:  # --> predict
                    self.publish[i] = predct
        # del self.orig
        # del self.predict
        # del self.query
        # if self.isPostProcessing:
        #     self.postProcessing()

    # def postProcessing(self):
    #     print len(self.samples), self.samples
    #     remainedEps = self.totalBudget - len(self.samples) * self.epsilon
    #     self.epsilon = self.epsilon + remainedEps/len(self.samples)
    #
    #     # recompute noisy counts
    #     prev = 0
    #     for i in self.samples:
    #         self.publish[i] = self.getCount(self.orig[i], self.epsilon)
    #         if i > prev + 1:
    #             self.publish[prev + 1 : i] = [self.publish[prev]] * (i - prev - 1)
    #         prev = i

    def setR(self, r):
        """ generated source for method setR """
        self.R = r

    def setQ(self, q):
        """ generated source for method setQ """
        self.Q = q

    def setCp(self, cp):
        """ generated source for method setCp """
        self.Cp = cp

    def setCi(self, ci):
        """ generated source for method setCi """
        self.Ci = ci

    def setCd(self, cd):
        """ generated source for method setCd """
        self.Cd = cd

    # prediction step
    def predictKF(self, curr):
        """ generated source for method predictKF """
        # predict using Kalman Filter: the prior is the last queried value
        lastValue = self.getLastQuery(curr)
        # project estimation error
        self.P += self.Q  # Q is gaussian noise
        return lastValue

    # correction step
    def correctKF(self, curr, predict):
        """ generated source for method correctKF """
        self.K = (self.P + 0.0) / (self.P + self.R)  # Kalman gain
        correct = predict + self.K * (self.publish[curr] - predict)
        # publish[curr] = Math.max((int) correct, 0)
        if curr > 0:  # only correct from 2nd values
            self.publish[curr] = correct
        # print correct, "\t", self.publish[curr], self.K, self.P
        # update estimation error variance
        self.P *= (1 - self.K)

    def getLastQuery(self, curr):
        """ generated source for method getLastQuery """
        # scan backwards for the most recent queried instance
        # NOTE(review): if curr == 0 the loop body never runs and `i` is
        # unbound — callers appear to pass curr >= 1; confirm.
        for i in reversed(range(curr)):
            if self.query[i]:
                break
        return self.publish[i]

    # adaptive sampling - return feedback error
    def PID(self, curr):
        """ generated source for method PID """
        # combine proportional / integral / derivative terms of the
        # relative error over the last windowPID queried instances
        sum = 0
        lastValue = 0
        change = 0
        timeDiff = 0
        next = curr
        # NOTE(review): reversed(range(windowPID - 1)) yields at most
        # windowPID - 2, so the `j == windowPID - 1` branch below never
        # fires and lastValue stays 0 — looks like an off-by-one; confirm
        # against the original paper/Java source.
        for j in reversed(range(self.windowPID - 1)):
            index = next
            while index >= 0:
                if self.query[index]:
                    next = index - 1  # the last nextQuery
                    break
                index -= 1
            if j == self.windowPID - 1:
                lastValue = abs(self.publish[index] - self.predict[index]) / (0.0 + max(self.publish[index], 1))
                change = abs(self.publish[index] - self.predict[index]) / (0.0 + max(self.publish[index], 1))
                timeDiff = index
            if j == self.windowPID - 2:
                change -= abs(self.publish[index] - self.predict[index]) / (0.0 + max(self.publish[index], 1))
                timeDiff -= index
            sum += (abs(self.publish[index] - self.predict[index]) / (0.0 + max(self.publish[index], 1)))
        ratio = self.Cp * lastValue + self.Ci * sum + self.Cd * change / (0.0 + timeDiff)
        return ratio
class Generic(object):
    """ Generic data structure, used for both htree and grid.

    A high-fanout, low-depth decomposition where each internal node
    splits one dimension at a set of split points. Data is a 2-row
    array (n_data.shape[1] is the point count) — layout (dim, npoints);
    TODO confirm against callers.
    """

    def __init__(self, data, param):
        self.param = param
        self.differ = Differential(self.param.Seed)
        # initialize the root
        self.root = Node()
        # self.children = []  # all level 2 grids
        self.root.n_data = data
        self.root.n_box = np.array([param.LOW, param.HIGH])

    def getEqualSplit(self, partitions, min, max):
        """return equal split points, including both ends"""
        if min > max:
            logging.debug("getEqualSplit: Error: min > max")
        if partitions <= 1:
            return [min, max]
        return [min + (max - min) * i / partitions for i in range(partitions + 1)]

    def getCountBudget(self):
        """return noisy count budget for different levels of the indices"""
        raise NotImplementedError

    def getCoordinates(self, curr):
        """return the split dimension, the split points and the data
        points in each subnodes"""
        raise NotImplementedError

    def getCount(self, curr, epsilon):
        """ return true count or noisy count of a node, depending on
        epsilon. Note that the noisy count can be negative """
        if curr.n_data is None:
            count = 0
        else:
            count = curr.n_data.shape[1]  # points are columns
        # near-zero epsilon means "spend no budget": return the true count
        if epsilon < 10 ** (-6):
            return count
        else:
            return count + self.differ.getNoise(1, epsilon)

    def testLeaf(self, curr):
        """test whether a node is a leaf node"""
        raise NotImplementedError

    def intersect(self, hrect, query):
        """ checks if the hyper-rectangle intersects with the
        hyper-rectangle defined by the query in every dimension """
        # separated along some axis <=> no intersection
        bool_m1 = query[0, :] >= hrect[1, :]
        bool_m2 = query[1, :] <= hrect[0, :]
        bool_m = np.logical_or(bool_m1, bool_m2)
        if np.any(bool_m):
            return False
        else:
            return True

    def buildIndex(self):
        """build the htree & grid structure.
        htree is a high fanout and low level tree"""
        budget_c = self.getCountBudget()  # an array with two elements
        self.root.n_count = self.getCount(self.root, 0)  # add noisy count to the root
        queue = deque()
        queue.append(self.root)
        nleaf = 0  # number of leaf node, for debug only
        # ## main loop
        while len(queue) > 0:
            curr = queue.popleft()
            if self.testLeaf(curr) is True:  # if curr is a leaf node
                if curr.n_depth < self.param.maxHeightHTree:
                    # early leaf: spend the remaining count budget on it
                    remainingEps = sum(budget_c[curr.n_depth:])
                    curr.n_count = self.getCount(curr, remainingEps)
                    curr.eps = remainingEps
                nleaf += 1
                curr.n_isLeaf = True
            else:  # curr needs to split
                split_arr, n_data_arr = self.getCoordinates(curr)
                if split_arr is None:
                    # split failed: treat curr as a leaf
                    if curr.n_depth < self.param.maxHeightHTree:
                        remainingEps = sum(budget_c[curr.n_depth:])
                        curr.n_count = self.getCount(curr, remainingEps)
                        curr.eps = remainingEps
                    nleaf += 1
                    curr.n_isLeaf = True
                    curr.children = []
                    continue  # if the first level cell is leaf node
                for i in range(len(n_data_arr)):
                    node = Node()
                    # split axis alternates with depth
                    if curr.n_depth % Params.NDIM == 0:  # split by x coord
                        node.n_box = np.array([[split_arr[i], curr.n_box[0, 1]],
                                               [split_arr[i + 1], curr.n_box[1, 1]]])
                    else:  # split by y coord
                        node.n_box = np.array([[curr.n_box[0, 0], split_arr[i]],
                                               [curr.n_box[1, 0], split_arr[i + 1]]])
                    node.index = i
                    node.parent = curr
                    node.n_depth = curr.n_depth + 1
                    node.n_data = n_data_arr[i]
                    node.n_count = self.getCount(node, budget_c[node.n_depth])
                    node.eps = budget_c[node.n_depth]
                    if curr.n_depth == 2:
                        node.secondLevelPartitions = curr.secondLevelPartitions
                    curr.children.append(node)
                    queue.append(node)
                # if curr.n_depth == 2:
                #     self.children.append(curr)
                curr.n_data = None  # ## do not need the data points coordinates now
        # end of while
        logging.debug("Generic: number of leaves: %d" % nleaf)

    # canonical range query does apply
    def rangeCount(self, query):
        """ Query answering function. Find the number of data points
        within a query rectangle.
        This function assume that the tree is contructed with noisy
        count for every node """
        queue = deque()
        queue.append(self.root)
        count = 0.0
        while len(queue) > 0:
            curr = queue.popleft()
            _box = curr.n_box
            if curr.n_isLeaf is True:
                frac = 1
                if self.intersect(_box, query):
                    # uniformity assumption: scale leaf count by overlap fraction
                    for i in range(_box.shape[1]):
                        if _box[1, i] == _box[0, i] or Params.WorstCase == True:
                            frac *= 1
                        else:
                            frac *= (min(query[1, i], _box[1, i]) - max(query[0, i], _box[0, i])) / (
                                _box[1, i] - _box[0, i])
                    count += curr.n_count * frac
            else:  # if not leaf
                for node in curr.children:
                    # NOTE(review): the containment/intersection tests below
                    # use the PARENT's box (_box), not node.n_box — looks
                    # like it should test each child's own box; confirm.
                    bool_matrix = np.zeros((2, query.shape[1]))
                    bool_matrix[0, :] = query[0, :] <= _box[0, :]
                    bool_matrix[1, :] = query[1, :] >= _box[1, :]
                    if np.all(bool_matrix):  # if query range contains node range
                        count += node.n_count
                    elif self.intersect(_box, query):
                        queue.append(node)
        return float(count)

    def leafCover(self, loc):
        """ find a leaf node that cover the location """
        queue = deque()
        queue.append(self.root)
        while len(queue) > 0:
            curr = queue.popleft()
            _box = curr.n_box
            if curr.n_isLeaf is True:
                if is_rect_cover(_box, loc):
                    return curr
            else:  # if not leaf
                queue.extend(curr.children)

    def checkCorrectness(self, node, nodePoints=None):
        """ Total number of data points of all leaf nodes should equal
        to the total data points """
        totalPoints = 0
        if node is None:
            return 0
        if node.n_isLeaf and node.n_data is not None:
            return node.n_data.shape[1]
        for child in node.children:
            totalPoints += self.checkCorrectness(child)
        if nodePoints is None:
            return totalPoints
        if totalPoints == nodePoints:
            return True
        return False
def main(): # # Need to parse command line arguements first, because PyROOT is going # to muck up the usage as soon as a root class is loaded. # parser = OptionParser() parser.add_option("-b", "--base", dest="baseline", help="Set the baseline geometry [required]", metavar="BASE", default="NONE") parser.add_option("-g", "--geom", dest="geometry", help="Set the comparison geometry [required]", metavar="GEOM", default="NONE") parser.add_option( "--basename", dest="basename", help= "Set the name of the baseline geometry if different from base [optional]", metavar="BASENAME", default="same") parser.add_option( "--geomname", dest="geomname", help= "Set the name of the comparsion geometry if different from geom [optional]", metavar="GEOMNAME", default="same") parser.add_option("-v", "--volume", dest="volume", help="Set the top level volume [required]", metavar="VOLUME", default="CAVE") parser.add_option("--basepath", dest="basepath", default="NONE") parser.add_option("--geompath", dest="geompath", default="NONE") parser.add_option("--stat", dest="stat", default="radlen", help="Statistic to display") parser.add_option( "--thumbnail", dest="thumbnail", default=False, action="store_true", help="Creates thumbnails of the front page of the PDF file.") parser.add_option("--size", dest="size", default=False, help="Sets the size of the thumbnail, e.g. 850x1100") (opts, args) = parser.parse_args() if (opts.baseline == "NONE"): print "" print "Must specify a baseline geometry." print "" os.system("./differential.py --help") return if (opts.geometry == "NONE"): print "" print "Must specify a comparison geometry." 
print "" os.system("./differential.py --help") return from Differential import Differential, _file_path from Differential import get_geom_file from Canvas import CanvasPDF from ROOT import TFile from ROOT import TGeoManager from ROOT import TGeoVolume from ROOT import TGeoNode from ROOT import kWhite from ROOT import gStyle gStyle.SetHistMinimumZero() gStyle.SetCanvasColor(kWhite) # Setup temporary symbolic links to the root files if (opts.basepath != "NONE"): os.system("ln -s " + opts.basepath + "/" + opts.baseline + ".root .") if (opts.geompath != "NONE"): os.system("ln -s " + opts.geompath + "/" + opts.geometry + ".root .") canvas = CanvasPDF(name="differential-" + opts.baseline + "-vs-" + opts.geometry + "-" + opts.volume, title="Geometry differential for volume=" + opts.volume + " " + opts.baseline + " vs " + opts.geometry, nx=1, ny=1, thumbnail=opts.thumbnail) differ = Differential(base=opts.baseline, comp=opts.geometry, top=opts.volume, basegeo=opts.basename, compgeo=opts.geomname, canvas=canvas, stat=opts.stat) # Remove temporary symbolic links to the root files if (opts.basepath != "NONE"): os.system("rm " + opts.basepath + "/" + opts.baseline + ".root") if (opts.geompath != "NONE"): os.system("rm " + opts.geompath + "/" + opts.geometry + ".root")
class Kd_cell(Kd_pure):
    """ Kd tree based on syntatic data generation and a grid structure.
    See Y. Xiao, L. Xiong, and C. Yuan, Differentially private data release
    through multidimensional partitioning, in SDM Workshop, VLDB, 2010
    """

    def __init__(self, data, param):
        # data: raw points, laid out (ndim, npoints) -- see populate_synthetic_tree
        self.param = param
        self.differ = Differential(self.param.Seed)
        self.mapp = None  # 2D grid of (noisy) cell counts, filled by synthetic_gen
        self.root = KNode()
        self.realData = data
        self.root.n_box = None
        self.root.n_budget = Params.maxHeight

    def getCountBudget(self):
        """Return the per-level noisy-count budget (half of Eps is reserved here;
        the other half perturbs the synthetic grid in synthetic_gen)."""
        count_eps = self.param.Eps * 0.5
        H = Params.maxHeight
        if self.param.geoBudget == 'none':
            # uniform split across the H+1 levels
            return [count_eps / (H + 1) for _ in range(H + 1)]
        elif self.param.geoBudget == 'aggressive':
            # geometric with ratio 2
            unit = count_eps / (2 ** (H + 1) - 1)
            return [unit * 2 ** i for i in range(H + 1)]
        elif self.param.geoBudget == 'quadratic':
            # geometric with ratio sqrt(2)
            unit = count_eps * (np.sqrt(2) - 1) / (2 ** (0.5 * (H + 1)) - 1)
            return [unit * 2 ** (0.5 * i) for i in range(H + 1)]
        elif self.param.geoBudget == 'optimal':
            # geometric with ratio 2^(1/3)
            unit = count_eps * ((2 ** (1.0 / 3)) - 1) / (2 ** ((1.0 / 3) * (H + 1)) - 1)
            return [unit * 2 ** ((1.0 / 3) * i) for i in range(H + 1)]
        elif self.param.geoBudget == 'quartic':
            # geometric with ratio 2^(1/4)
            unit = count_eps * ((2 ** (1.0 / 4)) - 1) / (2 ** ((1.0 / 4) * (H + 1)) - 1)
            return [unit * 2 ** ((1.0 / 4) * i) for i in range(H + 1)]
        else:
            logging.error('No such geoBudget scheme')
            sys.exit(1)

    def synthetic_gen(self):
        """Apply a grid structure on the domain and perturb the count
        using half of the available privacy budget """
        logging.debug('generating synthetic map...')
        data = self.realData
        unit = Params.unitGrid
        # snap the domain outward to whole grid units
        x_min = np.floor(Params.LOW[0] / unit) * unit
        x_max = np.ceil(Params.HIGH[0] / unit) * unit
        y_min = np.floor(Params.LOW[1] / unit) * unit
        y_max = np.ceil(Params.HIGH[1] / unit) * unit
        x_CELL = int(np.rint((x_max - x_min) / unit))
        y_CELL = int(np.rint((y_max - y_min) / unit))
        self.root.n_box = np.array([[x_min, y_min], [x_max, y_max]])
        self.mapp = np.zeros((x_CELL, y_CELL)) - 1  # ## initialize every cell with -1
        for i in range(Params.NDATA):  # ## populate the map
            point = data[:, i]
            cell_x = int(np.floor((point[0] - x_min) / unit))
            cell_y = int(np.floor((point[1] - y_min) / unit))
            if self.mapp[cell_x, cell_y] != -1:
                self.mapp[cell_x, cell_y] += 1
            else:
                # first point in this cell: -1 sentinel becomes a count of 1
                self.mapp[cell_x, cell_y] = 1
        for i in range(x_CELL):  # ## perturb the counts
            for j in range(y_CELL):
                if self.mapp[i, j] != -1:
                    self.mapp[i, j] += np.rint(self.differ.getNoise(1, 0.5 * self.param.Eps))
                else:
                    # empty cell: its noisy count is pure noise
                    self.mapp[i, j] = np.rint(self.differ.getNoise(1, 0.5 * self.param.Eps))
                # if noisy count is negative, ignore the noise and generate no points
                if self.mapp[i, j] < 0:
                    self.mapp[i, j] = 0

    def cell_setLeaf(self, curr):
        """ Throw away the counts based on the syntatic data """
        curr.n_count = 0
        return

    def testLeaf(self, curr):
        """Stop splitting when the cell is small, deep enough, or uniform."""
        if (curr.n_count <= self.param.minPartSize) or (curr.n_depth == Params.maxHeight) or (
                self.uniform_test(curr, self.param.cellDistance)):
            return True
        return False

    def uniform_test(self, curr, distance):
        """ One of the stopping conditions: cell is uniform
        according to some threshold 'distance') """
        unit = Params.unitGrid
        # convert the node's box into grid-cell indices relative to the root
        x_min = int(np.rint((curr.n_box[0, 0] - self.root.n_box[0, 0]) / unit))
        x_max = int(np.rint((curr.n_box[1, 0] - self.root.n_box[0, 0]) / unit))
        y_min = int(np.rint((curr.n_box[0, 1] - self.root.n_box[0, 1]) / unit))
        y_max = int(np.rint((curr.n_box[1, 1] - self.root.n_box[0, 1]) / unit))
        data = self.mapp[x_min:x_max, y_min:y_max]
        total = np.sum(data)
        avg = total / ((x_max - x_min) * (y_max - y_min))
        # L1 deviation of cell counts from the mean
        dist = np.sum(np.abs(data - avg))
        if dist > distance:
            return False
        else:
            return True

    def buildIndex(self):
        """Build the kd-tree over the noisy synthetic grid (breadth-first)."""
        stack = deque()
        stack.append(self.root)
        nleaf = 0  # leaf counter
        max_depth = -1
        self.root.n_count = np.sum(self.mapp)
        while len(stack) > 0:
            curr = stack.popleft()
            if curr.n_depth > max_depth:
                max_depth = curr.n_depth
            if self.testLeaf(curr) is True:  # curr is a leaf node
                nleaf += 1
                curr.n_isLeaf = True
                self.cell_setLeaf(curr)
            else:  # curr needs to split
                curr.n_budget -= 1
                tmp = self.getCoordinates(curr)
                nw_node, ne_node, sw_node, se_node = KNode(), KNode(), KNode(), KNode()  # create sub-nodes
                nw_coord, ne_coord, count_tmp = tmp
                x_nw, y_nw = nw_coord
                x_se, y_se = ne_coord
                # quadrant boxes; the split lines come from getCoordinates
                nw_node.n_box = np.array([[curr.n_box[0, 0], y_nw], [x_nw, curr.n_box[1, 1]]])
                ne_node.n_box = np.array([[x_nw, y_se], [curr.n_box[1, 0], curr.n_box[1, 1]]])
                sw_node.n_box = np.array([[curr.n_box[0, 0], curr.n_box[0, 1]], [x_se, y_nw]])
                se_node.n_box = np.array([[x_se, curr.n_box[0, 1]], [curr.n_box[1, 0], y_se]])
                c_t = 0
                for sub_node in [nw_node, ne_node, sw_node, se_node]:
                    sub_node.n_depth = curr.n_depth + 1
                    sub_node.n_count = count_tmp[c_t]
                    sub_node.n_budget = curr.n_budget
                    stack.append(sub_node)
                    c_t += 1
                curr.nw, curr.ne, curr.sw, curr.se = nw_node, ne_node, sw_node, se_node
        # end of while
        logging.debug("number of leaves: %d" % nleaf)
        logging.debug("max depth: %d" % max_depth)

    def getCoordinates(self, curr):
        """Find median-style split lines on the noisy grid.

        Returns two split points and the four quadrant counts (nw, ne, sw, se).
        Relies on Python's for-loop variables (i, j, k) remaining bound after
        the loop ends.
        """
        dim_1 = curr.n_depth % Params.NDIM  # primary split dimension
        UNIT = Params.unitGrid
        x_min = int(np.rint((curr.n_box[0, 0] - self.root.n_box[0, 0]) / UNIT))
        x_max = int(np.rint((curr.n_box[1, 0] - self.root.n_box[0, 0]) / UNIT))
        y_min = int(np.rint((curr.n_box[0, 1] - self.root.n_box[0, 1]) / UNIT))
        y_max = int(np.rint((curr.n_box[1, 1] - self.root.n_box[0, 1]) / UNIT))
        total = np.sum(self.mapp[x_min:x_max, y_min:y_max])
        if dim_1 == 0:
            # primary split on x: walk columns until half the mass is covered
            for i in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_max]) >= total / 2:
                    break
            split_prm = (x_min + i + 1) * UNIT + self.root.n_box[0, 0]
            half_1 = np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_max])
            half_2 = np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_max])
            # secondary split (on y) of the left half
            for j in range(y_max - y_min):
                if np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_min + j + 1]) >= half_1 / 2:
                    break
            split_sec1 = self.root.n_box[0, 1] + (y_min + j + 1) * UNIT
            n_sw = np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_min + j + 1])
            n_nw = np.sum(self.mapp[x_min:x_min + i + 1, y_min + j + 1:y_max])
            # secondary split (on y) of the right half
            for k in range(y_max - y_min):
                if np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_min + k + 1]) >= half_2 / 2:
                    break
            split_sec2 = self.root.n_box[0, 1] + (y_min + k + 1) * UNIT
            n_se = np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_min + k + 1])
            n_ne = np.sum(self.mapp[x_min + i + 1:x_max, y_min + k + 1:y_max])
            return (split_prm, split_sec1), (split_prm, split_sec2), (n_nw, n_ne, n_sw, n_se)
        else:
            # primary split on y: walk rows until half the mass is covered
            for i in range(y_max - y_min):
                if np.sum(self.mapp[x_min:x_max, y_min:y_min + i + 1]) >= total / 2:
                    break
            split_prm = self.root.n_box[0, 1] + (y_min + i + 1) * UNIT
            half_1 = np.sum(self.mapp[x_min:x_max, y_min:y_min + i + 1])
            half_2 = np.sum(self.mapp[x_min:x_max, y_min + i + 1:y_max])
            # secondary split (on x) of the bottom half
            for j in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + j + 1, y_min:y_min + i + 1]) >= half_1 / 2:
                    break
            split_sec1 = (x_min + j + 1) * UNIT + self.root.n_box[0, 0]
            n_sw = np.sum(self.mapp[x_min:x_min + j + 1, y_min:y_min + i + 1])
            n_se = np.sum(self.mapp[x_min + j + 1:x_max, y_min:y_min + i + 1])
            # secondary split (on x) of the top half
            for k in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + k + 1, y_min + i + 1:y_max]) >= half_2 / 2:
                    break
            split_sec2 = (x_min + k + 1) * UNIT + self.root.n_box[0, 0]
            n_nw = np.sum(self.mapp[x_min:x_min + k + 1, y_min + i + 1:y_max])
            n_ne = np.sum(self.mapp[x_min + k + 1:x_max, y_min + i + 1:y_max])
            return (split_sec2, split_prm), (split_sec1, split_prm), (n_nw, n_ne, n_sw, n_se)

    def populate_synthetic_tree(self):
        """ Populate real data to the synthetic tree """
        logging.debug('populating synthetic tree...')
        a_data = self.realData
        ndata = a_data.shape[1]
        for i in range(ndata):
            ptx = a_data[0, i]
            pty = a_data[1, i]
            leaf = self.root.find_subnode(ptx, pty)
            leaf.n_count += 1

        # traverse the tree and update leaf counts with the remaining budget half
        stack = deque()
        stack.append(self.root)
        while len(stack) > 0:
            cur_node = stack.popleft()
            if cur_node.n_isLeaf is True:  # leaf
                cur_node.n_count += self.differ.getNoise(1, 0.5 * self.param.Eps)
            else:
                stack.append(cur_node.nw)
                stack.append(cur_node.ne)
                stack.append(cur_node.sw)
                stack.append(cur_node.se)
def test_privateMedian():
    """Benchmark six private-median mechanisms (error and wall time).

    Relies on module-level config: res_dir, dataGen, dist, NDIM, LO, HI,
    NDATA, seed_list, eps, srt, unit -- presumably defined at file top
    (not visible in this chunk).  Writes one line per tree level i to
    'privateMedian' (relative rank error) and 'privateMedian-time'
    (cumulative seconds), six columns each.
    """
    f = open(res_dir + 'privateMedian', 'w')
    f_t = open(res_dir + 'privateMedian-time', 'w')
    data = np.sort(dataGen.data_gen(dist, NDIM, LO, HI, NDATA)).flatten()
    n = len(data)
    for i in range(10):
        print 'level ' + `i`
        container = np.zeros(6)    # accumulated relative error per mechanism
        container_t = np.zeros(6)  # accumulated wall time per mechanism
        for seed in seed_list:
            perturber = Differential(seed)
            for j in range(2 ** i):
                # j-th of 2^i equal chunks (Python 2 integer division)
                c_data = data[n * j / 2 ** i:n * (j + 1) / 2 ** i]
                c_len = len(c_data)
                # exponential mechanism
                start = time.clock()
                em = perturber.getSplit_exp(c_data, c_data[0], c_data[-1], eps, 1)
                end = time.clock()
                container_t[0] += end - start
                # smooth sensitivity (2-approx)
                start = time.clock()
                ls = perturber.getSplit_smooth(c_data, c_data[0], c_data[-1], eps, 1)
                end = time.clock()
                container_t[1] += end - start
                # exponential mechanism sampling
                start = time.clock()
                em_samp = perturber.getSplit_exp(c_data, c_data[0], c_data[-1], eps, srt)
                end = time.clock()
                container_t[2] += end - start
                # smooth sensitivity sampling
                start = time.clock()
                ls_samp = perturber.getSplit_smooth(c_data, c_data[0], c_data[-1], eps, srt)
                end = time.clock()
                container_t[3] += end - start
                # noisy mean approximation
                start = time.clock()
                nm = perturber.getSplit_noisyMean(c_data, c_data[0], c_data[-1], eps)
                end = time.clock()
                container_t[4] += end - start
                # noisy grid approximation
                start = time.clock()
                ng = perturber.getSplit_grid(c_data, c_data[0], c_data[-1], eps, unit)
                end = time.clock()
                container_t[5] += end - start
                res = [em, ls, em_samp, ls_samp, nm, ng]
                for k in range(6):
                    if res[k] >= c_data[-1] or res[k] <= c_data[0]:
                        # split fell outside the chunk range: maximal error of 1
                        container[k] += 1.0
                    else:
                        # relative distance of the returned rank from the true median rank
                        r_k = np.searchsorted(c_data, res[k])
                        r_m = float(c_len) / 2
                        container[k] += abs(r_m - r_k) / r_m
            # end of j loop
        for k in range(6):
            f.write(`container[k] / (2 ** i * len(seed_list))` + ' ')
        f.write('\n')
        for k in range(6):
            f_t.write(`container_t[k] / (2 ** i * len(seed_list))` + ' ')
        f_t.write('\n')
    # end of i
    f.close()
    f_t.close()
class Hilbert(Kd_standard):
    """ Hilbert R-tree: points are mapped to 1-D Hilbert values, sorted,
    and split recursively; leaf bounding boxes are recovered by decoding. """

    def __init__(self, data, param):
        self.param = param
        self.differ = Differential(self.param.Seed)
        self.root = KNode()
        self.realData = data  # points laid out (ndim, npoints)
        self.root.n_budget = Params.maxHeight

    def h_encode(self, x, y, r):
        """ (x,y) -> value h in Hilbert space, r is the resolution of the Hilbert curve """
        mask = (1 << r) - 1
        heven = x ^ y
        notx = ~x & mask
        noty = ~y & mask
        temp = notx ^ y
        v0, v1 = 0, 0
        # iterative bit-twiddling derivation of the odd/even bit planes
        for k in range(r - 1):
            v1 = ((v1 & heven) | ((v0 ^ noty) & temp)) >> 1
            v0 = ((v0 & (v1 ^ notx)) | (~v0 & (v1 ^ noty))) >> 1
        hodd = (~v0 & (v1 ^ x)) | (v0 & (v1 ^ noty))
        return self.interleaveBits(hodd, heven)

    def h_decode(self, h, r):
        """ h -> (x,y), inverse of h_encode """
        heven, hodd = self.deleaveBits(h)
        mask = (1 << r) - 1
        v0, v1 = 0, 0
        temp1 = ~(heven | hodd) & mask
        temp0 = ~(heven ^ hodd) & mask
        for k in range(r - 1):
            v1 = (v1 ^ temp1) >> 1
            v0 = (v0 ^ temp0) >> 1
        return (v0 & ~heven) ^ v1 ^ hodd, (v0 | heven) ^ v1 ^ hodd

    def interleaveBits(self, hodd, heven):
        """Interleave two bit strings: hodd supplies odd bits, heven even bits."""
        val = 0
        maxx = max(hodd, heven)
        n = 0
        while maxx > 0:  # number of significant bits
            n += 1
            maxx >>= 1
        for i in range(n):
            bitMask = 1 << i
            a = 1 << (2 * i) if (heven & bitMask) else 0
            b = 1 << (2 * i + 1) if (hodd & bitMask) else 0
            val += a + b
        return val

    def deleaveBitsOdd(self, x):
        """Compact the even-position bits of x into a contiguous value
        (classic morton-decode bit gather, 64-bit masks)."""
        x &= 0x5555555555555555
        x = (x | (x >> 1)) & 0x3333333333333333
        x = (x | (x >> 2)) & 0x0F0F0F0F0F0F0F0F
        x = (x | (x >> 4)) & 0x00FF00FF00FF00FF
        x = (x | (x >> 8)) & 0x0000FFFF0000FFFF
        x = (x | (x >> 16)) & 0x00000000FFFFFFFF
        return x

    def deleaveBits(self, x):
        """Split x into its even-position and odd-position bit planes."""
        return self.deleaveBitsOdd(x), self.deleaveBitsOdd(x >> 1)

    def get_Hcoord(self, x, y, R):
        """Map real coordinates to integer Hilbert-grid coordinates at order R.
        The 1e-8 fudge keeps the domain maximum inside the last cell."""
        hx = int((x - Params.LOW[0]) / (Params.HIGH[0] - Params.LOW[0] + 10 ** (-8)) * (2 ** R))
        hy = int((y - Params.LOW[1]) / (Params.HIGH[1] - Params.LOW[1] + 10 ** (-8)) * (2 ** R))
        return hx, hy

    def get_Rcoord(self, hx, hy, R):
        """Map integer Hilbert-grid coordinates back to real coordinates
        (lower-left corner of the grid cell)."""
        x = float(hx) / (2 ** R) * (Params.HIGH[0] - Params.LOW[0]) + Params.LOW[0]
        y = float(hy) / (2 ** R) * (Params.HIGH[1] - Params.LOW[1]) + Params.LOW[1]
        return x, y

    def getCount(self, curr, epsilon):
        """Return the true count (when epsilon is effectively zero) or the
        Laplace-noisy count of the node's 1-D Hilbert data."""
        count = len(curr.n_data)
        if epsilon < 10 ** (-6):
            return count
        else:
            return count + self.differ.getNoise(1, epsilon)

    def testLeaf(self, curr):
        """ test whether a node should be a leaf node """
        if (curr.n_depth == Params.maxHeight) or \
                (curr.n_budget <= 0) or \
                (curr.n_count <= self.param.minPartSize):
            return True
        return False

    def buildIndex(self):
        """Encode points to Hilbert values, build the 4-way split tree over
        the sorted 1-D values, then decode leaves to recover 2-D boxes."""
        budget_c = self.getCountBudget()
        logging.debug('encoding coordinates...')
        RES = self.param.Res  # order of Hilbert curve
        ndata = self.realData.shape[1]
        hidx = np.zeros(ndata)
        for i in range(ndata):
            hx, hy = self.get_Hcoord(self.realData[0, i], self.realData[1, i], RES)
            hidx[i] = self.h_encode(hx, hy, RES)
        hidx = np.sort(hidx)
        logging.debug('building index...')
        self.root.n_data = hidx
        self.root.n_box = (0, 2 ** (2 * RES) - 1)  # 1-D Hilbert-range box
        self.root.n_count = self.getCount(self.root, budget_c[0])
        stack = deque()
        stack.append(self.root)
        tree = [self.root]
        leaf_li = []  # storage of all leaves
        nleaf = 0  # leaf counter
        max_depth = -1
        while len(stack) > 0:
            curr = stack.popleft()
            if curr.n_depth > max_depth:
                max_depth = curr.n_depth
            if self.testLeaf(curr) is True:  # curr is a leaf node
                if curr.n_depth < Params.maxHeight:
                    # early leaf: recount with all the unused deeper budget
                    remainingEps = sum(budget_c[curr.n_depth + 1:])
                    curr.n_count = self.getCount(curr, remainingEps)
                nleaf += 1
                curr.n_isLeaf = True
                leaf_li.append(curr)
            else:  # curr needs to split
                curr.n_budget -= 1
                tmp = self.getCoordinates(curr)
                if tmp is False:  # if split fails, retry with one less budget
                    stack.append(curr)
                    continue
                nw_node, ne_node, sw_node, se_node = KNode(), KNode(), KNode(), KNode()  # create sub-nodes
                split_prm, split_sec1, split_sec2, nw_node.n_data, ne_node.n_data, sw_node.n_data, se_node.n_data = tmp
                # children partition the parent's 1-D Hilbert range
                nw_node.n_box = (curr.n_box[0], split_sec1)
                ne_node.n_box = (split_sec1, split_prm)
                sw_node.n_box = (split_prm, split_sec2)
                se_node.n_box = (split_sec2, curr.n_box[1])
                for sub_node in [nw_node, ne_node, sw_node, se_node]:
                    sub_node.n_depth = curr.n_depth + 1
                    sub_node.n_count = self.getCount(sub_node, budget_c[sub_node.n_depth])
                    sub_node.n_budget = curr.n_budget
                    stack.append(sub_node)
                    tree.append(sub_node)
                curr.n_data = None
                curr.nw, curr.ne, curr.sw, curr.se = nw_node, ne_node, sw_node, se_node
        # end of while
        logging.debug("number of leaves: %d" % nleaf)
        logging.debug("max depth: %d" % max_depth)

        # # convert hilbert values in leaf nodes to real coordinates and update bounding box
        logging.debug('decoding and updating bounding box...')
        for leaf in leaf_li:
            # NOTE(review): +/-1000 initial bounds assume the data domain fits
            # within [-1000, 1000] in both dimensions -- confirm against Params.
            bbox = np.array([[1000.0, 1000.0], [-1000.0, -1000.0]], dtype='float64')
            for hvalue in leaf.n_data:
                hx, hy = self.h_decode(int(hvalue), RES)
                x, y = self.get_Rcoord(hx, hy, RES)
                bbox[0, 0] = x if x < bbox[0, 0] else bbox[0, 0]
                bbox[1, 0] = x if x > bbox[1, 0] else bbox[1, 0]
                bbox[0, 1] = y if y < bbox[0, 1] else bbox[0, 1]
                bbox[1, 1] = y if y > bbox[1, 1] else bbox[1, 1]
            leaf.n_box = bbox

        # # update bounding box bottom-up (deepest nodes first)
        tree = sorted(tree, cmp=self.cmp_node)
        logging.debug('updating box for each node in the tree...')
        for node in tree:
            if node.n_data is None:  # internal node: union of the four child boxes
                node.n_box = np.zeros((2, 2))
                node.n_box[0, 0] = min(node.ne.n_box[0, 0], node.nw.n_box[0, 0], node.se.n_box[0, 0],
                                       node.sw.n_box[0, 0])
                node.n_box[0, 1] = min(node.ne.n_box[0, 1], node.nw.n_box[0, 1], node.se.n_box[0, 1],
                                       node.sw.n_box[0, 1])
                node.n_box[1, 0] = max(node.ne.n_box[1, 0], node.nw.n_box[1, 0], node.se.n_box[1, 0],
                                       node.sw.n_box[1, 0])
                node.n_box[1, 1] = max(node.ne.n_box[1, 1], node.nw.n_box[1, 1], node.se.n_box[1, 1],
                                       node.sw.n_box[1, 1])

    def cmp_node(self, node1, node2):
        # reverse order: deeper nodes sort first
        return int(node2.n_depth - node1.n_depth)

    def getCoordinates(self, curr):
        """Split the node's sorted 1-D Hilbert data into four ranges via a
        private primary split and two private secondary splits.

        Returns False when any split lands on a range boundary (empty part).
        """
        budget_s = self.getSplitBudget()
        _data = curr.n_data
        _ndata = len(_data)
        split_1 = self.getSplit(_data, curr.n_box[0], curr.n_box[1], budget_s[curr.n_depth] / 2)
        pos_1 = np.searchsorted(_data, split_1)
        if pos_1 == 0 or pos_1 == _ndata:
            return False
        data_1 = _data[:pos_1]
        data_2 = _data[pos_1:]
        split_sec1 = self.getSplit(data_1, curr.n_box[0], split_1, budget_s[curr.n_depth] / 2)
        split_sec2 = self.getSplit(data_2, split_1, curr.n_box[1], budget_s[curr.n_depth] / 2)
        pos_sec1 = np.searchsorted(data_1, split_sec1)
        pos_sec2 = np.searchsorted(data_2, split_sec2)
        if pos_sec1 == 0 or pos_sec1 == len(data_1) or pos_sec2 == 0 or pos_sec2 == len(data_2):
            return False
        nw_data, ne_data, sw_data, se_data = data_1[:pos_sec1], data_1[pos_sec1:], data_2[:pos_sec2], data_2[pos_sec2:]
        return split_1, split_sec1, split_sec2, nw_data, ne_data, sw_data, se_data
class GenericT(object):
    """ Generic data structure, used for grid """

    def __init__(self, data, param):
        self.param = param
        self.differ = Differential(self.param.Seed)
        # initialize the root
        self.root = NodeT()
        # self.children = []  # all level 2 grids
        self.root.n_data = data  # points laid out (ndim, npoints)
        self.root.n_box = np.array([param.LOW, param.HIGH])

    def getEqualSplit(self, partitions, min, max):
        """return equal split points, including both ends"""
        if min > max:
            logging.debug("getEqualSplit: Error: min > max")
        if partitions <= 1:
            return [min, max]
        return [min + (max - min) * i / partitions for i in range(partitions + 1)]

    def getCountBudget(self):
        """return noisy count budget for different levels of the indices"""
        raise NotImplementedError

    def getCoordinates(self, curr):
        """return the split dimension, the split points and
        the data points in each subnodes"""
        raise NotImplementedError

    def getCount(self, curr, epsilon):
        """ return true count or noisy count of a node, depending on epsilon.
        Note that the noisy count can be negative """
        if curr.n_data is None:
            count = 0
        else:
            count = curr.n_data.shape[1]
        if epsilon < 10 ** (-8):
            return count
        else:
            return count + self.differ.getNoise(1, epsilon)

    def intersect(self, hrect, query):
        """ checks if the hyper-rectangle intersects with the
        hyper-rectangle defined by the query in every dimension """
        # disjoint when the query starts past hrect's end or ends before its start
        bool_m1 = query[0, :] >= hrect[1, :]
        bool_m2 = query[1, :] <= hrect[0, :]
        bool_m = np.logical_or(bool_m1, bool_m2)
        if np.any(bool_m):
            return False
        else:
            return True

    def testLeaf(self, curr):
        """ test whether a node should be a leaf node """
        if (curr.n_depth == Params.maxHeightAdaptiveGrid) or \
                (curr.n_data is None or curr.n_data.shape[1] == 0) or \
                (curr.n_count <= self.param.minPartSize):
            return True
        return False

    def buildIndex(self):
        """build the grid structure."""
        budget_c = self.getCountBudget()  # an array with two elements
        # print budget_c
        self.root.n_count = self.getCount(self.root, 0)  # add noisy count to the root
        queue = deque()
        queue.append(self.root)
        # ## main loop
        while len(queue) > 0:
            curr = queue.popleft()
            # record the actual (non-noisy) count for this time instance
            if curr.n_data is None:
                curr.a_count.append(0)
            else:
                curr.a_count.append(curr.n_data.shape[1])
            if self.testLeaf(curr) is True:  # if curr is a leaf node
                remainingEps = sum(budget_c[curr.n_depth:])
                curr.n_count, curr.eps, curr.n_isLeaf = self.getCount(curr, remainingEps), remainingEps, True
                curr.l_count.append(curr.n_count)
            else:  # curr needs to split --> find splitting granularity
                gran, split_arr_x, split_arr_y, n_data_matrix = self.getCoordinates(curr)
                if gran == 1:
                    # granularity 1 means no real split: treat as leaf
                    remainingEps = sum(budget_c[curr.n_depth:])
                    curr.n_count, curr.eps, curr.n_isLeaf = self.getCount(curr, remainingEps), remainingEps, True
                    curr.children = None
                    curr.l_count.append(curr.n_count)
                    continue  # if the first level cell is leaf node
                # add all gran x gran sub-cells to the queue
                for x in range(gran):
                    for y in range(gran):
                        node = NodeT()
                        node.n_box = np.array([[split_arr_x[x], split_arr_y[y]],
                                               [split_arr_x[x + 1], split_arr_y[y + 1]]])
                        node.index, node.parent, node.n_depth = x * gran + y, curr, curr.n_depth + 1
                        if n_data_matrix[x][y] is None:
                            node.n_data = None
                        else:
                            node.n_data = np.transpose(n_data_matrix[x][y])
                        node.n_count = self.getCount(node, budget_c[node.n_depth])
                        node.eps = budget_c[node.n_depth]
                        if node.n_depth == 2:  # second-level cells are always leaves
                            node.n_isLeaf = True
                        if curr.children is None:
                            curr.children = np.ndarray(shape=(gran, gran), dtype=NodeT)
                        curr.children[x][y] = node
                        queue.append(node)
                curr.n_data = None  # ## do not need the data points coordinates now
        # end of while

    # canonical range query does apply
    def rangeCount(self, query):
        """ Query answering function. Find the number of data points
        within a query rectangle.
        This function assumes that the tree is constructed with noisy
        count for every node """
        queue = deque()
        queue.append(self.root)
        count = 0.0
        while len(queue) > 0:
            curr = queue.popleft()
            _box = curr.n_box
            if curr.n_isLeaf is True:
                # Leaf: contribute the overlap fraction of the noisy count
                frac = 1
                if self.intersect(_box, query):
                    for i in range(_box.shape[1]):
                        if _box[1, i] == _box[0, i] or Params.WorstCase == True:
                            frac *= 1
                        else:
                            frac *= (min(query[1, i], _box[1, i]) - max(query[0, i], _box[0, i])) / (
                                _box[1, i] - _box[0, i])
                    count += curr.n_count * frac
            else:  # if not leaf
                for (_, _), node in np.ndenumerate(curr.children):
                    # NOTE(review): containment is tested against the PARENT box
                    # (_box), not node.n_box -- looks like a bug; confirm.
                    bool_matrix = np.zeros((2, query.shape[1]))
                    bool_matrix[0, :] = query[0, :] <= _box[0, :]
                    bool_matrix[1, :] = query[1, :] >= _box[1, :]
                    if np.all(bool_matrix):  # if query range contains node range
                        count += node.n_count
                    elif self.intersect(_box, query):
                        queue.append(node)
        return float(count)

    def leafCover(self, loc):
        """ find a leaf node that cover the location """
        gran_1st = len(self.root.children)
        # NOTE(review): x1/y1 are floats (no int() cast); relies on legacy
        # numpy accepting float indices -- verify on the deployed numpy version.
        x1 = min(gran_1st - 1,
                 (loc[0] - self.root.n_box[0, 0]) * gran_1st / (self.root.n_box[1, 0] - self.root.n_box[0, 0]))
        y1 = min(gran_1st - 1,
                 (loc[1] - self.root.n_box[0, 1]) * gran_1st / (self.root.n_box[1, 1] - self.root.n_box[0, 1]))
        node_1st = self.root.children[x1][y1]
        """ Note that there are cases when the actual count of first level cell is zero
        but the noisy count is > 0, thus the cell may be splited into a number of empty cells """
        if node_1st.n_isLeaf or node_1st.children is None:
            return node_1st
        else:
            gran_2st = len(node_1st.children)
            x2 = min(gran_2st - 1,
                     (loc[0] - node_1st.n_box[0, 0]) * gran_2st / (node_1st.n_box[1, 0] - node_1st.n_box[0, 0]))
            y2 = min(gran_2st - 1,
                     (loc[1] - node_1st.n_box[0, 1]) * gran_2st / (node_1st.n_box[1, 1] - node_1st.n_box[0, 1]))
            return node_1st.children[x2][y2]

    def checkCorrectness(self, node, nodePoints=None):
        """ Total number of data points of all leaf nodes should equal
        to the total data points.  Only check the FIRST time instance. """
        totalPoints = 0
        if node is None:
            return 0
        if (node.n_isLeaf and node.n_data is not None) or node.children is None:
            return node.a_count[0]
        for (_, _), child in np.ndenumerate(node.children):
            totalPoints += self.checkCorrectness(child)
        if nodePoints is None:
            return totalPoints
        if totalPoints == nodePoints:
            return True
        return False
def test_privateMedian():
    """Benchmark six private-median mechanisms (error and wall time).

    Duplicate of the earlier harness in this file.  Relies on module-level
    config: res_dir, dataGen, dist, NDIM, LO, HI, NDATA, seed_list, eps,
    srt, unit -- presumably defined at file top (not visible here).
    Writes one line per level i to 'privateMedian' (relative rank error)
    and 'privateMedian-time' (cumulative seconds), six columns each.
    """
    f = open(res_dir + 'privateMedian', 'w')
    f_t = open(res_dir + 'privateMedian-time', 'w')
    data = np.sort(dataGen.data_gen(dist, NDIM, LO, HI, NDATA)).flatten()
    n = len(data)
    for i in range(10):
        print 'level ' + `i`
        container = np.zeros(6)    # accumulated relative error per mechanism
        container_t = np.zeros(6)  # accumulated wall time per mechanism
        for seed in seed_list:
            perturber = Differential(seed)
            for j in range(2 ** i):
                # j-th of 2^i equal chunks (Python 2 integer division)
                c_data = data[n * j / 2 ** i:n * (j + 1) / 2 ** i]
                c_len = len(c_data)
                # exponential mechanism
                start = time.clock()
                em = perturber.getSplit_exp(c_data, c_data[0], c_data[-1], eps, 1)
                end = time.clock()
                container_t[0] += end - start
                # smooth sensitivity (2-approx)
                start = time.clock()
                ls = perturber.getSplit_smooth(c_data, c_data[0], c_data[-1], eps, 1)
                end = time.clock()
                container_t[1] += end - start
                # exponential mechanism sampling
                start = time.clock()
                em_samp = perturber.getSplit_exp(c_data, c_data[0], c_data[-1], eps, srt)
                end = time.clock()
                container_t[2] += end - start
                # smooth sensitivity sampling
                start = time.clock()
                ls_samp = perturber.getSplit_smooth(c_data, c_data[0], c_data[-1], eps, srt)
                end = time.clock()
                container_t[3] += end - start
                # noisy mean approximation
                start = time.clock()
                nm = perturber.getSplit_noisyMean(c_data, c_data[0], c_data[-1], eps)
                end = time.clock()
                container_t[4] += end - start
                # noisy grid approximation
                start = time.clock()
                ng = perturber.getSplit_grid(c_data, c_data[0], c_data[-1], eps, unit)
                end = time.clock()
                container_t[5] += end - start
                res = [em, ls, em_samp, ls_samp, nm, ng]
                for k in range(6):
                    if res[k] >= c_data[-1] or res[k] <= c_data[0]:
                        # split fell outside the chunk range: maximal error of 1
                        container[k] += 1.0
                    else:
                        # relative distance of the returned rank from the true median rank
                        r_k = np.searchsorted(c_data, res[k])
                        r_m = float(c_len) / 2
                        container[k] += abs(r_m - r_k) / r_m
            # end of j loop
        for k in range(6):
            f.write(`container[k] / (2 ** i * len(seed_list))` + ' ')
        f.write('\n')
        for k in range(6):
            f_t.write(`container_t[k] / (2 ** i * len(seed_list))` + ' ')
        f_t.write('\n')
    # end of i
    f.close()
    f_t.close()
# norm.stats(loc=mu, scale=sigma, moments="mv") mu, sigma = 4.26246819779, math.sqrt(3.68211892668) for i in range(200): print math.sqrt(random.gauss(mu, sigma)) # print norm.pdf(loc=mu, scale=sigma) # for rd in np.arange(1000, 3001, 50): # x2 = (rd/1000.0)**2 # print "%.1f\t%s" % (x2, norm.pdf(x2, loc=mu, scale=sigma)) p = Params(1000) p.select_dataset() dp = Differential(p.seed) # Randomly picking location in a small MBR of tdrive dataset. minLat, maxLat = 39.1232147, 40.7225952 minLon, maxLon = 115.3879166, 117.3795395 # diffLat = maxLat - minLat # diffLon = maxLon - minLon # # maxLat = maxLat - 0.95*diffLat # maxLon = maxLon - 0.95*diffLon samples = 200000 # sample size d2_list = [] for i in range(samples): # First point
class Kd_cell(Kd_pure):
    """ Kd tree based on syntatic data generation and a grid structure.
    (Duplicate of the earlier Kd_cell in this file.)
    See Y. Xiao, L. Xiong, and C. Yuan, Differentially private data release
    through multidimensional partitioning, in SDM Workshop, VLDB, 2010
    """

    def __init__(self, data, param):
        # data: raw points, laid out (ndim, npoints) -- see populate_synthetic_tree
        self.param = param
        self.differ = Differential(self.param.Seed)
        self.mapp = None  # 2D grid of (noisy) cell counts, filled by synthetic_gen
        self.root = KNode()
        self.realData = data
        self.root.n_box = None
        self.root.n_budget = Params.maxHeight

    def getCountBudget(self):
        """Return the per-level noisy-count budget (half of Eps; the other
        half perturbs the synthetic grid in synthetic_gen)."""
        count_eps = self.param.Eps * 0.5
        H = Params.maxHeight
        if self.param.geoBudget == 'none':
            # uniform split across the H+1 levels
            return [count_eps / (H + 1) for _ in range(H + 1)]
        elif self.param.geoBudget == 'aggressive':
            # geometric with ratio 2
            unit = count_eps / (2 ** (H + 1) - 1)
            return [unit * 2 ** i for i in range(H + 1)]
        elif self.param.geoBudget == 'quadratic':
            # geometric with ratio sqrt(2)
            unit = count_eps * (np.sqrt(2) - 1) / (2 ** (0.5 * (H + 1)) - 1)
            return [unit * 2 ** (0.5 * i) for i in range(H + 1)]
        elif self.param.geoBudget == 'optimal':
            # geometric with ratio 2^(1/3)
            unit = count_eps * ((2 ** (1.0 / 3)) - 1) / (2 ** ((1.0 / 3) * (H + 1)) - 1)
            return [unit * 2 ** ((1.0 / 3) * i) for i in range(H + 1)]
        elif self.param.geoBudget == 'quartic':
            # geometric with ratio 2^(1/4)
            unit = count_eps * ((2 ** (1.0 / 4)) - 1) / (2 ** ((1.0 / 4) * (H + 1)) - 1)
            return [unit * 2 ** ((1.0 / 4) * i) for i in range(H + 1)]
        else:
            logging.error('No such geoBudget scheme')
            sys.exit(1)

    def synthetic_gen(self):
        """Apply a grid structure on the domain and perturb the count
        using half of the available privacy budget """
        logging.debug('generating synthetic map...')
        data = self.realData
        unit = Params.unitGrid
        # snap the domain outward to whole grid units
        x_min = np.floor(Params.LOW[0] / unit) * unit
        x_max = np.ceil(Params.HIGH[0] / unit) * unit
        y_min = np.floor(Params.LOW[1] / unit) * unit
        y_max = np.ceil(Params.HIGH[1] / unit) * unit
        x_CELL = int(np.rint((x_max - x_min) / unit))
        y_CELL = int(np.rint((y_max - y_min) / unit))
        self.root.n_box = np.array([[x_min, y_min], [x_max, y_max]])
        self.mapp = np.zeros((x_CELL, y_CELL)) - 1  # ## initialize every cell with -1
        for i in range(Params.NDATA):  # ## populate the map
            point = data[:, i]
            cell_x = int(np.floor((point[0] - x_min) / unit))
            cell_y = int(np.floor((point[1] - y_min) / unit))
            if self.mapp[cell_x, cell_y] != -1:
                self.mapp[cell_x, cell_y] += 1
            else:
                # first point in this cell: -1 sentinel becomes a count of 1
                self.mapp[cell_x, cell_y] = 1
        for i in range(x_CELL):  # ## perturb the counts
            for j in range(y_CELL):
                if self.mapp[i, j] != -1:
                    self.mapp[i, j] += np.rint(self.differ.getNoise(1, 0.5 * self.param.Eps))
                else:
                    # empty cell: its noisy count is pure noise
                    self.mapp[i, j] = np.rint(self.differ.getNoise(1, 0.5 * self.param.Eps))
                # if noisy count is negative, ignore the noise and generate no points
                if self.mapp[i, j] < 0:
                    self.mapp[i, j] = 0

    def cell_setLeaf(self, curr):
        """ Throw away the counts based on the syntatic data """
        curr.n_count = 0
        return

    def testLeaf(self, curr):
        """Stop splitting when the cell is small, deep enough, or uniform."""
        if (curr.n_count <= self.param.minPartSize) or (
                curr.n_depth == Params.maxHeight) or (self.uniform_test(
                    curr, self.param.cellDistance)):
            return True
        return False

    def uniform_test(self, curr, distance):
        """ One of the stopping conditions: cell is uniform
        according to some threshold 'distance') """
        unit = Params.unitGrid
        # convert the node's box into grid-cell indices relative to the root
        x_min = int(np.rint((curr.n_box[0, 0] - self.root.n_box[0, 0]) / unit))
        x_max = int(np.rint((curr.n_box[1, 0] - self.root.n_box[0, 0]) / unit))
        y_min = int(np.rint((curr.n_box[0, 1] - self.root.n_box[0, 1]) / unit))
        y_max = int(np.rint((curr.n_box[1, 1] - self.root.n_box[0, 1]) / unit))
        data = self.mapp[x_min:x_max, y_min:y_max]
        total = np.sum(data)
        avg = total / ((x_max - x_min) * (y_max - y_min))
        # L1 deviation of cell counts from the mean
        dist = np.sum(np.abs(data - avg))
        if dist > distance:
            return False
        else:
            return True

    def buildIndex(self):
        """Build the kd-tree over the noisy synthetic grid (breadth-first)."""
        stack = deque()
        stack.append(self.root)
        nleaf = 0  # leaf counter
        max_depth = -1
        self.root.n_count = np.sum(self.mapp)
        while len(stack) > 0:
            curr = stack.popleft()
            if curr.n_depth > max_depth:
                max_depth = curr.n_depth
            if self.testLeaf(curr) is True:  # curr is a leaf node
                nleaf += 1
                curr.n_isLeaf = True
                self.cell_setLeaf(curr)
            else:  # curr needs to split
                curr.n_budget -= 1
                tmp = self.getCoordinates(curr)
                nw_node, ne_node, sw_node, se_node = KNode(), KNode(), KNode(), KNode()  # create sub-nodes
                nw_coord, ne_coord, count_tmp = tmp
                x_nw, y_nw = nw_coord
                x_se, y_se = ne_coord
                # quadrant boxes; the split lines come from getCoordinates
                nw_node.n_box = np.array([[curr.n_box[0, 0], y_nw],
                                          [x_nw, curr.n_box[1, 1]]])
                ne_node.n_box = np.array([[x_nw, y_se],
                                          [curr.n_box[1, 0], curr.n_box[1, 1]]])
                sw_node.n_box = np.array([[curr.n_box[0, 0], curr.n_box[0, 1]],
                                          [x_se, y_nw]])
                se_node.n_box = np.array([[x_se, curr.n_box[0, 1]],
                                          [curr.n_box[1, 0], y_se]])
                c_t = 0
                for sub_node in [nw_node, ne_node, sw_node, se_node]:
                    sub_node.n_depth = curr.n_depth + 1
                    sub_node.n_count = count_tmp[c_t]
                    sub_node.n_budget = curr.n_budget
                    stack.append(sub_node)
                    c_t += 1
                curr.nw, curr.ne, curr.sw, curr.se = nw_node, ne_node, sw_node, se_node
        # end of while
        logging.debug("number of leaves: %d" % nleaf)
        logging.debug("max depth: %d" % max_depth)

    def getCoordinates(self, curr):
        """Find median-style split lines on the noisy grid.

        Returns two split points and the four quadrant counts (nw, ne, sw, se).
        Relies on Python's for-loop variables (i, j, k) remaining bound after
        the loop ends.
        """
        dim_1 = curr.n_depth % Params.NDIM  # primary split dimension
        UNIT = Params.unitGrid
        x_min = int(np.rint((curr.n_box[0, 0] - self.root.n_box[0, 0]) / UNIT))
        x_max = int(np.rint((curr.n_box[1, 0] - self.root.n_box[0, 0]) / UNIT))
        y_min = int(np.rint((curr.n_box[0, 1] - self.root.n_box[0, 1]) / UNIT))
        y_max = int(np.rint((curr.n_box[1, 1] - self.root.n_box[0, 1]) / UNIT))
        total = np.sum(self.mapp[x_min:x_max, y_min:y_max])
        if dim_1 == 0:
            # primary split on x: walk columns until half the mass is covered
            for i in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_max]) >= total / 2:
                    break
            split_prm = (x_min + i + 1) * UNIT + self.root.n_box[0, 0]
            half_1 = np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_max])
            half_2 = np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_max])
            # secondary split (on y) of the left half
            for j in range(y_max - y_min):
                if np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_min + j + 1]) >= half_1 / 2:
                    break
            split_sec1 = self.root.n_box[0, 1] + (y_min + j + 1) * UNIT
            n_sw = np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_min + j + 1])
            n_nw = np.sum(self.mapp[x_min:x_min + i + 1, y_min + j + 1:y_max])
            # secondary split (on y) of the right half
            for k in range(y_max - y_min):
                if np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_min + k + 1]) >= half_2 / 2:
                    break
            split_sec2 = self.root.n_box[0, 1] + (y_min + k + 1) * UNIT
            n_se = np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_min + k + 1])
            n_ne = np.sum(self.mapp[x_min + i + 1:x_max, y_min + k + 1:y_max])
            return (split_prm, split_sec1), (split_prm, split_sec2), (n_nw, n_ne, n_sw, n_se)
        else:
            # primary split on y: walk rows until half the mass is covered
            for i in range(y_max - y_min):
                if np.sum(self.mapp[x_min:x_max, y_min:y_min + i + 1]) >= total / 2:
                    break
            split_prm = self.root.n_box[0, 1] + (y_min + i + 1) * UNIT
            half_1 = np.sum(self.mapp[x_min:x_max, y_min:y_min + i + 1])
            half_2 = np.sum(self.mapp[x_min:x_max, y_min + i + 1:y_max])
            # secondary split (on x) of the bottom half
            for j in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + j + 1, y_min:y_min + i + 1]) >= half_1 / 2:
                    break
            split_sec1 = (x_min + j + 1) * UNIT + self.root.n_box[0, 0]
            n_sw = np.sum(self.mapp[x_min:x_min + j + 1, y_min:y_min + i + 1])
            n_se = np.sum(self.mapp[x_min + j + 1:x_max, y_min:y_min + i + 1])
            # secondary split (on x) of the top half
            for k in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + k + 1, y_min + i + 1:y_max]) >= half_2 / 2:
                    break
            split_sec2 = (x_min + k + 1) * UNIT + self.root.n_box[0, 0]
            n_nw = np.sum(self.mapp[x_min:x_min + k + 1, y_min + i + 1:y_max])
            n_ne = np.sum(self.mapp[x_min + k + 1:x_max, y_min + i + 1:y_max])
            return (split_sec2, split_prm), (split_sec1, split_prm), (n_nw, n_ne, n_sw, n_se)

    def populate_synthetic_tree(self):
        """ Populate real data to the synthetic tree """
        logging.debug('populating synthetic tree...')
        a_data = self.realData
        ndata = a_data.shape[1]
        for i in range(ndata):
            ptx = a_data[0, i]
            pty = a_data[1, i]
            leaf = self.root.find_subnode(ptx, pty)
            leaf.n_count += 1

        # traverse the tree and update leaf counts with the remaining budget half
        stack = deque()
        stack.append(self.root)
        while len(stack) > 0:
            cur_node = stack.popleft()
            if cur_node.n_isLeaf is True:  # leaf
                cur_node.n_count += self.differ.getNoise(1, 0.5 * self.param.Eps)
            else:
                stack.append(cur_node.nw)
                stack.append(cur_node.ne)
                stack.append(cur_node.sw)
                stack.append(cur_node.se)
class GenericT(object):
    """Generic hierarchical grid index over 2-D point data with differentially
    private (noisy) node counts.

    Subclasses must provide getCountBudget (per-level epsilon allocation) and
    getCoordinates (split granularity and data partitioning). The root covers
    the box [param.LOW, param.HIGH]; data is a 2 x N coordinate array.
    """

    def __init__(self, data, param):
        self.param = param
        self.differ = Differential(self.param.Seed)  # seeded noise generator
        # initialize the root
        self.root = NodeT()
        # self.children = [] # all level 2 grids
        self.root.n_data = data
        self.root.n_box = np.array([param.LOW, param.HIGH])

    def getEqualSplit(self, partitions, min, max):
        """return equal split points, including both ends

        NOTE(review): under Python 2, (max - min) * i / partitions floor-divides
        when both operands are ints — confirm callers always pass floats.
        """
        if min > max:
            logging.debug("getEqualSplit: Error: min > max")
        if partitions <= 1:
            return [min, max]
        return [min + (max - min) * i / partitions for i in range(partitions + 1)]

    def getCountBudget(self):
        """return noisy count budget for different levels of the indices"""
        raise NotImplementedError

    def getCoordinates(self, curr):
        """return the split dimension, the split points and the data points in each subnodes"""
        raise NotImplementedError

    def getCount(self, curr, epsilon):
        """Return the true count (epsilon ~ 0) or a noisy count of a node.

        Note that the noisy count can be negative.
        """
        if curr.n_data is None:
            count = 0
        else:
            count = curr.n_data.shape[1]
        if epsilon < 10 ** (-8):  # treat near-zero epsilon as "no noise"
            return count
        else:
            return count + self.differ.getNoise(1, epsilon)  # sensitivity 1

    def intersect(self, hrect, query):
        """Check whether hyper-rectangle `hrect` intersects the query rectangle
        in every dimension (both given as [[low...], [high...]] arrays).
        """
        # Disjoint iff the query lies entirely beyond hrect on some axis.
        bool_m1 = query[0, :] >= hrect[1, :]
        bool_m2 = query[1, :] <= hrect[0, :]
        bool_m = np.logical_or(bool_m1, bool_m2)
        if np.any(bool_m):
            return False
        else:
            return True

    def testLeaf(self, curr):
        """Test whether a node should be a leaf: max depth reached, no data,
        or (noisy) count below the minimum partition size.
        """
        if (curr.n_depth == Params.maxHeightAdaptiveGrid) or \
                (curr.n_data is None or curr.n_data.shape[1] == 0) or \
                (curr.n_count <= self.param.minPartSize):
            return True
        return False

    def buildIndex(self):
        """Build the grid structure breadth-first, assigning each node a noisy
        count from the per-level budget; leaves receive the remaining budget.
        """
        budget_c = self.getCountBudget()  # an array with two elements
        # print budget_c
        self.root.n_count = self.getCount(self.root, 0)  # add noisy count to the root
        queue = deque()
        queue.append(self.root)
        # ## main loop
        while len(queue) > 0:
            curr = queue.popleft()
            # a_count records the TRUE count per time instance (used by checkCorrectness)
            if curr.n_data is None:
                curr.a_count.append(0)
            else:
                curr.a_count.append(curr.n_data.shape[1])
            if self.testLeaf(curr) is True:  # if curr is a leaf node
                # Spend all budget remaining below this depth on the leaf count.
                remainingEps = sum(budget_c[curr.n_depth:])
                curr.n_count, curr.eps, curr.n_isLeaf = self.getCount(curr, remainingEps), remainingEps, True
                curr.l_count.append(curr.n_count)
            else:  # curr needs to split --> find splitting granularity
                gran, split_arr_x, split_arr_y, n_data_matrix = self.getCoordinates(curr)
                if gran == 1:
                    # Degenerate split: keep curr as a leaf with the remaining budget.
                    remainingEps = sum(budget_c[curr.n_depth:])
                    curr.n_count, curr.eps, curr.n_isLeaf = self.getCount(curr, remainingEps), remainingEps, True
                    curr.children = None
                    curr.l_count.append(curr.n_count)
                    continue  # if the first level cell is leaf node
                # add all nodes to queue
                for x in range(gran):
                    for y in range(gran):
                        node = NodeT()
                        node.n_box = np.array(
                            [[split_arr_x[x], split_arr_y[y]], [split_arr_x[x + 1], split_arr_y[y + 1]]])
                        node.index, node.parent, node.n_depth = x * gran + y, curr, curr.n_depth + 1
                        if n_data_matrix[x][y] is None:
                            node.n_data = None
                        else:
                            node.n_data = np.transpose(n_data_matrix[x][y])
                        node.n_count = self.getCount(node, budget_c[node.n_depth])
                        node.eps = budget_c[node.n_depth]
                        if node.n_depth == 2:  # depth 2 is always the last level
                            node.n_isLeaf = True
                        if curr.children is None:
                            curr.children = np.ndarray(shape=(gran, gran), dtype=NodeT)
                        curr.children[x][y] = node
                        queue.append(node)
            curr.n_data = None  # ## do not need the data points coordinates now
        # end of while

    # canonical range query does apply
    def rangeCount(self, query):
        """Query answering function: estimate the number of data points within a
        query rectangle, assuming every node holds a noisy count.

        Fully-contained children contribute their whole count; partially
        overlapping leaves contribute a uniformity-based fraction of theirs.
        """
        queue = deque()
        queue.append(self.root)
        count = 0.0
        while len(queue) > 0:
            curr = queue.popleft()
            _box = curr.n_box
            if curr.n_isLeaf is True:
                frac = 1
                if self.intersect(_box, query):
                    # Uniformity assumption: scale the leaf count by the overlap
                    # fraction along each dimension (or take it whole under WorstCase).
                    for i in range(_box.shape[1]):
                        if _box[1, i] == _box[0, i] or Params.WorstCase == True:
                            frac *= 1
                        else:
                            frac *= (min(query[1, i], _box[1, i]) - max(query[0, i], _box[0, i])) / (
                                _box[1, i] - _box[0, i])
                    count += curr.n_count * frac
            else:  # if not leaf
                # NOTE(review): _box here is the PARENT's box, used for all children —
                # still correct (children are re-checked when popped) but prunes less
                # than testing node.n_box; confirm before "fixing".
                for (_, _), node in np.ndenumerate(curr.children):
                    bool_matrix = np.zeros((2, query.shape[1]))
                    bool_matrix[0, :] = query[0, :] <= _box[0, :]
                    bool_matrix[1, :] = query[1, :] >= _box[1, :]
                    if np.all(bool_matrix):  # if query range contains node range
                        count += node.n_count
                    elif self.intersect(_box, query):
                        queue.append(node)
        return float(count)

    def leafCover(self, loc):
        """Find a leaf node that covers the location (loc = (x, y)).

        NOTE(review): x1/y1/x2/y2 are floats under true division and are used
        directly as array indices — this relied on old numpy/Python 2 behavior;
        modern numpy requires int indices. Confirm runtime before porting.
        """
        gran_1st = len(self.root.children)
        x1 = min(gran_1st - 1,
                 (loc[0] - self.root.n_box[0, 0]) * gran_1st / (self.root.n_box[1, 0] - self.root.n_box[0, 0]))
        y1 = min(gran_1st - 1,
                 (loc[1] - self.root.n_box[0, 1]) * gran_1st / (self.root.n_box[1, 1] - self.root.n_box[0, 1]))
        node_1st = self.root.children[x1][y1]
        """
        Note that there are cases when the actual count of a first-level cell is zero
        but the noisy count is > 0, so the cell may be split into a number of empty cells
        """
        if node_1st.n_isLeaf or node_1st.children is None:
            return node_1st
        else:
            gran_2st = len(node_1st.children)
            x2 = min(gran_2st - 1,
                     (loc[0] - node_1st.n_box[0, 0]) * gran_2st / (node_1st.n_box[1, 0] - node_1st.n_box[0, 0]))
            y2 = min(gran_2st - 1,
                     (loc[1] - node_1st.n_box[0, 1]) * gran_2st / (node_1st.n_box[1, 1] - node_1st.n_box[0, 1]))
            return node_1st.children[x2][y2]

    def checkCorrectness(self, node, nodePoints=None):
        """Sanity check: the total TRUE count over all leaves should equal the
        total number of data points (only the FIRST time instance is checked).

        Returns a subtree total when nodePoints is None (recursive case), else
        True/False comparing the total against nodePoints.
        """
        totalPoints = 0
        if node is None:
            return 0
        if (node.n_isLeaf and node.n_data is not None) or node.children is None:
            return node.a_count[0]
        for (_, _), child in np.ndenumerate(node.children):
            totalPoints += self.checkCorrectness(child)
        if nodePoints is None:
            return totalPoints
        if totalPoints == nodePoints:
            return True
        return False
# NOTE(review): the statements below continue a Monte-Carlo loop whose `for`
# header (and N, x, total initialization) lies before this chunk.
# Sample a uniform point in the unit disk: r ~ sqrt(U) gives uniform area density.
r, theta = [
    math.sqrt(random.uniform(0, 1)) * math.sqrt(1),
    2 * math.pi * random.uniform(0, 1)
]
y = [math.cos(theta) * r, math.sin(theta) * r]
d = dist(x, y)
total += d
print("Expected dist: ", total / N)  # empirical mean distance over N samples
""" Simulated dataset """
p = Params(1000)
p.select_dataset()
reachable_range = Utils.reachableDistance()
dp = Differential(p.seed)
# Randomly picking location in a small MBR of tdrive dataset.
minLat, maxLat = 39.1232147, 40.7225952
minLon, maxLon = 115.3879166, 117.3795395
diffLat = maxLat - minLat
diffLon = maxLon - minLon
# Shrink the MBR to 5% of its extent (keep the lower-left corner).
maxLat = maxLat - 0.95 * diffLat
maxLon = maxLon - 0.95 * diffLon
# print ("diagonal dist: ", Utils.distance(minLat, minLon, maxLat, maxLon))


# NOTE(review): the body of this function lies beyond this chunk.
def probs_from_sampling(samples, step, d_prime_values, d_matches_values):
class KalmanFilterPID(Parser):
    """Differentially private time-series publication via a Kalman filter with
    PID-controlled adaptive sampling: noisy counts are released only at sampled
    time instances; the rest are filled with Kalman predictions.
    (Originally auto-generated from Java source.)
    """

    # sampling rate
    def __init__(self, param):
        """Initialize filter state, PID gains, and the seeded noise generator."""
        Parser.__init__(self)
        self.param = param
        self.differ = Differential(self.param.Seed)
        self.predict = []       # per-instance Kalman predictions
        self.interval = None    # current sampling interval (updated by PID feedback)
        # Kalman Filter params
        self.P = 100  # estimation error covariance (over all time instance)
        self.Q = 1000  # process noise synthetic data
        self.R = 1000000  # measurement noise optimal for alpha = 1, synthetic data
        self.K = 0  # kalman gain
        # PID control params - default
        self.Cp = 0.9  # proportional gain, to keep output proportional to current error
        self.Ci = 0.1  # integral gain, to eliminate offset
        self.Cd = 0.0  # derivative gain, to ensure stability - prevent large error in future
        # fixed internally
        self.theta = 1  # magnitude of changes
        self.xi = 0.2  # gamma (10%)
        self.minIntvl = 1  # make sure the interval is greater than 1
        self.windowPID = 5  # I(integration) window
        self.ratioM = 0.2  # sampling rate
        # self.isSampling = False

    def adjustParams(self):
        # adjust theta (interval-change magnitude) from the sampling ratio:
        # sparser sampling -> larger interval adjustments.
        if self.ratioM < 0.1:
            self.theta = 20
        if 0.1 <= self.ratioM < 0.2:
            self.theta = 14
        if 0.2 <= self.ratioM < 0.3:
            self.theta = 2
        if 0.3 <= self.ratioM < 0.4:
            self.theta = 0.5
        if 0.4 <= self.ratioM < 0.5:
            self.theta = 0.3
        if 0.5 <= self.ratioM:
            self.theta = 0.1

    # test
    @classmethod
    def main(self, args):
        """CLI entry point: read a series, publish it privately, report stats.

        args: [_, input, output, privacy-budget, process-variance, Cp?, Ci?, Cd?]
        """
        if len(args) < 5:
            print "Usage: python KalmanFilterPID.py input output privacy-budget process-variance Cp(optional) Ci(optional) Cd(optional)"
            sys.exit()
        output = open(args[2], "w")
        # NOTE(review): eval() on a CLI argument is unsafe — float(args[3]) would
        # suffice; confirm no caller relies on expression input before changing.
        budget = eval(args[3])
        Q = float(args[4])
        if budget <= 0 or Q <= 0:
            print "Usage: privacy-budget AND process-variance are positive values"
            sys.exit()
        p = Params(1000)
        kfPID = KalmanFilterPID(p)
        kfPID.setTotalBudget(budget)
        kfPID.setQ(Q)
        kfPID.orig = Parser.getData(args[1])
        kfPID.publish = [None] * len(kfPID.orig)
        # adjust R based on T and alpha
        kfPID.setR(len(kfPID.orig) * len(kfPID.orig) / (0.0 + budget * budget))
        # set optional control gains
        # NOTE(review): args[5..7] are strings — `if d > 1` compares str to int,
        # which only "works" via Python 2 cross-type ordering (always True), so d
        # is clamped to 1 regardless of its value. Likely should be float(args[i]);
        # confirm intended behavior before fixing.
        if len(args) >= 6:
            d = args[5]
            if d > 1:
                d = 1
            kfPID.setCp(d)
        if len(args) >= 7:
            d = args[6]
            if d + kfPID.Cp > 1:
                d = 1 - kfPID.Cp
            kfPID.setCi(d)
        else:
            kfPID.setCi(1 - kfPID.Cp)
        if len(args) >= 8:
            d = args[7]
            if d + kfPID.Cp + kfPID.Ci > 1:
                d = 1 - kfPID.Cp - kfPID.Ci
            kfPID.setCd(d)
        else:
            kfPID.setCd(1 - kfPID.Cp - kfPID.Ci)
        # kfPID.adjustParams()
        start = time.time()
        kfPID.publishCounts()
        end = time.time()
        Parser.outputData(output, kfPID.publish)
        print "Method:\tKalman Filter with Adaptive Sampling"
        print "Data Series Length:\t" + str(len(kfPID.orig))
        print "Queries Issued:\t" + str(kfPID.query.count(1))
        print "Privacy Budget Used:\t" + str(kfPID.query.count(1) * kfPID.epsilon)
        print "Average Relative Error:\t" + str(kfPID.getRelError())
        print "Time Used (in second):\t" + str(end - start)

    def kalmanFilter(self, orig, budget, samplingRate=None):
        """Programmatic entry point: publish `orig` under `budget`.

        samplingRate enables adaptive sampling at that ratio; None samples every
        instance. Returns the published (noisy/predicted) series.
        """
        self.totalBudget = budget
        self.orig = orig
        if samplingRate is not None:
            self.isSampling = True
            self.ratioM = samplingRate
        else:
            self.isSampling = False
        # self.adjustParams()
        self.publish = [None] * len(self.orig)
        # adjust R based on T and alpha
        self.setR(len(self.orig) * len(self.orig) / (0.0 + budget * budget))
        self.publishCounts()
        return self.publish

    def getCount(self, value, epsilon):
        """Return the true value (epsilon ~ 0) or a noisy value.

        Note that the noisy count can be negative.
        """
        if epsilon < 10**(-8):
            return value
        else:
            return value + self.differ.getNoise(1, epsilon)  # sensitivity is 1

    # data publication procedure
    def publishCounts(self):
        """Publish the series: query (add noise) at sampled instances, predict
        via the Kalman filter elsewhere; the PID error adjusts the interval.
        """
        self.query = BitArray(len(self.orig))   # 1 marks instances where noise was spent
        self.predict = [None] * len(self.orig)
        # recalculate individual budget based on M
        if (self.isSampling):
            M = int(self.ratioM * (len(self.orig)))  # 0.25 optimal percentile
        else:
            M = len(self.orig)
        if M <= 0:
            M = 1
        self.epsilon = (self.totalBudget + 0.0) / M  # per-query budget
        # error = 0
        self.interval = 1
        nextQuery = max(1, self.windowPID) + self.interval - 1
        for i in range(len(self.orig)):
            if i == 0:
                # the first time instance is always queried
                self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                self.query[i] = 1
                self.correctKF(i, 0)
            else:
                predct = self.predictKF(i)
                self.predict[i] = predct
                if self.query.count(1) < self.windowPID and self.query.count(1) < M:
                    # warm-up: query every instance until the PID window is filled
                    self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                    self.query[i] = 1
                    # update count using observation
                    self.correctKF(i, predct)
                elif i == nextQuery and self.query.count(1) < M:
                    # if i is the sampling point
                    # query
                    self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                    self.query[i] = 1
                    # update count using observation
                    self.correctKF(i, predct)
                    # update freq: grow/shrink the interval from the PID feedback error
                    if (self.isSampling):
                        ratio = self.PID(i)
                        frac = min(20, (ratio - self.xi) / self.xi)
                        deltaI = self.theta * (1 - math.exp(frac))
                        # probabilistic rounding of the fractional part
                        deltaI = int(deltaI) + (random.random() < deltaI - int(deltaI))
                        self.interval += deltaI
                    else:
                        self.interval = 1
                    if self.interval < self.minIntvl:
                        self.interval = self.minIntvl
                    nextQuery += self.interval  # nextQuery is ns in the paper
                else:
                    # --> predict (no budget spent)
                    self.publish[i] = predct
        # del self.orig
        # del self.predict
        # del self.query
        # if self.isPostProcessing:
        #     self.postProcessing()

    # def postProcessing(self):
    #     print len(self.samples), self.samples
    #     remainedEps = self.totalBudget - len(self.samples) * self.epsilon
    #     self.epsilon = self.epsilon + remainedEps/len(self.samples)
    #
    #     # recompute noisy counts
    #     prev = 0
    #     for i in self.samples:
    #         self.publish[i] = self.getCount(self.orig[i], self.epsilon)
    #         if i > prev + 1:
    #             self.publish[prev + 1 : i] = [self.publish[prev]] * (i - prev - 1)
    #         prev = i

    def setR(self, r):
        """Set the measurement-noise variance R."""
        self.R = r

    def setQ(self, q):
        """Set the process-noise variance Q."""
        self.Q = q

    def setCp(self, cp):
        """Set the PID proportional gain."""
        self.Cp = cp

    def setCi(self, ci):
        """Set the PID integral gain."""
        self.Ci = ci

    def setCd(self, cd):
        """Set the PID derivative gain."""
        self.Cd = cd

    # prediction step
    def predictKF(self, curr):
        """Kalman prediction: carry forward the last queried value and grow the
        estimation-error covariance by the process noise.
        """
        # predict using Kalman Filter
        lastValue = self.getLastQuery(curr)
        # project estimation error
        self.P += self.Q  # Q is gaussian noise
        return lastValue

    # correction step
    def correctKF(self, curr, predict):
        """Kalman correction: blend the noisy observation with the prediction
        using gain K = P / (P + R), then shrink the error covariance.
        """
        self.K = (self.P + 0.0) / (self.P + self.R)
        correct = predict + self.K * (self.publish[curr] - predict)
        # publish[curr] = Math.max((int) correct, 0)
        if curr > 0:  # only correct from 2nd values
            self.publish[curr] = correct
        # print correct, "\t", self.publish[curr], self.K, self.P
        # update estimation error variance
        self.P *= (1 - self.K)

    def getLastQuery(self, curr):
        """Return the most recent published value at or before curr-1 whose
        instance was actually queried.

        NOTE(review): relies on the loop variable leaking after `for`; raises
        NameError when curr == 0 — callers only invoke this with curr >= 1.
        """
        for i in reversed(range(curr)):
            if self.query[i]:
                break
        return self.publish[i]

    # adaptive sampling - return feedback error
    def PID(self, curr):
        """Compute the PID feedback error over the last windowPID sampled
        instances (relative error |publish - predict| / max(publish, 1)).

        NOTE(review): `sum` and `next` shadow builtins. Also, j only ranges over
        reversed(range(windowPID - 1)) = windowPID-2 .. 0, so the
        `j == windowPID - 1` branch never runs and lastValue/timeDiff stay 0 —
        possibly should be range(self.windowPID); confirm against the paper.
        """
        sum = 0
        lastValue = 0
        change = 0
        timeDiff = 0
        next = curr
        for j in reversed(range(self.windowPID - 1)):
            # walk back to the previous queried instance
            index = next
            while index >= 0:
                if self.query[index]:
                    next = index - 1  # the last nextQuery
                    break
                index -= 1
            if j == self.windowPID - 1:
                lastValue = abs(self.publish[index] - self.predict[index]) / (
                    0.0 + max(self.publish[index], 1))
                change = abs(self.publish[index] - self.predict[index]) / (
                    0.0 + max(self.publish[index], 1))
                timeDiff = index
            if j == self.windowPID - 2:
                change -= abs(self.publish[index] - self.predict[index]) / (
                    0.0 + max(self.publish[index], 1))
                timeDiff -= index
            sum += (abs(self.publish[index] - self.predict[index]) /
                    (0.0 + max(self.publish[index], 1)))
        ratio = self.Cp * lastValue + self.Ci * sum + self.Cd * change / (
            0.0 + timeDiff)
        return ratio