class KTree(object):
    """Generic tree template"""

    def __init__(self, data, param):
        self.param = param
        self.differ = Differential(self.param.Seed)
        # ## initialize the root
        self.root = KNode()
        self.root.n_data = data
        self.root.n_box = np.array([Params.LOW, Params.HIGH])
        self.root.n_budget = Params.maxHeight

    def getSplitBudget(self):
        """return a list of h budget values for splits"""
        raise NotImplementedError

    def getCountBudget(self):
        """return a list of (h+1) budget values for noisy counts"""
        raise NotImplementedError

    def getNoisyMedian(self, array, left, right, epsilon):
        """return the split value of an array"""
        raise NotImplementedError

    def getCoordinates(self, curr):
        """
        return the coordinates of the lower-right corner of the NW sub-node
        and the upper-left corner of the SE sub-node, plus the data points in
        the four sub-nodes, i.e. return (x_nw, y_nw), (x_se, y_se),
        nw_data, ne_data, sw_data, se_data
        """
        raise NotImplementedError

    def getSplit(self, array, left, right, epsilon):
        """
        return the split point of an array; this may be data-independent, the
        true median or a noisy median, depending on the type of the tree
        """
        raise NotImplementedError

    def getCount(self, curr, epsilon):
        """return the true count or noisy count of a node, depending on epsilon"""
        if curr.n_data is None:
            count = 0
        else:
            count = curr.n_data.shape[1]
        if epsilon < 10 ** (-6):
            return count
        else:
            return count + self.differ.getNoise(1, epsilon)

    def testLeaf(self, curr):
        """test whether a node should be a leaf node"""
        if (curr.n_depth == Params.maxHeight) or \
                (curr.n_budget <= 0) or \
                (curr.n_data is None or curr.n_data.shape[1] == 0) or \
                (curr.n_count <= self.param.minPartSize):
            return True
        return False

    def cell_setLeaf(self, curr):
        """overridden in Kd_cell"""
        return

    def buildIndex(self):
        """build the tree structure; fanout = 4 by default for spatial (2D) data"""
        budget_c = self.getCountBudget()
        self.root.n_count = self.getCount(self.root, budget_c[0])  # ## add noisy count to the root
        stack = deque()
        stack.append(self.root)
        nleaf = 0  # ## leaf counter
        max_depth = -1
        # ## main loop
        while len(stack) > 0:
            curr = stack.popleft()
            if curr.n_depth > max_depth:
                max_depth = curr.n_depth
            if self.testLeaf(curr) is True:  # ## curr is a leaf node
                if curr.n_depth < Params.maxHeight:
                    # ## a node that stops earlier than maxHeight can use the remaining count budget
                    remainingEps = sum(budget_c[curr.n_depth + 1:])
                    curr.n_count = self.getCount(curr, remainingEps)
                nleaf += 1
                curr.n_isLeaf = True
                self.cell_setLeaf(curr)
            else:  # ## curr needs to split
                curr.n_budget -= 1  # ## some budget is used regardless of whether the split succeeds
                tmp = self.getCoordinates(curr)
                nw_node, ne_node, sw_node, se_node = KNode(), KNode(), KNode(), KNode()  # create sub-nodes
                nw_coord, ne_coord, nw_node.n_data, ne_node.n_data, sw_node.n_data, se_node.n_data = tmp
                x_nw, y_nw = nw_coord
                x_se, y_se = ne_coord
                # ## update bounding box, depth, count and budget for the four sub-nodes
                nw_node.n_box = np.array([[curr.n_box[0, 0], y_nw], [x_nw, curr.n_box[1, 1]]])
                ne_node.n_box = np.array([[x_nw, y_se], [curr.n_box[1, 0], curr.n_box[1, 1]]])
                sw_node.n_box = np.array([[curr.n_box[0, 0], curr.n_box[0, 1]], [x_se, y_nw]])
                se_node.n_box = np.array([[x_se, curr.n_box[0, 1]], [curr.n_box[1, 0], y_se]])
                for sub_node in [nw_node, ne_node, sw_node, se_node]:
                    sub_node.n_depth = curr.n_depth + 1
                    sub_node.n_count = self.getCount(sub_node, budget_c[sub_node.n_depth])
                    sub_node.n_budget = curr.n_budget
                    stack.append(sub_node)
                curr.n_data = None  # ## the data point coordinates are no longer needed
                curr.nw, curr.ne, curr.sw, curr.se = nw_node, ne_node, sw_node, se_node
        # end of while
        logging.debug("number of leaves: %d" % nleaf)
        logging.debug("max depth: %d" % max_depth)

    def rect_intersect(self, hrect, query):
        """
        check whether the hyper-rectangle intersects the hyper-rectangle
        defined by the query in every dimension
        """
        bool_m1 = query[0, :] >= hrect[1, :]
        bool_m2 = query[1, :] <= hrect[0, :]
        bool_m = np.logical_or(bool_m1, bool_m2)
        if np.any(bool_m):
            return False
        else:
            return True

    def rangeCount(self, query):
        """Query answering function. Find the number of data points within a query rectangle."""
        stack = deque()
        stack.append(self.root)
        count = 0.0
        # ## Counters for the numbers of 1) whole leaves, 2) partial leaves and
        # ## 3) whole internal nodes contributing to the answer. For debugging only.
        l_whole, l_part, i_whole = 0, 0, 0
        while len(stack) > 0:
            curr = stack.popleft()
            _box = curr.n_box
            if curr.n_isLeaf is True:
                frac = 1
                if self.rect_intersect(_box, query):
                    for i in range(_box.shape[1]):
                        if _box[1, i] == _box[0, i] or Params.WorstCase:
                            frac *= 1
                        else:
                            frac *= (min(query[1, i], _box[1, i]) - max(query[0, i], _box[0, i])) / \
                                    (_box[1, i] - _box[0, i])
                    count += curr.n_count * frac
                    if 1.0 - frac < 10 ** (-6):
                        l_whole += 1
                    else:
                        l_part += 1
            else:  # ## not a leaf
                bool_matrix = np.zeros((2, query.shape[1]))
                bool_matrix[0, :] = query[0, :] <= _box[0, :]
                bool_matrix[1, :] = query[1, :] >= _box[1, :]
                if np.all(bool_matrix) and self.param.useLeafOnly is False:
                    # ## the query range contains the node range
                    count += curr.n_count
                    i_whole += 1
                else:
                    if self.rect_intersect(curr.nw.n_box, query):
                        stack.append(curr.nw)
                    if self.rect_intersect(curr.ne.n_box, query):
                        stack.append(curr.ne)
                    if self.rect_intersect(curr.sw.n_box, query):
                        stack.append(curr.sw)
                    if self.rect_intersect(curr.se.n_box, query):
                        stack.append(curr.se)
        return float(count)  # , i_whole, l_whole, l_part

    def adjustConsistency(self):
        """
        Post-processing for uniform noise across levels. Due to Michael Hay,
        Vibhor Rastogi, Gerome Miklau, Dan Suciu, "Boosting the Accuracy of
        Differentially-Private Histograms Through Consistency", VLDB 2010.
        """
        logging.debug('adjusting consistency...')
        # ## upward pass
        self.root.get_z()
        # ## downward pass
        queue = deque()
        queue.append(self.root)
        while len(queue) > 0:
            curr = queue.popleft()
            if curr.n_isLeaf is False:
                adjust = (curr.n_count - curr.nw.n_count - curr.ne.n_count -
                          curr.sw.n_count - curr.se.n_count) / 4.0
                for subnode in [curr.nw, curr.ne, curr.sw, curr.se]:
                    subnode.n_count += adjust
                    queue.append(subnode)

    def postProcessing(self):
        """
        Post-processing for a general noise distribution across levels. Due to
        G. Cormode, M. Procopiuc, E. Shen, D. Srivastava and T. Yu,
        "Differentially Private Spatial Decompositions", ICDE 2012.
        """
        logging.debug("post-processing...")
        budget = self.getCountBudget()  # ## count budget for the h+1 levels
        H = Params.maxHeight
        # ## Phase 1 (top-down)
        queue = deque()
        self.root.n_count *= budget[self.root.n_depth] ** 2
        queue.append(self.root)
        while len(queue) > 0:
            curr = queue.popleft()
            if curr.n_isLeaf is False:
                for subnode in [curr.nw, curr.ne, curr.sw, curr.se]:
                    subnode.n_count = curr.n_count + subnode.n_count * (budget[subnode.n_depth] ** 2)
                    queue.append(subnode)
        # ## Phase 2 (bottom-up)
        self.root.update_count()
        # ## Phase 3 (top-down)
        queue = deque()
        E_root = 0
        for i in range(H + 1):
            E_root += 4 ** i * budget[H - i] * budget[H - i]
        self.root.n_count /= E_root
        self.root.n_F = 0
        queue.append(self.root)
        while len(queue) > 0:
            curr = queue.popleft()
            if curr.n_isLeaf is False:
                h = H - curr.n_depth - 1  # ## height of curr's children
                E_h = 0
                for i in range(h + 1):
                    E_h += 4 ** i * budget[H - i] * budget[H - i]
                for subnode in [curr.nw, curr.ne, curr.sw, curr.se]:
                    subnode.n_F = curr.n_F + curr.n_count * (budget[curr.n_depth] ** 2)
                    subnode.n_count = (subnode.n_count - 4 ** h * subnode.n_F) / E_h
                    queue.append(subnode)

    def pruning(self):
        """
        If the tree is grown without the minPartSize stopping condition,
        prune it here after post-processing.
        """
        logging.debug("pruning...")
        queue = deque()
        queue.append(self.root)
        while len(queue) > 0:
            curr = queue.popleft()
            if curr.n_isLeaf is False:
                if curr.n_count <= self.param.minPartSize:
                    curr.n_isLeaf = True
                else:
                    queue.append(curr.nw)
                    queue.append(curr.ne)
                    queue.append(curr.sw)
                    queue.append(curr.se)
class Kd_cell(Kd_pure):
    """
    Kd-tree based on synthetic data generation and a grid structure.
    See Y. Xiao, L. Xiong, and C. Yuan, "Differentially private data release
    through multidimensional partitioning", in SDM Workshop, VLDB, 2010.
    """

    def __init__(self, data, param):
        self.param = param
        self.differ = Differential(self.param.Seed)
        self.mapp = None
        self.root = KNode()
        self.realData = data
        self.root.n_box = None
        self.root.n_budget = Params.maxHeight

    def getCountBudget(self):
        count_eps = self.param.Eps * 0.5
        H = Params.maxHeight
        if self.param.geoBudget == 'none':
            return [count_eps / (H + 1) for _ in range(H + 1)]
        elif self.param.geoBudget == 'aggressive':
            unit = count_eps / (2 ** (H + 1) - 1)
            return [unit * 2 ** i for i in range(H + 1)]
        elif self.param.geoBudget == 'quadratic':
            unit = count_eps * (np.sqrt(2) - 1) / (2 ** (0.5 * (H + 1)) - 1)
            return [unit * 2 ** (0.5 * i) for i in range(H + 1)]
        elif self.param.geoBudget == 'optimal':
            unit = count_eps * ((2 ** (1.0 / 3)) - 1) / (2 ** ((1.0 / 3) * (H + 1)) - 1)
            return [unit * 2 ** ((1.0 / 3) * i) for i in range(H + 1)]
        elif self.param.geoBudget == 'quartic':
            unit = count_eps * ((2 ** (1.0 / 4)) - 1) / (2 ** ((1.0 / 4) * (H + 1)) - 1)
            return [unit * 2 ** ((1.0 / 4) * i) for i in range(H + 1)]
        else:
            logging.error('No such geoBudget scheme')
            sys.exit(1)

    def synthetic_gen(self):
        """
        Apply a grid structure to the domain and perturb the counts using
        half of the available privacy budget.
        """
        logging.debug('generating synthetic map...')
        data = self.realData
        unit = Params.unitGrid
        x_min = np.floor(Params.LOW[0] / unit) * unit
        x_max = np.ceil(Params.HIGH[0] / unit) * unit
        y_min = np.floor(Params.LOW[1] / unit) * unit
        y_max = np.ceil(Params.HIGH[1] / unit) * unit
        x_CELL = int(np.rint((x_max - x_min) / unit))
        y_CELL = int(np.rint((y_max - y_min) / unit))
        self.root.n_box = np.array([[x_min, y_min], [x_max, y_max]])
        self.mapp = np.zeros((x_CELL, y_CELL)) - 1  # ## initialize every cell with -1
        for i in range(Params.NDATA):  # ## populate the map
            point = data[:, i]
            cell_x = int(np.floor((point[0] - x_min) / unit))
            cell_y = int(np.floor((point[1] - y_min) / unit))
            if self.mapp[cell_x, cell_y] != -1:
                self.mapp[cell_x, cell_y] += 1
            else:
                self.mapp[cell_x, cell_y] = 1
        for i in range(x_CELL):  # ## perturb the counts
            for j in range(y_CELL):
                if self.mapp[i, j] != -1:
                    self.mapp[i, j] += np.rint(self.differ.getNoise(1, 0.5 * self.param.Eps))
                else:
                    self.mapp[i, j] = np.rint(self.differ.getNoise(1, 0.5 * self.param.Eps))
                # if the noisy count is negative, ignore the noise and generate no points
                if self.mapp[i, j] < 0:
                    self.mapp[i, j] = 0

    def cell_setLeaf(self, curr):
        """Throw away the counts based on the synthetic data"""
        curr.n_count = 0
        return

    def testLeaf(self, curr):
        if (curr.n_count <= self.param.minPartSize) or \
                (curr.n_depth == Params.maxHeight) or \
                (self.uniform_test(curr, self.param.cellDistance)):
            return True
        return False

    def uniform_test(self, curr, distance):
        """
        One of the stopping conditions: the cell is uniform according to
        some threshold 'distance'.
        """
        unit = Params.unitGrid
        x_min = int(np.rint((curr.n_box[0, 0] - self.root.n_box[0, 0]) / unit))
        x_max = int(np.rint((curr.n_box[1, 0] - self.root.n_box[0, 0]) / unit))
        y_min = int(np.rint((curr.n_box[0, 1] - self.root.n_box[0, 1]) / unit))
        y_max = int(np.rint((curr.n_box[1, 1] - self.root.n_box[0, 1]) / unit))
        data = self.mapp[x_min:x_max, y_min:y_max]
        total = np.sum(data)
        avg = total / ((x_max - x_min) * (y_max - y_min))
        dist = np.sum(np.abs(data - avg))
        if dist > distance:
            return False
        else:
            return True

    def buildIndex(self):
        stack = deque()
        stack.append(self.root)
        nleaf = 0  # leaf counter
        max_depth = -1
        self.root.n_count = np.sum(self.mapp)
        while len(stack) > 0:
            curr = stack.popleft()
            if curr.n_depth > max_depth:
                max_depth = curr.n_depth
            if self.testLeaf(curr) is True:  # curr is a leaf node
                nleaf += 1
                curr.n_isLeaf = True
                self.cell_setLeaf(curr)
            else:  # curr needs to split
                curr.n_budget -= 1
                tmp = self.getCoordinates(curr)
                nw_node, ne_node, sw_node, se_node = KNode(), KNode(), KNode(), KNode()  # create sub-nodes
                nw_coord, ne_coord, count_tmp = tmp
                x_nw, y_nw = nw_coord
                x_se, y_se = ne_coord
                nw_node.n_box = np.array([[curr.n_box[0, 0], y_nw], [x_nw, curr.n_box[1, 1]]])
                ne_node.n_box = np.array([[x_nw, y_se], [curr.n_box[1, 0], curr.n_box[1, 1]]])
                sw_node.n_box = np.array([[curr.n_box[0, 0], curr.n_box[0, 1]], [x_se, y_nw]])
                se_node.n_box = np.array([[x_se, curr.n_box[0, 1]], [curr.n_box[1, 0], y_se]])
                c_t = 0
                for sub_node in [nw_node, ne_node, sw_node, se_node]:
                    sub_node.n_depth = curr.n_depth + 1
                    sub_node.n_count = count_tmp[c_t]
                    sub_node.n_budget = curr.n_budget
                    stack.append(sub_node)
                    c_t += 1
                curr.nw, curr.ne, curr.sw, curr.se = nw_node, ne_node, sw_node, se_node
        # end of while
        logging.debug("number of leaves: %d" % nleaf)
        logging.debug("max depth: %d" % max_depth)

    def getCoordinates(self, curr):
        dim_1 = curr.n_depth % Params.NDIM  # primary split dimension
        UNIT = Params.unitGrid
        x_min = int(np.rint((curr.n_box[0, 0] - self.root.n_box[0, 0]) / UNIT))
        x_max = int(np.rint((curr.n_box[1, 0] - self.root.n_box[0, 0]) / UNIT))
        y_min = int(np.rint((curr.n_box[0, 1] - self.root.n_box[0, 1]) / UNIT))
        y_max = int(np.rint((curr.n_box[1, 1] - self.root.n_box[0, 1]) / UNIT))
        total = np.sum(self.mapp[x_min:x_max, y_min:y_max])

        if dim_1 == 0:
            # split along x first, then split each half along y
            for i in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_max]) >= total / 2:
                    break
            split_prm = (x_min + i + 1) * UNIT + self.root.n_box[0, 0]
            half_1 = np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_max])
            half_2 = np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_max])
            for j in range(y_max - y_min):
                if np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_min + j + 1]) >= half_1 / 2:
                    break
            split_sec1 = self.root.n_box[0, 1] + (y_min + j + 1) * UNIT
            n_sw = np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_min + j + 1])
            n_nw = np.sum(self.mapp[x_min:x_min + i + 1, y_min + j + 1:y_max])
            for k in range(y_max - y_min):
                if np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_min + k + 1]) >= half_2 / 2:
                    break
            split_sec2 = self.root.n_box[0, 1] + (y_min + k + 1) * UNIT
            n_se = np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_min + k + 1])
            n_ne = np.sum(self.mapp[x_min + i + 1:x_max, y_min + k + 1:y_max])
            return (split_prm, split_sec1), (split_prm, split_sec2), (n_nw, n_ne, n_sw, n_se)
        else:
            # split along y first, then split each half along x
            for i in range(y_max - y_min):
                if np.sum(self.mapp[x_min:x_max, y_min:y_min + i + 1]) >= total / 2:
                    break
            split_prm = self.root.n_box[0, 1] + (y_min + i + 1) * UNIT
            half_1 = np.sum(self.mapp[x_min:x_max, y_min:y_min + i + 1])
            half_2 = np.sum(self.mapp[x_min:x_max, y_min + i + 1:y_max])
            for j in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + j + 1, y_min:y_min + i + 1]) >= half_1 / 2:
                    break
            split_sec1 = (x_min + j + 1) * UNIT + self.root.n_box[0, 0]
            n_sw = np.sum(self.mapp[x_min:x_min + j + 1, y_min:y_min + i + 1])
            n_se = np.sum(self.mapp[x_min + j + 1:x_max, y_min:y_min + i + 1])
            for k in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + k + 1, y_min + i + 1:y_max]) >= half_2 / 2:
                    break
            split_sec2 = (x_min + k + 1) * UNIT + self.root.n_box[0, 0]
            n_nw = np.sum(self.mapp[x_min:x_min + k + 1, y_min + i + 1:y_max])
            n_ne = np.sum(self.mapp[x_min + k + 1:x_max, y_min + i + 1:y_max])
            return (split_sec2, split_prm), (split_sec1, split_prm), (n_nw, n_ne, n_sw, n_se)

    def populate_synthetic_tree(self):
        """Populate real data into the synthetic tree"""
        logging.debug('populating synthetic tree...')
        a_data = self.realData
        ndata = a_data.shape[1]
        for i in range(ndata):
            ptx = a_data[0, i]
            pty = a_data[1, i]
            leaf = self.root.find_subnode(ptx, pty)
            leaf.n_count += 1
        # traverse the tree and perturb the leaf counts
        stack = deque()
        stack.append(self.root)
        while len(stack) > 0:
            cur_node = stack.popleft()
            if cur_node.n_isLeaf is True:  # leaf
                cur_node.n_count += self.differ.getNoise(1, 0.5 * self.param.Eps)
            else:
                stack.append(cur_node.nw)
                stack.append(cur_node.ne)
                stack.append(cur_node.sw)
                stack.append(cur_node.se)
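# Each geometric scheme in Kd_cell.getCountBudget spreads count_eps (half of
# the total budget) over the H+1 levels so that the per-level values sum back
# to count_eps. A standalone sanity check of that geometric-series identity
# (illustrative only; `ratio` is 2**(1/3) for the 'optimal' scheme):

def _geo_budget_sums_to_eps(count_eps=0.5, H=8, ratio=2 ** (1.0 / 3)):
    """The sum of unit * ratio**i for i = 0..H equals count_eps."""
    unit = count_eps * (ratio - 1) / (ratio ** (H + 1) - 1)
    return abs(sum(unit * ratio ** i for i in range(H + 1)) - count_eps) < 1e-9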
class KalmanFilterPID(Parser):
    """Kalman filter with PID-controlled adaptive sampling"""

    def __init__(self, param):
        Parser.__init__(self)
        self.param = param
        self.differ = Differential(self.param.Seed)
        self.predict = []
        self.interval = None

        # Kalman filter params
        self.P = 100        # estimation error covariance (over all time instances)
        self.Q = 1000       # process noise, synthetic data
        self.R = 1000000    # measurement noise, optimal for alpha = 1, synthetic data
        self.K = 0          # Kalman gain

        # PID control params - default
        self.Cp = 0.9  # proportional gain, to keep output proportional to current error
        self.Ci = 0.1  # integral gain, to eliminate offset
        self.Cd = 0.0  # derivative gain, to ensure stability - prevent large error in future

        # fixed internally
        self.theta = 1      # magnitude of changes
        self.xi = 0.2       # gamma
        self.minIntvl = 1   # make sure the interval is greater than 1

        self.windowPID = 5  # I (integration) window
        self.ratioM = 0.2   # sampling rate
        self.isSampling = False

    def adjustParams(self):
        # adjust params based on the sampling rate
        if self.ratioM < 0.1:
            self.theta = 20
        if 0.1 <= self.ratioM < 0.2:
            self.theta = 14
        if 0.2 <= self.ratioM < 0.3:
            self.theta = 2
        if 0.3 <= self.ratioM < 0.4:
            self.theta = 0.5
        if 0.4 <= self.ratioM < 0.5:
            self.theta = 0.3
        if 0.5 <= self.ratioM:
            self.theta = 0.1

    # test
    @classmethod
    def main(cls, args):
        if len(args) < 5:
            print "Usage: python KalmanFilterPID.py input output privacy-budget process-variance Cp(optional) Ci(optional) Cd(optional)"
            sys.exit()
        output = open(args[2], "w")
        budget = float(args[3])
        Q = float(args[4])
        if budget <= 0 or Q <= 0:
            print "Usage: privacy-budget AND process-variance are positive values"
            sys.exit()

        p = Params(1000)
        kfPID = KalmanFilterPID(p)
        kfPID.setTotalBudget(budget)
        kfPID.setQ(Q)
        kfPID.orig = Parser.getData(args[1])
        kfPID.publish = [None] * len(kfPID.orig)
        # adjust R based on T and alpha
        kfPID.setR(len(kfPID.orig) * len(kfPID.orig) / (0.0 + budget * budget))

        # set optional control gains
        if len(args) >= 6:
            d = float(args[5])
            if d > 1:
                d = 1
            kfPID.setCp(d)
        if len(args) >= 7:
            d = float(args[6])
            if d + kfPID.Cp > 1:
                d = 1 - kfPID.Cp
            kfPID.setCi(d)
        else:
            kfPID.setCi(1 - kfPID.Cp)
        if len(args) >= 8:
            d = float(args[7])
            if d + kfPID.Cp + kfPID.Ci > 1:
                d = 1 - kfPID.Cp - kfPID.Ci
            kfPID.setCd(d)
        else:
            kfPID.setCd(1 - kfPID.Cp - kfPID.Ci)

        start = time.time()
        kfPID.publishCounts()
        end = time.time()

        Parser.outputData(output, kfPID.publish)

        print "Method:\tKalman Filter with Adaptive Sampling"
        print "Data Series Length:\t" + str(len(kfPID.orig))
        print "Queries Issued:\t" + str(kfPID.query.count(1))
        print "Privacy Budget Used:\t" + str(kfPID.query.count(1) * kfPID.epsilon)
        print "Average Relative Error:\t" + str(kfPID.getRelError())
        print "Time Used (in second):\t" + str(end - start)

    def kalmanFilter(self, orig, budget, samplingRate=None):
        self.totalBudget = budget
        self.orig = orig
        if samplingRate is not None:
            self.isSampling = True
            self.ratioM = samplingRate
        else:
            self.isSampling = False
        # self.adjustParams()
        self.publish = [None] * len(self.orig)
        # adjust R based on T and alpha
        self.setR(len(self.orig) * len(self.orig) / (0.0 + budget * budget))
        self.publishCounts()
        return self.publish

    def getCount(self, value, epsilon):
        """
        return the true count or noisy count of a value, depending on epsilon.
        Note that the noisy count can be negative
        """
        if epsilon < 10 ** (-8):
            return value
        else:
            return value + self.differ.getNoise(1, epsilon)  # sensitivity is 1

    # data publication procedure
    def publishCounts(self):
        self.query = BitArray(len(self.orig))
        self.predict = [None] * len(self.orig)

        # recalculate the individual budget based on M
        if self.isSampling:
            M = int(self.ratioM * len(self.orig))  # 0.25 optimal percentile
        else:
            M = len(self.orig)
        if M <= 0:
            M = 1
        self.epsilon = (self.totalBudget + 0.0) / M

        self.interval = 1
        nextQuery = max(1, self.windowPID) + self.interval - 1

        for i in range(len(self.orig)):
            if i == 0:  # the first time instance
                self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                self.query[i] = 1
                self.correctKF(i, 0)
            else:
                predct = self.predictKF(i)
                self.predict[i] = predct
                if self.query.count(1) < self.windowPID and self.query.count(1) < M:
                    # still filling the PID window --> always query
                    self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                    self.query[i] = 1
                    # update the count using the observation
                    self.correctKF(i, predct)
                elif i == nextQuery and self.query.count(1) < M:
                    # i is the sampling point --> query
                    self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                    self.query[i] = 1
                    # update the count using the observation
                    self.correctKF(i, predct)
                    # update the sampling frequency
                    if self.isSampling:
                        ratio = self.PID(i)
                        frac = min(20, (ratio - self.xi) / self.xi)
                        deltaI = self.theta * (1 - math.exp(frac))
                        deltaI = int(deltaI) + (random.random() < deltaI - int(deltaI))
                        self.interval += deltaI
                    else:
                        self.interval = 1
                    if self.interval < self.minIntvl:
                        self.interval = self.minIntvl
                    nextQuery += self.interval  # nextQuery is ns in the paper
                else:  # --> predict
                    self.publish[i] = predct

        # if self.isPostProcessing:
        #     self.postProcessing()

    # def postProcessing(self):
    #     remainedEps = self.totalBudget - len(self.samples) * self.epsilon
    #     self.epsilon = self.epsilon + remainedEps / len(self.samples)
    #     # recompute noisy counts
    #     prev = 0
    #     for i in self.samples:
    #         self.publish[i] = self.getCount(self.orig[i], self.epsilon)
    #         if i > prev + 1:
    #             self.publish[prev + 1:i] = [self.publish[prev]] * (i - prev - 1)
    #         prev = i

    def setR(self, r):
        self.R = r

    def setQ(self, q):
        self.Q = q

    def setCp(self, cp):
        self.Cp = cp

    def setCi(self, ci):
        self.Ci = ci

    def setCd(self, cd):
        self.Cd = cd

    # prediction step
    def predictKF(self, curr):
        # predict using the Kalman filter
        lastValue = self.getLastQuery(curr)
        # project the estimation error
        self.P += self.Q  # Q is the process (Gaussian) noise
        return lastValue

    # correction step
    def correctKF(self, curr, predict):
        self.K = (self.P + 0.0) / (self.P + self.R)
        correct = predict + self.K * (self.publish[curr] - predict)
        if curr > 0:  # only correct from the 2nd value on
            self.publish[curr] = correct
        # update the estimation error variance
        self.P *= (1 - self.K)

    def getLastQuery(self, curr):
        for i in reversed(range(curr)):
            if self.query[i]:
                break
        return self.publish[i]

    # adaptive sampling - return the feedback error
    def PID(self, curr):
        errSum = 0      # integral-term accumulator
        lastValue = 0
        change = 0
        timeDiff = 0
        nxt = curr
        for j in reversed(range(self.windowPID)):  # walk back over the windowPID most recent query points
            index = nxt
            while index >= 0:
                if self.query[index]:
                    nxt = index - 1  # continue the scan before the last query point
                    break
                index -= 1
            relErr = abs(self.publish[index] - self.predict[index]) / (0.0 + max(self.publish[index], 1))
            if j == self.windowPID - 1:  # most recent query point
                lastValue = relErr
                change = relErr
                timeDiff = index
            if j == self.windowPID - 2:  # second most recent query point
                change -= relErr
                timeDiff -= index
            errSum += relErr
        ratio = self.Cp * lastValue + self.Ci * errSum + self.Cd * change / (0.0 + timeDiff)
        return ratio
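# Illustrative usage sketch (not part of the original code): kalmanFilter is
# the programmatic entry point; it splits `budget` evenly across the sampled
# time instances and returns the released series. The series values and
# Params(1000) (mirroring main()) are placeholders.
#
#   p = Params(1000)
#   kf = KalmanFilterPID(p)
#   kf.setQ(500)                                  # process-noise variance
#   series = [120, 118, 130, 135, 140, 150, 149]
#   released = kf.kalmanFilter(series, 1.0, samplingRate=0.5)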
class Hilbert(Kd_standard):
    """Hilbert R-tree"""

    def __init__(self, data, param):
        self.param = param
        self.differ = Differential(self.param.Seed)
        self.root = KNode()
        self.realData = data
        self.root.n_budget = Params.maxHeight

    def h_encode(self, x, y, r):
        """(x, y) -> value h on the Hilbert curve; r is the resolution of the curve"""
        mask = (1 << r) - 1
        heven = x ^ y
        notx = ~x & mask
        noty = ~y & mask
        temp = notx ^ y
        v0, v1 = 0, 0
        for k in range(r - 1):
            v1 = ((v1 & heven) | ((v0 ^ noty) & temp)) >> 1
            v0 = ((v0 & (v1 ^ notx)) | (~v0 & (v1 ^ noty))) >> 1
        hodd = (~v0 & (v1 ^ x)) | (v0 & (v1 ^ noty))
        return self.interleaveBits(hodd, heven)

    def h_decode(self, h, r):
        """h -> (x, y)"""
        heven, hodd = self.deleaveBits(h)
        mask = (1 << r) - 1
        v0, v1 = 0, 0
        temp1 = ~(heven | hodd) & mask
        temp0 = ~(heven ^ hodd) & mask
        for k in range(r - 1):
            v1 = (v1 ^ temp1) >> 1
            v0 = (v0 ^ temp0) >> 1
        return (v0 & ~heven) ^ v1 ^ hodd, (v0 | heven) ^ v1 ^ hodd

    def interleaveBits(self, hodd, heven):
        val = 0
        maxx = max(hodd, heven)
        n = 0
        while maxx > 0:
            n += 1
            maxx >>= 1
        for i in range(n):
            bitMask = 1 << i
            a = 1 << (2 * i) if (heven & bitMask) else 0
            b = 1 << (2 * i + 1) if (hodd & bitMask) else 0
            val += a + b
        return val

    def deleaveBitsOdd(self, x):
        x &= 0x5555555555555555
        x = (x | (x >> 1)) & 0x3333333333333333
        x = (x | (x >> 2)) & 0x0F0F0F0F0F0F0F0F
        x = (x | (x >> 4)) & 0x00FF00FF00FF00FF
        x = (x | (x >> 8)) & 0x0000FFFF0000FFFF
        x = (x | (x >> 16)) & 0x00000000FFFFFFFF
        return x

    def deleaveBits(self, x):
        return self.deleaveBitsOdd(x), self.deleaveBitsOdd(x >> 1)

    def get_Hcoord(self, x, y, R):
        hx = int((x - Params.LOW[0]) / (Params.HIGH[0] - Params.LOW[0] + 10 ** (-8)) * (2 ** R))
        hy = int((y - Params.LOW[1]) / (Params.HIGH[1] - Params.LOW[1] + 10 ** (-8)) * (2 ** R))
        return hx, hy

    def get_Rcoord(self, hx, hy, R):
        x = float(hx) / (2 ** R) * (Params.HIGH[0] - Params.LOW[0]) + Params.LOW[0]
        y = float(hy) / (2 ** R) * (Params.HIGH[1] - Params.LOW[1]) + Params.LOW[1]
        return x, y

    def getCount(self, curr, epsilon):
        count = len(curr.n_data)
        if epsilon < 10 ** (-6):
            return count
        else:
            return count + self.differ.getNoise(1, epsilon)

    def testLeaf(self, curr):
        """test whether a node should be a leaf node"""
        if (curr.n_depth == Params.maxHeight) or \
                (curr.n_budget <= 0) or \
                (curr.n_count <= self.param.minPartSize):
            return True
        return False

    def buildIndex(self):
        budget_c = self.getCountBudget()
        logging.debug('encoding coordinates...')
        RES = self.param.Res  # order of the Hilbert curve
        ndata = self.realData.shape[1]
        hidx = np.zeros(ndata)
        for i in range(ndata):
            hx, hy = self.get_Hcoord(self.realData[0, i], self.realData[1, i], RES)
            hidx[i] = self.h_encode(hx, hy, RES)
        hidx = np.sort(hidx)

        logging.debug('building index...')
        self.root.n_data = hidx
        self.root.n_box = (0, 2 ** (2 * RES) - 1)
        self.root.n_count = self.getCount(self.root, budget_c[0])
        stack = deque()
        stack.append(self.root)
        tree = [self.root]
        leaf_li = []  # storage of all leaves
        nleaf = 0  # leaf counter
        max_depth = -1

        while len(stack) > 0:
            curr = stack.popleft()
            if curr.n_depth > max_depth:
                max_depth = curr.n_depth
            if self.testLeaf(curr) is True:  # curr is a leaf node
                if curr.n_depth < Params.maxHeight:
                    remainingEps = sum(budget_c[curr.n_depth + 1:])
                    curr.n_count = self.getCount(curr, remainingEps)
                nleaf += 1
                curr.n_isLeaf = True
                leaf_li.append(curr)
            else:  # curr needs to split
                curr.n_budget -= 1
                tmp = self.getCoordinates(curr)
                if tmp is False:  # the split failed
                    stack.append(curr)
                    continue
                nw_node, ne_node, sw_node, se_node = KNode(), KNode(), KNode(), KNode()  # create sub-nodes
                split_prm, split_sec1, split_sec2, nw_node.n_data, ne_node.n_data, sw_node.n_data, se_node.n_data = tmp
                nw_node.n_box = (curr.n_box[0], split_sec1)
                ne_node.n_box = (split_sec1, split_prm)
                sw_node.n_box = (split_prm, split_sec2)
                se_node.n_box = (split_sec2, curr.n_box[1])
                for sub_node in [nw_node, ne_node, sw_node, se_node]:
                    sub_node.n_depth = curr.n_depth + 1
                    sub_node.n_count = self.getCount(sub_node, budget_c[sub_node.n_depth])
                    sub_node.n_budget = curr.n_budget
                    stack.append(sub_node)
                    tree.append(sub_node)
                curr.n_data = None
                curr.nw, curr.ne, curr.sw, curr.se = nw_node, ne_node, sw_node, se_node
        # end of while
        logging.debug("number of leaves: %d" % nleaf)
        logging.debug("max depth: %d" % max_depth)

        # ## convert the Hilbert values in leaf nodes back to real coordinates and update the bounding boxes
        logging.debug('decoding and updating bounding box...')
        for leaf in leaf_li:
            # start from an inverted box so that any decoded point shrinks it
            bbox = np.array([[1000.0, 1000.0], [-1000.0, -1000.0]], dtype='float64')
            for hvalue in leaf.n_data:
                hx, hy = self.h_decode(int(hvalue), RES)
                x, y = self.get_Rcoord(hx, hy, RES)
                bbox[0, 0] = x if x < bbox[0, 0] else bbox[0, 0]
                bbox[1, 0] = x if x > bbox[1, 0] else bbox[1, 0]
                bbox[0, 1] = y if y < bbox[0, 1] else bbox[0, 1]
                bbox[1, 1] = y if y > bbox[1, 1] else bbox[1, 1]
            leaf.n_box = bbox

        # ## update the bounding boxes bottom-up
        tree = sorted(tree, cmp=self.cmp_node)
        logging.debug('updating box for each node in the tree...')
        for node in tree:
            if node.n_data is None:
                node.n_box = np.zeros((2, 2))
                node.n_box[0, 0] = min(node.ne.n_box[0, 0], node.nw.n_box[0, 0], node.se.n_box[0, 0], node.sw.n_box[0, 0])
                node.n_box[0, 1] = min(node.ne.n_box[0, 1], node.nw.n_box[0, 1], node.se.n_box[0, 1], node.sw.n_box[0, 1])
                node.n_box[1, 0] = max(node.ne.n_box[1, 0], node.nw.n_box[1, 0], node.se.n_box[1, 0], node.sw.n_box[1, 0])
                node.n_box[1, 1] = max(node.ne.n_box[1, 1], node.nw.n_box[1, 1], node.se.n_box[1, 1], node.sw.n_box[1, 1])

    def cmp_node(self, node1, node2):
        # sort in reverse depth order (deepest nodes first)
        return int(node2.n_depth - node1.n_depth)

    def getCoordinates(self, curr):
        budget_s = self.getSplitBudget()
        _data = curr.n_data
        _ndata = len(_data)
        split_1 = self.getSplit(_data, curr.n_box[0], curr.n_box[1], budget_s[curr.n_depth] / 2)
        pos_1 = np.searchsorted(_data, split_1)
        if pos_1 == 0 or pos_1 == _ndata:
            return False
        data_1 = _data[:pos_1]
        data_2 = _data[pos_1:]
        split_sec1 = self.getSplit(data_1, curr.n_box[0], split_1, budget_s[curr.n_depth] / 2)
        split_sec2 = self.getSplit(data_2, split_1, curr.n_box[1], budget_s[curr.n_depth] / 2)
        pos_sec1 = np.searchsorted(data_1, split_sec1)
        pos_sec2 = np.searchsorted(data_2, split_sec2)
        if pos_sec1 == 0 or pos_sec1 == len(data_1) or pos_sec2 == 0 or pos_sec2 == len(data_2):
            return False
        nw_data, ne_data, sw_data, se_data = data_1[:pos_sec1], data_1[pos_sec1:], data_2[:pos_sec2], data_2[pos_sec2:]
        return split_1, split_sec1, split_sec2, nw_data, ne_data, sw_data, se_data
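# The bit-twiddling helpers above depend only on their arguments, so they can
# be exercised without building an index. An illustrative round-trip check
# (not part of the original code); __new__ bypasses the data-dependent
# constructor since no tree state is touched by the helpers:

def _hilbert_roundtrip_ok(r=8):
    """Return True if h_decode inverts h_encode on a few sample grid cells."""
    h = Hilbert.__new__(Hilbert)
    samples = [(0, 0), (1, 2), (3, 5), (100, 27), (255, 255)]
    return all(h.h_decode(h.h_encode(x, y, r), r) == (x, y) for x, y in samples)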
class Generic(object):
    """Generic data structure, used for both htree and grid"""

    def __init__(self, data, param):
        self.param = param
        self.differ = Differential(self.param.Seed)
        # initialize the root
        self.root = Node()
        # self.children = []  # all level 2 grids
        self.root.n_data = data
        self.root.n_box = np.array([param.LOW, param.HIGH])

    def getEqualSplit(self, partitions, min, max):
        """return equal split points, including both ends"""
        if min > max:
            logging.debug("getEqualSplit: Error: min > max")
        if partitions <= 1:
            return [min, max]
        return [min + (max - min) * i / partitions for i in range(partitions + 1)]

    def getCountBudget(self):
        """return the noisy count budget for the different levels of the index"""
        raise NotImplementedError

    def getCoordinates(self, curr):
        """return the split dimension, the split points and the data points in each sub-node"""
        raise NotImplementedError

    def getCount(self, curr, epsilon):
        """
        return the true count or noisy count of a node, depending on epsilon.
        Note that the noisy count can be negative
        """
        if curr.n_data is None:
            count = 0
        else:
            count = curr.n_data.shape[1]
        if epsilon < 10 ** (-6):
            return count
        else:
            return count + self.differ.getNoise(1, epsilon)

    def testLeaf(self, curr):
        """test whether a node is a leaf node"""
        raise NotImplementedError

    def intersect(self, hrect, query):
        """
        check whether the hyper-rectangle intersects the hyper-rectangle
        defined by the query in every dimension
        """
        bool_m1 = query[0, :] >= hrect[1, :]
        bool_m2 = query[1, :] <= hrect[0, :]
        bool_m = np.logical_or(bool_m1, bool_m2)
        if np.any(bool_m):
            return False
        else:
            return True

    def buildIndex(self):
        """build the htree & grid structure; an htree is a high-fanout, low-depth tree"""
        budget_c = self.getCountBudget()  # an array with two elements
        self.root.n_count = self.getCount(self.root, 0)  # add noisy count to the root
        queue = deque()
        queue.append(self.root)
        nleaf = 0  # number of leaf nodes, for debugging only
        # ## main loop
        while len(queue) > 0:
            curr = queue.popleft()
            if self.testLeaf(curr) is True:  # curr is a leaf node
                if curr.n_depth < self.param.maxHeightHTree:
                    remainingEps = sum(budget_c[curr.n_depth:])
                    curr.n_count = self.getCount(curr, remainingEps)
                    curr.eps = remainingEps
                nleaf += 1
                curr.n_isLeaf = True
            else:  # curr needs to split
                split_arr, n_data_arr = self.getCoordinates(curr)
                if split_arr is None:  # the first-level cell is a leaf node
                    if curr.n_depth < self.param.maxHeightHTree:
                        remainingEps = sum(budget_c[curr.n_depth:])
                        curr.n_count = self.getCount(curr, remainingEps)
                        curr.eps = remainingEps
                    nleaf += 1
                    curr.n_isLeaf = True
                    curr.children = []
                    continue
                for i in range(len(n_data_arr)):
                    node = Node()
                    if curr.n_depth % Params.NDIM == 0:  # split by x coordinate
                        node.n_box = np.array([[split_arr[i], curr.n_box[0, 1]],
                                               [split_arr[i + 1], curr.n_box[1, 1]]])
                    else:  # split by y coordinate
                        node.n_box = np.array([[curr.n_box[0, 0], split_arr[i]],
                                               [curr.n_box[1, 0], split_arr[i + 1]]])
                    node.index = i
                    node.parent = curr
                    node.n_depth = curr.n_depth + 1
                    node.n_data = n_data_arr[i]
                    node.n_count = self.getCount(node, budget_c[node.n_depth])
                    node.eps = budget_c[node.n_depth]
                    if curr.n_depth == 2:
                        node.secondLevelPartitions = curr.secondLevelPartitions
                    curr.children.append(node)
                    queue.append(node)
                curr.n_data = None  # ## the data point coordinates are no longer needed
        # end of while
        logging.debug("Generic: number of leaves: %d" % nleaf)

    # canonical range query does apply
    def rangeCount(self, query):
        """
        Query answering function. Find the number of data points within a
        query rectangle. This function assumes the tree is constructed with a
        noisy count for every node.
        """
        queue = deque()
        queue.append(self.root)
        count = 0.0
        while len(queue) > 0:
            curr = queue.popleft()
            _box = curr.n_box
            if curr.n_isLeaf is True:
                frac = 1
                if self.intersect(_box, query):
                    for i in range(_box.shape[1]):
                        if _box[1, i] == _box[0, i] or Params.WorstCase:
                            frac *= 1
                        else:
                            frac *= (min(query[1, i], _box[1, i]) - max(query[0, i], _box[0, i])) / \
                                    (_box[1, i] - _box[0, i])
                    count += curr.n_count * frac
            else:  # not a leaf
                for node in curr.children:
                    n_box = node.n_box
                    bool_matrix = np.zeros((2, query.shape[1]))
                    bool_matrix[0, :] = query[0, :] <= n_box[0, :]
                    bool_matrix[1, :] = query[1, :] >= n_box[1, :]
                    if np.all(bool_matrix):  # the query range contains the child's range
                        count += node.n_count
                    elif self.intersect(n_box, query):
                        queue.append(node)
        return float(count)

    def leafCover(self, loc):
        """find a leaf node that covers the location"""
        queue = deque()
        queue.append(self.root)
        while len(queue) > 0:
            curr = queue.popleft()
            _box = curr.n_box
            if curr.n_isLeaf is True:
                if is_rect_cover(_box, loc):
                    return curr
            else:  # not a leaf
                queue.extend(curr.children)

    def checkCorrectness(self, node, nodePoints=None):
        """
        The total number of data points over all leaf nodes should equal the
        total number of data points.
        """
        totalPoints = 0
        if node is None:
            return 0
        if node.n_isLeaf and node.n_data is not None:
            return node.n_data.shape[1]
        for child in node.children:
            totalPoints += self.checkCorrectness(child)
        if nodePoints is None:
            return totalPoints
        if totalPoints == nodePoints:
            return True
        return False
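# In rangeCount, a partially covered leaf contributes its noisy count scaled
# by the fraction of its box that the query overlaps, i.e. a uniformity
# assumption inside each leaf. An illustrative standalone version of that
# computation (boxes are 2x2 arrays [[x_low, y_low], [x_high, y_high]]):

def _overlap_fraction(box, query):
    """Fraction of `box` covered by `query`; assumes the two rectangles intersect."""
    frac = 1.0
    for i in range(box.shape[1]):
        if box[1, i] == box[0, i]:
            continue  # a degenerate dimension contributes a factor of 1
        frac *= (min(query[1, i], box[1, i]) - max(query[0, i], box[0, i])) / (box[1, i] - box[0, i])
    return frac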
class GenericT(object):
    """Generic data structure, used for grid"""

    def __init__(self, data, param):
        self.param = param
        self.differ = Differential(self.param.Seed)
        # initialize the root
        self.root = NodeT()
        self.root.n_data = data
        self.root.n_box = np.array([param.LOW, param.HIGH])

    def getEqualSplit(self, partitions, min, max):
        """return equal split points, including both ends"""
        if min > max:
            logging.debug("getEqualSplit: Error: min > max")
        if partitions <= 1:
            return [min, max]
        return [min + (max - min) * i / partitions for i in range(partitions + 1)]

    def getCountBudget(self):
        """return the noisy count budget for the different levels of the index"""
        raise NotImplementedError

    def getCoordinates(self, curr):
        """return the split dimension, the split points and the data points in each sub-node"""
        raise NotImplementedError

    def getCount(self, curr, epsilon):
        """
        return the true count or noisy count of a node, depending on epsilon.
        Note that the noisy count can be negative
        """
        if curr.n_data is None:
            count = 0
        else:
            count = curr.n_data.shape[1]
        if epsilon < 10 ** (-8):
            return count
        else:
            return count + self.differ.getNoise(1, epsilon)

    def intersect(self, hrect, query):
        """
        check whether the hyper-rectangle intersects the hyper-rectangle
        defined by the query in every dimension
        """
        bool_m1 = query[0, :] >= hrect[1, :]
        bool_m2 = query[1, :] <= hrect[0, :]
        bool_m = np.logical_or(bool_m1, bool_m2)
        if np.any(bool_m):
            return False
        else:
            return True

    def testLeaf(self, curr):
        """test whether a node should be a leaf node"""
        if (curr.n_depth == Params.maxHeightAdaptiveGrid) or \
                (curr.n_data is None or curr.n_data.shape[1] == 0) or \
                (curr.n_count <= self.param.minPartSize):
            return True
        return False

    def buildIndex(self):
        """build the grid structure"""
        budget_c = self.getCountBudget()  # an array with two elements
        self.root.n_count = self.getCount(self.root, 0)  # add noisy count to the root
        queue = deque()
        queue.append(self.root)
        # ## main loop
        while len(queue) > 0:
            curr = queue.popleft()
            if curr.n_data is None:
                curr.a_count.append(0)
            else:
                curr.a_count.append(curr.n_data.shape[1])

            if self.testLeaf(curr) is True:  # curr is a leaf node
                remainingEps = sum(budget_c[curr.n_depth:])
                curr.n_count, curr.eps, curr.n_isLeaf = self.getCount(curr, remainingEps), remainingEps, True
                curr.l_count.append(curr.n_count)
            else:  # curr needs to split --> find the splitting granularity
                gran, split_arr_x, split_arr_y, n_data_matrix = self.getCoordinates(curr)
                if gran == 1:  # the first-level cell is a leaf node
                    remainingEps = sum(budget_c[curr.n_depth:])
                    curr.n_count, curr.eps, curr.n_isLeaf = self.getCount(curr, remainingEps), remainingEps, True
                    curr.children = None
                    curr.l_count.append(curr.n_count)
                    continue
                # add all sub-nodes to the queue
                for x in range(gran):
                    for y in range(gran):
                        node = NodeT()
                        node.n_box = np.array([[split_arr_x[x], split_arr_y[y]],
                                               [split_arr_x[x + 1], split_arr_y[y + 1]]])
                        node.index, node.parent, node.n_depth = x * gran + y, curr, curr.n_depth + 1
                        if n_data_matrix[x][y] is None:
                            node.n_data = None
                        else:
                            node.n_data = np.transpose(n_data_matrix[x][y])
                        node.n_count = self.getCount(node, budget_c[node.n_depth])
                        node.eps = budget_c[node.n_depth]
                        if node.n_depth == 2:
                            node.n_isLeaf = True
                        if curr.children is None:
                            curr.children = np.ndarray(shape=(gran, gran), dtype=NodeT)
                        curr.children[x][y] = node
                        queue.append(node)
                curr.n_data = None  # ## the data point coordinates are no longer needed
        # end of while

    # canonical range query does apply
    def rangeCount(self, query):
        """
        Query answering function. Find the number of data points within a
        query rectangle. This function assumes the tree is constructed with a
        noisy count for every node.
        """
        queue = deque()
        queue.append(self.root)
        count = 0.0
        while len(queue) > 0:
            curr = queue.popleft()
            _box = curr.n_box
            if curr.n_isLeaf is True:
                frac = 1
                if self.intersect(_box, query):
                    for i in range(_box.shape[1]):
                        if _box[1, i] == _box[0, i] or Params.WorstCase:
                            frac *= 1
                        else:
                            frac *= (min(query[1, i], _box[1, i]) - max(query[0, i], _box[0, i])) / \
                                    (_box[1, i] - _box[0, i])
                    count += curr.n_count * frac
            else:  # not a leaf
                for _, node in np.ndenumerate(curr.children):
                    n_box = node.n_box
                    bool_matrix = np.zeros((2, query.shape[1]))
                    bool_matrix[0, :] = query[0, :] <= n_box[0, :]
                    bool_matrix[1, :] = query[1, :] >= n_box[1, :]
                    if np.all(bool_matrix):  # the query range contains the child's range
                        count += node.n_count
                    elif self.intersect(n_box, query):
                        queue.append(node)
        return float(count)

    def leafCover(self, loc):
        """find the leaf node that covers the location"""
        gran_1st = len(self.root.children)
        x1 = min(gran_1st - 1,
                 int((loc[0] - self.root.n_box[0, 0]) * gran_1st / (self.root.n_box[1, 0] - self.root.n_box[0, 0])))
        y1 = min(gran_1st - 1,
                 int((loc[1] - self.root.n_box[0, 1]) * gran_1st / (self.root.n_box[1, 1] - self.root.n_box[0, 1])))
        node_1st = self.root.children[x1][y1]
        # Note that the actual count of a first-level cell may be zero while its
        # noisy count is > 0, in which case the cell may have been split into a
        # number of empty cells.
        if node_1st.n_isLeaf or node_1st.children is None:
            return node_1st
        else:
            gran_2nd = len(node_1st.children)
            x2 = min(gran_2nd - 1,
                     int((loc[0] - node_1st.n_box[0, 0]) * gran_2nd / (node_1st.n_box[1, 0] - node_1st.n_box[0, 0])))
            y2 = min(gran_2nd - 1,
                     int((loc[1] - node_1st.n_box[0, 1]) * gran_2nd / (node_1st.n_box[1, 1] - node_1st.n_box[0, 1])))
            return node_1st.children[x2][y2]

    def checkCorrectness(self, node, nodePoints=None):
        """
        The total number of data points over all leaf nodes should equal the
        total number of data points; only checks the FIRST time instance.
        """
        totalPoints = 0
        if node is None:
            return 0
        if (node.n_isLeaf and node.n_data is not None) or node.children is None:
            return node.a_count[0]
        for _, child in np.ndenumerate(node.children):
            totalPoints += self.checkCorrectness(child)
        if nodePoints is None:
            return totalPoints
        if totalPoints == nodePoints:
            return True
        return False
class Kd_cell(Kd_pure): """ Kd tree based on syntatic data generation and a grid structure. See Y. Xiao, L. Xiong, and C. Yuan, Differentially private data release through multidimensional partitioning, in SDM Workshop, VLDB, 2010 """ def __init__(self, data, param): self.param = param self.differ = Differential(self.param.Seed) self.mapp = None self.root = KNode() self.realData = data self.root.n_box = None self.root.n_budget = Params.maxHeight def getCountBudget(self): count_eps = self.param.Eps * 0.5 H = Params.maxHeight if self.param.geoBudget == 'none': return [count_eps / (H + 1) for _ in range(H + 1)] elif self.param.geoBudget == 'aggressive': unit = count_eps / (2 ** (H + 1) - 1) return [unit * 2 ** i for i in range(H + 1)] elif self.param.geoBudget == 'quadratic': unit = count_eps * (np.sqrt(2) - 1) / (2 ** (0.5 * (H + 1)) - 1) return [unit * 2 ** (0.5 * i) for i in range(H + 1)] elif self.param.geoBudget == 'optimal': unit = count_eps * ((2 ** (1.0 / 3)) - 1) / (2 ** ((1.0 / 3) * (H + 1)) - 1) return [unit * 2 ** ((1.0 / 3) * i) for i in range(H + 1)] elif self.param.geoBudget == 'quartic': unit = count_eps * ((2 ** (1.0 / 4)) - 1) / (2 ** ((1.0 / 4) * (H + 1)) - 1) return [unit * 2 ** ((1.0 / 4) * i) for i in range(H + 1)] else: logging.error('No such geoBudget scheme') sys.exit(1) def synthetic_gen(self): """Apply a grid structure on the domain and perturb the count using half of the available privacy budget """ logging.debug('generating synthetic map...') data = self.realData unit = Params.unitGrid x_min = np.floor(Params.LOW[0] / unit) * unit x_max = np.ceil(Params.HIGH[0] / unit) * unit y_min = np.floor(Params.LOW[1] / unit) * unit y_max = np.ceil(Params.HIGH[1] / unit) * unit x_CELL = int(np.rint((x_max - x_min) / unit)) y_CELL = int(np.rint((y_max - y_min) / unit)) self.root.n_box = np.array([[x_min, y_min], [x_max, y_max]]) self.mapp = np.zeros((x_CELL, y_CELL)) - 1 # ## initialize every cell with -1 for i in range(Params.NDATA): # ## populate the map point = data[:, i] cell_x = int(np.floor((point[0] - x_min) / unit)) cell_y = int(np.floor((point[1] - y_min) / unit)) if self.mapp[cell_x, cell_y] != -1: self.mapp[cell_x, cell_y] += 1 else: self.mapp[cell_x, cell_y] = 1 for i in range(x_CELL): # ## perturb the counts for j in range(y_CELL): if self.mapp[i, j] != -1: self.mapp[i, j] += np.rint(self.differ.getNoise(1, 0.5 * self.param.Eps)) else: self.mapp[i, j] = np.rint(self.differ.getNoise(1, 0.5 * self.param.Eps)) # if noisy count is negative, ignore the noise and generate no points if self.mapp[i, j] < 0: self.mapp[i, j] = 0 def cell_setLeaf(self, curr): """ Throw away the counts based on the syntatic data """ curr.n_count = 0 return def testLeaf(self, curr): if (curr.n_count <= self.param.minPartSize) or (curr.n_depth == Params.maxHeight) or ( self.uniform_test(curr, self.param.cellDistance)): return True return False def uniform_test(self, curr, distance): """ One of the stopping conditions: cell is uniform according to some threshold 'distance') """ unit = Params.unitGrid x_min = int(np.rint((curr.n_box[0, 0] - self.root.n_box[0, 0]) / unit)) x_max = int(np.rint((curr.n_box[1, 0] - self.root.n_box[0, 0]) / unit)) y_min = int(np.rint((curr.n_box[0, 1] - self.root.n_box[0, 1]) / unit)) y_max = int(np.rint((curr.n_box[1, 1] - self.root.n_box[0, 1]) / unit)) data = self.mapp[x_min:x_max, y_min:y_max] total = np.sum(data) avg = total / ((x_max - x_min) * (y_max - y_min)) dist = np.sum(np.abs(data - avg)) if dist > distance: return False else: return True def 
    def buildIndex(self):
        stack = deque()
        stack.append(self.root)
        nleaf = 0  # ## leaf counter
        max_depth = -1
        self.root.n_count = np.sum(self.mapp)

        while len(stack) > 0:
            curr = stack.popleft()
            if curr.n_depth > max_depth:
                max_depth = curr.n_depth

            if self.testLeaf(curr) is True:  # ## curr is a leaf node
                nleaf += 1
                curr.n_isLeaf = True
                self.cell_setLeaf(curr)
            else:  # ## curr needs to split
                curr.n_budget -= 1
                tmp = self.getCoordinates(curr)
                nw_node, ne_node, sw_node, se_node = KNode(), KNode(), KNode(), KNode()  # ## create sub-nodes
                nw_coord, ne_coord, count_tmp = tmp
                x_nw, y_nw = nw_coord
                x_se, y_se = ne_coord
                # ## update the bounding boxes of the four sub-nodes
                nw_node.n_box = np.array([[curr.n_box[0, 0], y_nw], [x_nw, curr.n_box[1, 1]]])
                ne_node.n_box = np.array([[x_nw, y_se], [curr.n_box[1, 0], curr.n_box[1, 1]]])
                sw_node.n_box = np.array([[curr.n_box[0, 0], curr.n_box[0, 1]], [x_se, y_nw]])
                se_node.n_box = np.array([[x_se, curr.n_box[0, 1]], [curr.n_box[1, 0], y_se]])
                for c_t, sub_node in enumerate([nw_node, ne_node, sw_node, se_node]):
                    sub_node.n_depth = curr.n_depth + 1
                    sub_node.n_count = count_tmp[c_t]  # ## counts come from the synthetic map
                    sub_node.n_budget = curr.n_budget
                    stack.append(sub_node)
                curr.nw, curr.ne, curr.sw, curr.se = nw_node, ne_node, sw_node, se_node
        # ## end of while

        logging.debug("number of leaves: %d" % nleaf)
        logging.debug("max depth: %d" % max_depth)

    def getCoordinates(self, curr):
        dim_1 = curr.n_depth % Params.NDIM  # ## primary split dimension
        UNIT = Params.unitGrid
        x_min = int(np.rint((curr.n_box[0, 0] - self.root.n_box[0, 0]) / UNIT))
        x_max = int(np.rint((curr.n_box[1, 0] - self.root.n_box[0, 0]) / UNIT))
        y_min = int(np.rint((curr.n_box[0, 1] - self.root.n_box[0, 1]) / UNIT))
        y_max = int(np.rint((curr.n_box[1, 1] - self.root.n_box[0, 1]) / UNIT))
        total = np.sum(self.mapp[x_min:x_max, y_min:y_max])

        if dim_1 == 0:  # ## split on x first
            # ## primary split: smallest column prefix holding at least half the total
            for i in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_max]) >= total / 2:
                    break
            split_prm = (x_min + i + 1) * UNIT + self.root.n_box[0, 0]
            half_1 = np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_max])
            half_2 = np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_max])
            # ## secondary split of the west half along y
            for j in range(y_max - y_min):
                if np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_min + j + 1]) >= half_1 / 2:
                    break
            split_sec1 = self.root.n_box[0, 1] + (y_min + j + 1) * UNIT
            n_sw = np.sum(self.mapp[x_min:x_min + i + 1, y_min:y_min + j + 1])
            n_nw = np.sum(self.mapp[x_min:x_min + i + 1, y_min + j + 1:y_max])
            # ## secondary split of the east half along y
            for k in range(y_max - y_min):
                if np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_min + k + 1]) >= half_2 / 2:
                    break
            split_sec2 = self.root.n_box[0, 1] + (y_min + k + 1) * UNIT
            n_se = np.sum(self.mapp[x_min + i + 1:x_max, y_min:y_min + k + 1])
            n_ne = np.sum(self.mapp[x_min + i + 1:x_max, y_min + k + 1:y_max])
            return (split_prm, split_sec1), (split_prm, split_sec2), (n_nw, n_ne, n_sw, n_se)

        else:  # ## split on y first
            for i in range(y_max - y_min):
                if np.sum(self.mapp[x_min:x_max, y_min:y_min + i + 1]) >= total / 2:
                    break
            split_prm = self.root.n_box[0, 1] + (y_min + i + 1) * UNIT
            half_1 = np.sum(self.mapp[x_min:x_max, y_min:y_min + i + 1])
            half_2 = np.sum(self.mapp[x_min:x_max, y_min + i + 1:y_max])
            # ## secondary split of the south half along x
            for j in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + j + 1, y_min:y_min + i + 1]) >= half_1 / 2:
                    break
            split_sec1 = (x_min + j + 1) * UNIT + self.root.n_box[0, 0]
            n_sw = np.sum(self.mapp[x_min:x_min + j + 1, y_min:y_min + i + 1])
            n_se = np.sum(self.mapp[x_min + j + 1:x_max, y_min:y_min + i + 1])
            # ## secondary split of the north half along x
            for k in range(x_max - x_min):
                if np.sum(self.mapp[x_min:x_min + k + 1, y_min + i + 1:y_max]) >= half_2 / 2:
                    break
            split_sec2 = (x_min + k + 1) * UNIT + self.root.n_box[0, 0]
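            # Quadrant counts for the y-primary case, sketched (x grows east
            # and y grows north, matching the box convention
            # [[x_lo, y_lo], [x_hi, y_hi]] used throughout this file):
            #
            #     nw | ne     <- cells with y >= split_prm
            #     ---+---
            #     sw | se     <- cells with y <  split_prm
            #
            # n_sw and n_se were accumulated below the primary split; the two
            # remaining counts above it follow.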
            n_nw = np.sum(self.mapp[x_min:x_min + k + 1, y_min + i + 1:y_max])
            n_ne = np.sum(self.mapp[x_min + k + 1:x_max, y_min + i + 1:y_max])
            return (split_sec2, split_prm), (split_sec1, split_prm), (n_nw, n_ne, n_sw, n_se)

    def populate_synthetic_tree(self):
        """ Populate real data into the synthetic tree """
        logging.debug('populating synthetic tree...')
        a_data = self.realData
        ndata = a_data.shape[1]
        for i in range(ndata):
            ptx = a_data[0, i]
            pty = a_data[1, i]
            leaf = self.root.find_subnode(ptx, pty)
            leaf.n_count += 1

        # ## traverse the tree and perturb the leaf counts
        stack = deque()
        stack.append(self.root)
        while len(stack) > 0:
            cur_node = stack.popleft()
            if cur_node.n_isLeaf is True:  # ## leaf
                cur_node.n_count += self.differ.getNoise(1, 0.5 * self.param.Eps)
            else:
                stack.append(cur_node.nw)
                stack.append(cur_node.ne)
                stack.append(cur_node.sw)
                stack.append(cur_node.se)
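# A minimal usage sketch for Kd_cell (a hypothetical driver, not part of the
# original code): `data` is assumed to be a 2 x NDATA numpy array and `param`
# a populated Params instance. The split structure never touches the real
# points directly; only the grid counts and the leaf counts consume budget.
def _kd_cell_demo(data, param):
    tree = Kd_cell(data, param)
    tree.synthetic_gen()  # perturbed grid counts, half the budget
    tree.buildIndex()  # kd splits driven by the synthetic map only
    tree.populate_synthetic_tree()  # noisy real counts in the leaves
    return tree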
class KalmanFilterPID(Parser):
    """ Kalman filter with PID-controlled adaptive sampling """

    def __init__(self, param):
        Parser.__init__(self)
        self.param = param
        self.differ = Differential(self.param.Seed)

        self.predict = []
        self.interval = None

        # Kalman filter params
        self.P = 100  # estimation error covariance (over all time instances)
        self.Q = 1000  # process noise, tuned for synthetic data
        self.R = 1000000  # measurement noise, optimal for alpha = 1, synthetic data
        self.K = 0  # kalman gain

        # PID control params - default
        self.Cp = 0.9  # proportional gain, to keep output proportional to current error
        self.Ci = 0.1  # integral gain, to eliminate offset
        self.Cd = 0.0  # derivative gain, to ensure stability - prevent large error in future

        # fixed internally
        self.theta = 1  # magnitude of changes
        self.xi = 0.2  # target relative error gamma (20%)
        self.minIntvl = 1  # make sure the interval is at least 1

        self.windowPID = 5  # I (integration) window
        self.ratioM = 0.2  # sampling rate
        self.isSampling = False

    def adjustParams(self):
        # adjust theta according to the sampling rate
        if self.ratioM < 0.1:
            self.theta = 20
        elif self.ratioM < 0.2:
            self.theta = 14
        elif self.ratioM < 0.3:
            self.theta = 2
        elif self.ratioM < 0.4:
            self.theta = 0.5
        elif self.ratioM < 0.5:
            self.theta = 0.3
        else:
            self.theta = 0.1

    # test
    @classmethod
    def main(cls, args):
        if len(args) < 5:
            print "Usage: python KalmanFilterPID.py input output privacy-budget process-variance Cp(optional) Ci(optional) Cd(optional)"
            sys.exit()
        output = open(args[2], "w")
        budget = float(args[3])
        Q = float(args[4])
        if budget <= 0 or Q <= 0:
            print "Usage: privacy-budget AND process-variance are positive values"
            sys.exit()

        p = Params(1000)
        kfPID = KalmanFilterPID(p)
        kfPID.setTotalBudget(budget)
        kfPID.setQ(Q)
        kfPID.orig = Parser.getData(args[1])
        kfPID.publish = [None] * len(kfPID.orig)
        # adjust R based on T and alpha
        kfPID.setR(len(kfPID.orig) * len(kfPID.orig) / (0.0 + budget * budget))

        # set optional control gains, clamped so that Cp + Ci + Cd = 1
        if len(args) >= 6:
            d = float(args[5])
            if d > 1:
                d = 1
            kfPID.setCp(d)
        if len(args) >= 7:
            d = float(args[6])
            if d + kfPID.Cp > 1:
                d = 1 - kfPID.Cp
            kfPID.setCi(d)
        else:
            kfPID.setCi(1 - kfPID.Cp)
        if len(args) >= 8:
            d = float(args[7])
            if d + kfPID.Cp + kfPID.Ci > 1:
                d = 1 - kfPID.Cp - kfPID.Ci
            kfPID.setCd(d)
        else:
            kfPID.setCd(1 - kfPID.Cp - kfPID.Ci)
        # kfPID.adjustParams()

        start = time.time()
        kfPID.publishCounts()
        end = time.time()

        Parser.outputData(output, kfPID.publish)

        print "Method:\tKalman Filter with Adaptive Sampling"
        print "Data Series Length:\t" + str(len(kfPID.orig))
        print "Queries Issued:\t" + str(kfPID.query.count(1))
        print "Privacy Budget Used:\t" + str(kfPID.query.count(1) * kfPID.epsilon)
        print "Average Relative Error:\t" + str(kfPID.getRelError())
        print "Time Used (in seconds):\t" + str(end - start)

    def kalmanFilter(self, orig, budget, samplingRate=None):
        """ Run the filter over a series with the given total budget and
        return the published series """
        self.totalBudget = budget
        self.orig = orig
        if samplingRate is not None:
            self.isSampling = True
            self.ratioM = samplingRate
        else:
            self.isSampling = False
        # self.adjustParams()

        self.publish = [None] * len(self.orig)
        # adjust R based on T and alpha
        self.setR(len(self.orig) * len(self.orig) / (0.0 + budget * budget))
        self.publishCounts()
        return self.publish
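    # Why R is set to T^2 / budget^2 above (a reading of the code, not a claim
    # from the original authors): with M queried timestamps sharing the budget,
    # each observation carries Laplace noise of scale M / budget, whose
    # variance 2 * M^2 / budget^2 is of order T^2 / budget^2 when M ~ T. R is
    # therefore matched to the measurement-noise variance up to a constant.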
    def getCount(self, value, epsilon):
        """ Return the true count or noisy count of a value, depending on
        epsilon. Note that the noisy count can be negative """
        if epsilon < 10 ** (-8):
            return value
        else:
            return value + self.differ.getNoise(1, epsilon)  # sensitivity is 1

    # data publication procedure
    def publishCounts(self):
        """ Publish the series: query at adaptively chosen timestamps,
        predict everywhere else """
        self.query = BitArray(len(self.orig))
        self.predict = [None] * len(self.orig)

        # recalculate the individual budget based on M
        if self.isSampling:
            M = int(self.ratioM * len(self.orig))  # 0.25 optimal percentile
        else:
            M = len(self.orig)
        if M <= 0:
            M = 1
        self.epsilon = (self.totalBudget + 0.0) / M

        self.interval = 1
        nextQuery = max(1, self.windowPID) + self.interval - 1

        for i in range(len(self.orig)):
            if i == 0:  # the first time instance
                self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                self.query[i] = 1
                self.correctKF(i, 0)
            else:
                predct = self.predictKF(i)
                self.predict[i] = predct
                if self.query.count(1) < self.windowPID and self.query.count(1) < M:
                    # still filling the initial PID window: always query
                    self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                    self.query[i] = 1
                    # update the count using the observation
                    self.correctKF(i, predct)
                elif i == nextQuery and self.query.count(1) < M:  # i is the next scheduled sampling point
                    # query
                    self.publish[i] = self.getCount(self.orig[i], self.epsilon)
                    self.query[i] = 1
                    # update the count using the observation
                    self.correctKF(i, predct)
                    # update the sampling frequency
                    if self.isSampling:
                        ratio = self.PID(i)
                        frac = min(20, (ratio - self.xi) / self.xi)
                        deltaI = self.theta * (1 - math.exp(frac))
                        # carry the fractional part of deltaI probabilistically
                        deltaI = int(deltaI) + (random.random() < deltaI - int(deltaI))
                        self.interval += deltaI
                    else:
                        self.interval = 1
                    if self.interval < self.minIntvl:
                        self.interval = self.minIntvl
                    nextQuery += self.interval  # nextQuery is ns in the paper
                else:  # --> predict
                    self.publish[i] = predct

    # def postProcessing(self):
    #     remainedEps = self.totalBudget - len(self.samples) * self.epsilon
    #     self.epsilon = self.epsilon + remainedEps / len(self.samples)
    #
    #     # recompute noisy counts
    #     prev = 0
    #     for i in self.samples:
    #         self.publish[i] = self.getCount(self.orig[i], self.epsilon)
    #         if i > prev + 1:
    #             self.publish[prev + 1: i] = [self.publish[prev]] * (i - prev - 1)
    #         prev = i

    def setR(self, r):
        """ set the measurement noise variance R """
        self.R = r

    def setQ(self, q):
        """ set the process noise variance Q """
        self.Q = q

    def setCp(self, cp):
        """ set the proportional gain """
        self.Cp = cp

    def setCi(self, ci):
        """ set the integral gain """
        self.Ci = ci

    def setCd(self, cd):
        """ set the derivative gain """
        self.Cd = cd

    # prediction step
    def predictKF(self, curr):
        """ predict the current count from the last published query """
        lastValue = self.getLastQuery(curr)
        # project the estimation error
        self.P += self.Q  # Q is the process noise variance
        return lastValue

    # correction step
    def correctKF(self, curr, predict):
        """ correct the prediction with the noisy observation """
        self.K = (self.P + 0.0) / (self.P + self.R)
        correct = predict + self.K * (self.publish[curr] - predict)
        # publish[curr] = Math.max((int) correct, 0)
        if curr > 0:  # only correct from the 2nd value onward
            self.publish[curr] = correct
        # update the estimation error variance
        self.P *= (1 - self.K)

    def getLastQuery(self, curr):
        """ return the most recently published value that was queried """
        for i in reversed(range(curr)):
            if self.query[i]:
                break
        return self.publish[i]
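    # How the sampling interval reacts to the PID error in publishCounts above
    # (a summary, with xi the target error and theta the step magnitude):
    #
    #     deltaI = theta * (1 - exp((ratio - xi) / xi))
    #
    # ratio > xi (error above target) -> deltaI < 0, so the next query comes
    # sooner; ratio < xi (error below target) -> deltaI > 0, so the interval
    # stretches and budget is saved. The fractional part of deltaI is carried
    # over probabilistically via `random.random() < deltaI - int(deltaI)`.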
    # adaptive sampling - return the PID feedback error
    def PID(self, curr):
        """ Compute the PID feedback error over the windowPID most recent queries """
        err_sum = 0    # integral term: total relative error in the window
        lastValue = 0  # proportional term: relative error at the newest query
        change = 0     # derivative term: error change between the two newest queries
        timeDiff = 0
        nxt = curr
        # walk backward over the windowPID most recent queried timestamps,
        # newest (j == windowPID - 1) to oldest (j == 0)
        for j in reversed(range(self.windowPID)):
            index = nxt
            while index >= 0:
                if self.query[index]:
                    nxt = index - 1  # resume the scan just before this query
                    break
                index -= 1
            relErr = abs(self.publish[index] - self.predict[index]) / (0.0 + max(self.publish[index], 1))
            if j == self.windowPID - 1:  # the newest queried timestamp
                lastValue = relErr
                change = relErr
                timeDiff = index
            if j == self.windowPID - 2:  # the second newest queried timestamp
                change -= relErr
                timeDiff -= index
            err_sum += relErr
        ratio = self.Cp * lastValue + self.Ci * err_sum + self.Cd * change / (0.0 + timeDiff)
        return ratio
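# The scalar Kalman recursion implemented by predictKF/correctKF above,
# gathered in one place for reference (z is the freshly perturbed count):
#
#     predict:  x_prior = last published query,   P <- P + Q
#     gain:     K = P / (P + R)
#     correct:  x <- x_prior + K * (z - x_prior), P <- P * (1 - K)
#
# Because the state is a single count, every quantity is a scalar and the
# usual matrix form of the filter collapses to these four updates.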