def find(data, domain, goal_number, failure, eps, sparse=True):
    """
    Privately estimate the radius of a small ball holding about goal_number points.

    Based on "Locating a Small Cluster Privately" by Kobbi Nissim, Uri Stemmer,
    and Salil Vadhan. PODS 2016.

    A Laplace-noised zero-radius test runs first and returns 0 when it passes.
    Otherwise a set of candidate radii is built from ``domain`` and one of them
    is selected by ``exponential_mechanism_big`` using half of ``eps``.

    :param data: points in R^dimension; ``data.shape[1]`` is read as the dimension
    :param domain: tuple(absolute value of domain's end as int, minimum intervals in domain as float)
    :param goal_number: the number of desired points in the resulting cluster
    :param failure: 0 < float < 1. chances that the procedure will fail to return an answer
    :param eps: float > 0. privacy parameter
    :param sparse: truthy -> candidates come from ``__sparse_domain__``,
        falsy -> from ``__create_regular_domain__``
    :return: 0 if the noisy zero-radius test passes, otherwise the value
        returned by ``exponential_mechanism_big`` (the radius of the resulting cluster)
    """
    pairwise = distances(data)

    # Slack term; per the original note it needs to be greater than
    # log(domain[0] / failure) / eps.
    slack = 2 * log(domain[0] / failure) / eps
    zero_radius_threshold = goal_number - slack - log(1 / failure) / eps

    # TODO verify that the noise addition is correct
    noisy_zero_count = (__max_average_ball__(0, pairwise, goal_number)
                        + laplace(0, 1 / eps, 1))
    if noisy_zero_count > zero_radius_threshold:
        return 0

    dimension = data.shape[1]
    # TODO maybe a little less sparse?
    build_domain = __sparse_domain__ if sparse else __create_regular_domain__
    candidates = build_domain(domain, dimension)

    def quality(d, r):
        # d is not used; the score depends only on the candidate radius r.
        shortfall = goal_number - __max_average_ball__(r / 2, pairwise, goal_number)
        surplus = __max_average_ball__(r, pairwise, goal_number) - goal_number + 2*slack
        return min(shortfall, surplus) / 2

    return exponential_mechanism_big(data, candidates, quality, eps / 2)
def find_cluster(data_set, k):
    """
    Pick the point of data_set whose k-th sorted distance is the smallest.

    For every point, the row of ``distances(data_set)`` at that point's
    position is sorted in ascending order and the element at index ``k`` is
    taken. The point with the smallest such value is returned; on ties the
    earliest point wins.

    :param data_set: iterable of points, in the same order as the rows of
        ``distances(data_set)``
    :param k: number of desired points in cluster; used as the index into each
        sorted distance row
    :return: tuple (r, c) - the smallest k-th sorted distance and the point
        that attains it
    :raises ValueError: if data_set contains no points
    """
    distance = distances(data_set)

    best_radius = None
    best_center = None
    # The row index comes straight from enumerate. Looking it up with
    # where(data_set == point)[0][0] compares element-wise, so it returned the
    # first row sharing ANY coordinate with the point and could select the
    # wrong distance row.
    for point_index, point in enumerate(data_set):
        near_k = np.sort(distance[point_index])[k]
        if best_radius is None or near_k < best_radius:
            best_radius, best_center = near_k, point

    if best_radius is None:
        raise ValueError('find_cluster requires a non-empty data_set')
    return best_radius, best_center
def find(data, goal_number, failure, eps, delta, promise=-1):
    """
    Estimate a cluster radius via ``evaluate`` over a precomputed radius table.

    The span of the data (largest coordinate minus smallest coordinate) is
    used as the domain. A Laplace-noised zero-radius test runs first and
    returns 0 when it passes. Otherwise ``__max_average_ball__`` is tabulated
    for radii 0, 0.5, 1, ... below the domain rounded up to a power of two,
    and the search is delegated to ``evaluate``.

    NOTE(review): ``xrange`` is used below, so this presumably targets
    Python 2 - confirm before porting.

    :param data: point set accepted by ``np.max(data, axis=0)`` and ``distances``
    :param goal_number: the number of desired points in the resulting cluster
    :param failure: failure probability; used in ``log(2/failure)``
    :param eps: privacy parameter; the Laplace noise scale is ``4/eps``
    :param delta: privacy parameter, forwarded to ``__promise__`` and ``evaluate``
    :param promise: quality promise; -1 means "compute it with ``__promise__``"
    :return: 0 if the noisy zero-radius test passes, otherwise the value
        returned by ``evaluate``
    """
    largest = max(np.max(data, axis=0))
    smallest = min(np.min(data, axis=0))
    domain = abs(largest - smallest)

    if promise == -1:
        promise = __promise__(data, domain, eps, delta, failure)

    all_distances = distances(data)

    noisy_zero_count = (__max_average_ball__(0, all_distances, goal_number)
                        + laplace(0, 4/eps, 1))
    zero_radius_threshold = goal_number - 2*promise - 4/eps*log(2/failure)
    if noisy_zero_count > zero_radius_threshold:
        return 0

    # Round the domain up to the next power of two.
    extended_domain = 2**int(ceil(log2(domain)))

    # Entry i of the table belongs to radius i * 0.5.
    max_averages_by_radius = []
    for radius in arange(0, extended_domain, 0.5):
        max_averages_by_radius.append(
            __max_average_ball__(radius, all_distances, goal_number))

    def quality(d, r):
        # d is not used. With the 0.5 step, table index r is radius r / 2 and
        # table index 2 * r is radius r.
        try:
            shortfall = goal_number - max_averages_by_radius[r]
            surplus = max_averages_by_radius[2 * r] - goal_number + 4 * promise
        except IndexError:
            raise IndexError('error while trying to qualify %f' % r)
        return min(shortfall, surplus) / 2

    # TODO must complete those two
    def radius_interval_bounding(data_set, domain_end, j):
        # Best, over all windows of 2**j consecutive radii, of the worst
        # quality inside the window.
        width = 2**j
        worst_per_window = (
            min(quality(data_set, i) for i in xrange(start, start + width))
            for start in xrange(domain_end - width)
        )
        return max(worst_per_window)

    def max_radius_in_interval(data_set, i):
        # Highest quality among the radii contained in the iterable i.
        return max(quality(data_set, radius) for radius in i)

    return evaluate(data, domain, quality, promise, 0.5, eps, delta,
                    radius_interval_bounding, max_radius_in_interval)