def evaluate(data, range_max_value, quality_function, quality_promise, approximation, eps, delta,
             intervals_bounding, max_in_interval, use_exponential=True):
    """
    RecConcave algorithm for the specific case of N=2.

    NOTE(review): a second definition named ``evaluate`` with the same
    signature appears later in this file; the later one is the one that stays
    bound after the module is loaded.

    :param data: the main data-set
    :param range_max_value: maximum possible output (the minimum output is 0)
    :param quality_function: function called as quality_function(data, j)
        that returns the quality of domain element j (as a float)
    :param quality_promise: float, a quality value for which we can assure
        that some domain element has at least that quality
    :param approximation: 0 < float < 1. the approximation level of the result
    :param eps: float > 0. privacy parameter
    :param delta: 1 > float > 0. privacy parameter (only passed on to
        basicdp.a_dist, i.e. only used when use_exponential is False)
    :param intervals_bounding: function L, called here as
        intervals_bounding(data, range_max_value_tag, j)
    :param max_in_interval: function u, called with (data, interval) plus the
        keyword argument interval_length; expected to return the maximum of
        quality_function(data, j) for j in the interval
    :param use_exponential: the original version uses the A_dist mechanism.
        For utility reasons the exponential mechanism is the default.
        Turn to False to use A_dist instead
    :return: an element of the domain with approximately maximum value of
        the quality function
    :raises ValueError: if both interval selections return a string
        (treated here as a failure marker)
    """
    # xrange only exists on Python 2; on Python 3 range is already lazy.
    try:
        lazy_range = xrange
    except NameError:
        lazy_range = range

    # step 2: pad the domain up to the next power of two
    log_of_range = int(math.ceil(math.log(range_max_value, 2)))
    range_max_value_tag = 2 ** log_of_range

    def extended_quality_function(data_base, j):
        # Padding elements in (range_max_value, range_max_value_tag] are
        # scored by min(0, quality of range_max_value).
        if range_max_value < j <= range_max_value_tag:
            return min(0, quality_function(data_base, range_max_value))
        return quality_function(data_base, j)

    # step 4
    def recursive_quality_function(data_base, j):
        return min(
            intervals_bounding(data_base, range_max_value_tag, j) - (1 - approximation) * quality_promise,
            quality_promise - intervals_bounding(data_base, range_max_value_tag, j + 1))

    # step 6: privately pick the exponent that determines the interval length
    recursion_returned = basicdp.exponential_mechanism_big(
        data, list(range(log_of_range + 1)), recursive_quality_function, eps)
    good_interval = 8 * (2 ** recursion_returned)

    # step 7: two partitions into intervals of length good_interval; the
    # second call passes an extra True flag (presumably a half-interval shift,
    # matching second_full_domain below -- confirm in __build_intervals_set__)
    first_intervals = __build_intervals_set__(data, good_interval, 0, range_max_value_tag)
    second_intervals = __build_intervals_set__(data, good_interval, 0, range_max_value_tag, True)
    max_quality = partial(max_in_interval, interval_length=good_interval)

    # step 9 ( using 'dist' algorithm )
    # TODO should I add switch for sparse?
    # TODO make sure it is still generic!!!!!!!!!!!!!!
    if use_exponential:
        first_full_domain = lazy_range(0, range_max_value, good_interval)
        # Floor division keeps the start an integer under true division too.
        second_full_domain = lazy_range(good_interval // 2, range_max_value, good_interval)
        first_chosen_interval = basicdp.sparse_domain(
            basicdp.exponential_mechanism_big, data, first_full_domain, first_intervals, max_quality, eps)
        second_chosen_interval = basicdp.sparse_domain(
            basicdp.exponential_mechanism_big, data, second_full_domain, second_intervals, max_quality, eps)
    else:
        first_chosen_interval = basicdp.a_dist(data, first_intervals, max_quality, eps, delta)
        second_chosen_interval = basicdp.a_dist(data, second_intervals, max_quality, eps, delta)

    first_failed = isinstance(first_chosen_interval, str)
    second_failed = isinstance(second_chosen_interval, str)
    if first_failed and second_failed:
        raise ValueError("stability problem, try taking more samples!")

    # step 10: the candidates are all elements of every successfully chosen
    # interval; explicit lists so the concatenation works on Python 2 and 3.
    if first_failed:
        first_chosen_interval_as_list = []
    else:
        first_chosen_interval_as_list = list(
            range(first_chosen_interval, first_chosen_interval + good_interval))

    if second_failed:
        second_chosen_interval_as_list = []
    else:
        second_chosen_interval_as_list = list(
            range(second_chosen_interval, second_chosen_interval + good_interval))

    return basicdp.exponential_mechanism_big(
        data,
        first_chosen_interval_as_list + second_chosen_interval_as_list,
        extended_quality_function,
        eps)
def __rec_sanitize__(samples, domain_range, alpha, beta, eps, delta, dimension):
    """
    Recursive step of the sanitizer: fill the module-level ``san_data`` list
    with noisy copies of points chosen from ``domain_range``.

    The function reads and updates two module-level globals: ``calls`` (the
    remaining recursion budget, decremented on every invocation) and
    ``san_data`` (the output list, extended in place).

    :param samples: the sampled data-set
    :param domain_range: pair (low, high) of inclusive domain bounds
    :param alpha: accuracy parameter (float)
    :param beta: passed unchanged to choosing_mechanism
    :param eps: float > 0. privacy parameter
    :param delta: privacy parameter, passed unchanged to choosing_mechanism
    :param dimension: value given to log_star() to scale eps for the
        exponential mechanism
    :return: ``san_data``, or None when the ``calls`` budget was already
        exhausted on entry
    :raises ValueError: if choosing_mechanism returns a string
        (the 'bottom' failure marker)
    """
    global calls
    global san_data

    # step 1: stop once the recursion budget is used up
    if calls == 0:
        return
    calls -= 1

    # step 2
    # the use of partial is redundant
    samples_domain_points = partial(points_in_subset, samples)
    # 1.0 / eps avoids integer truncation if eps is an int on Python 2
    noisy_points_in_range = samples_domain_points(subset=domain_range) + laplace(0, 1.0 / eps, 1)
    sample_size = len(samples)

    # step 3: base case - too few points left in this range.
    # Emit (noisy count) copies of the range's upper end, the same
    # "[point] * count" form step 11 uses.  The previous code multiplied the
    # point's value by the noisy count instead of repeating the point.
    if noisy_points_in_range < alpha * sample_size / 8:
        san_data.extend([domain_range[1]] * int(noisy_points_in_range))
        return san_data

    # step 4
    domain_size = domain_range[1] - domain_range[0] + 1
    log_size = int(ceil(log(domain_size, 2)))

    # step 6
    def quality(data, j):
        return min(point_count_intervals_bounding(data, domain_range, j) - alpha * sample_size / 32,
                   3 * alpha * sample_size / 32 - point_count_intervals_bounding(data, domain_range, j - 1))

    # step 7 (promise = alpha * sample_size / 32) is not needed when using
    # exponential_mechanism

    # step 8
    new_eps = eps / 3 / log_star(dimension)
    # note the use of exponential_mechanism instead of rec_concave
    z_tag = exponential_mechanism(samples, list(range(log_size + 1)), quality, new_eps)
    z = 2 ** z_tag

    if z_tag == 0:
        # step 9: choose a single point, scored by its multiplicity
        point_counter = Counter(samples)

        def special_quality(data, b):
            return point_counter[b]

        chosen = choosing_mechanism(samples, list(range(domain_range[0], domain_range[1] + 1)),
                                    special_quality, 1, alpha / 64., beta, eps, delta)
    else:
        # step 10: choose an interval of length 2*z.  Both interval sets must
        # use that same length; the second one previously used 2*z_tag, which
        # did not match the (i, i + 2*z - 1) intervals built below.
        interval_length = 2 * z
        first_intervals = __build_intervals_set__(
            samples, interval_length, domain_range[0], domain_range[1] + 1)
        second_intervals = __build_intervals_set__(
            samples, interval_length, domain_range[0], domain_range[1] + 1, True)
        intervals = [(i, i + interval_length - 1) for i in first_intervals + second_intervals]
        chosen = choosing_mechanism(samples, intervals, points_in_subset, 2, alpha / 64., beta, eps, delta)

    # Check for the failure marker before unpacking, so both branches raise
    # the same explicit error.
    if isinstance(chosen, str):
        raise ValueError("stability problem - choosing_mechanism returned 'bottom'")
    if z_tag == 0:
        a = b = chosen
    else:
        a, b = chosen

    # step 11
    # although not mentioned I assume the noisy value should be rounded
    # (int() truncates toward zero)
    noisy_count_ab = int(samples_domain_points((a, b)) + laplace(0, 1.0 / eps, 1))
    san_data.extend([b] * noisy_count_ab)

    # step 12: recurse on what is left on either side of [a, b]
    if a > domain_range[0]:
        rec_range = (domain_range[0], a - 1)
        __rec_sanitize__(samples, rec_range, alpha, beta, eps, delta, dimension)
    if b < domain_range[1]:
        rec_range = (b + 1, domain_range[1])
        __rec_sanitize__(samples, rec_range, alpha, beta, eps, delta, dimension)
    return san_data
def evaluate(
    data,
    range_max_value,
    quality_function,
    quality_promise,
    approximation,
    eps,
    delta,
    intervals_bounding,
    max_in_interval,
    use_exponential=True,
):
    """
    RecConcave algorithm for the specific case of N=2.

    NOTE(review): an earlier definition named ``evaluate`` with the same
    signature appears above in this file; this later definition replaces it
    when the module is loaded.

    :param data: the main data-set
    :param range_max_value: maximum possible output (the minimum output is 0)
    :param quality_function: function called as quality_function(data, j)
        that returns the quality of domain element j (as a float)
    :param quality_promise: float, a quality value for which we can assure
        that some domain element has at least that quality
    :param approximation: 0 < float < 1. the approximation level of the result
    :param eps: float > 0. privacy parameter
    :param delta: 1 > float > 0. privacy parameter (only passed on to
        basicdp.a_dist, i.e. only used when use_exponential is False)
    :param intervals_bounding: function L, called here as
        intervals_bounding(data, range_max_value_tag, j)
    :param max_in_interval: function u, called with (data, interval) plus the
        keyword argument interval_length; expected to return the maximum of
        quality_function(data, j) for j in the interval
    :param use_exponential: the original version uses the A_dist mechanism.
        For utility reasons the exponential mechanism is the default.
        Turn to False to use A_dist instead
    :return: an element of the domain with approximately maximum value of
        the quality function
    :raises ValueError: if both interval selections return a string
        (treated here as a failure marker)
    """
    # xrange only exists on Python 2; on Python 3 range is already lazy.
    try:
        lazy_range = xrange
    except NameError:
        lazy_range = range

    # step 2: pad the domain up to the next power of two
    log_of_range = int(math.ceil(math.log(range_max_value, 2)))
    range_max_value_tag = 2 ** log_of_range

    def extended_quality_function(data_base, j):
        # Padding elements in (range_max_value, range_max_value_tag] are
        # scored by min(0, quality of range_max_value).
        if range_max_value < j <= range_max_value_tag:
            return min(0, quality_function(data_base, range_max_value))
        return quality_function(data_base, j)

    # step 4
    def recursive_quality_function(data_base, j):
        return min(
            intervals_bounding(data_base, range_max_value_tag, j) - (1 - approximation) * quality_promise,
            quality_promise - intervals_bounding(data_base, range_max_value_tag, j + 1),
        )

    # step 6: privately pick the exponent that determines the interval length
    recursion_returned = basicdp.exponential_mechanism_big(
        data, list(range(log_of_range + 1)), recursive_quality_function, eps
    )
    good_interval = 8 * (2 ** recursion_returned)

    # step 7: two partitions into intervals of length good_interval; the
    # second call passes an extra True flag (presumably a half-interval shift,
    # matching second_full_domain below -- confirm in __build_intervals_set__)
    first_intervals = __build_intervals_set__(data, good_interval, 0, range_max_value_tag)
    second_intervals = __build_intervals_set__(data, good_interval, 0, range_max_value_tag, True)
    max_quality = partial(max_in_interval, interval_length=good_interval)

    # step 9 ( using 'dist' algorithm )
    # TODO should I add switch for sparse?
    # TODO make sure it is still generic!!!!!!!!!!!!!!
    if use_exponential:
        first_full_domain = lazy_range(0, range_max_value, good_interval)
        # Floor division keeps the start an integer under true division too.
        second_full_domain = lazy_range(good_interval // 2, range_max_value, good_interval)
        first_chosen_interval = basicdp.sparse_domain(
            basicdp.exponential_mechanism_big, data, first_full_domain, first_intervals, max_quality, eps
        )
        second_chosen_interval = basicdp.sparse_domain(
            basicdp.exponential_mechanism_big, data, second_full_domain, second_intervals, max_quality, eps
        )
    else:
        first_chosen_interval = basicdp.a_dist(data, first_intervals, max_quality, eps, delta)
        second_chosen_interval = basicdp.a_dist(data, second_intervals, max_quality, eps, delta)

    first_failed = isinstance(first_chosen_interval, str)
    second_failed = isinstance(second_chosen_interval, str)
    if first_failed and second_failed:
        raise ValueError("stability problem, try taking more samples!")

    # step 10: the candidates are all elements of every successfully chosen
    # interval; explicit lists so the concatenation works on Python 2 and 3.
    if first_failed:
        first_chosen_interval_as_list = []
    else:
        first_chosen_interval_as_list = list(
            range(first_chosen_interval, first_chosen_interval + good_interval)
        )

    if second_failed:
        second_chosen_interval_as_list = []
    else:
        second_chosen_interval_as_list = list(
            range(second_chosen_interval, second_chosen_interval + good_interval)
        )

    return basicdp.exponential_mechanism_big(
        data,
        first_chosen_interval_as_list + second_chosen_interval_as_list,
        extended_quality_function,
        eps,
    )