Esempio n. 1
0
def find(data, domain, goal_number, failure, eps, sparse=True):
    """
    Privately estimate the radius of a small cluster holding roughly goal_number points.

    Based on "Locating a Small Cluster Privately" by Kobbi Nissim, Uri Stemmer,
    and Salil Vadhan. PODS 2016.

    :param data: the points; must expose ``shape``, with ``shape[1]`` used as the dimension
    :param domain: tuple(absolute value of domain's end as int, minimum intervals in domain as float)
    :param goal_number: the number of desired points in the resulting cluster
    :param failure: 0 < float < 1. chances that the procedure will fail to return an answer
    :param eps: float > 0. privacy parameter
    :param sparse: switch for the candidate domain. When truthy the domain is built by
    __sparse_domain__, otherwise by __create_regular_domain__
    :return: 0 when the noisy count for radius 0 already exceeds the threshold,
    otherwise whatever exponential_mechanism_big selects from the candidate domain
    """
    point_distances = distances(data)

    # Original author's note: this margin needs to be greater than
    # log(domain[0] / failure) / eps.
    margin = 2 * log(domain[0] / failure) / eps
    failure_term = log(1 / failure) / eps
    threshold = goal_number - margin - failure_term

    # Radius-0 shortcut: compare a Laplace-noised count against the threshold.
    # TODO verify that the noise addition is correct
    count_at_zero = __max_average_ball__(0, point_distances, goal_number)
    noise = laplace(0, 1 / eps, 1)
    if count_at_zero + noise > threshold:
        return 0

    dimension = data.shape[1]
    # TODO maybe a little less sparse?
    build_domain = __sparse_domain__ if sparse else __create_regular_domain__
    new_domain = build_domain(domain, dimension)

    def quality(d, r):
        # The first argument is accepted for the mechanism's calling convention
        # but is not used; the score depends only on the candidate radius r.
        inner_gap = goal_number - __max_average_ball__(r / 2, point_distances, goal_number)
        outer_gap = __max_average_ball__(r, point_distances, goal_number) - goal_number + 2 * margin
        return min(inner_gap, outer_gap) / 2

    return exponential_mechanism_big(data, new_domain, quality, eps / 2)
Esempio n. 2
0
def evaluate(data,
             range_max_value,
             quality_function,
             quality_promise,
             approximation,
             eps,
             delta,
             intervals_bounding,
             max_in_interval,
             use_exponential=True):
    """
    RecConcave algorithm for the specific case of N=2
    :param data: the main data-set
    :param range_max_value: maximum possible output (the minimum output is 0)
    :param quality_function: function called as quality_function(data, j) that
    returns the quality (as float) of the domain element j
    :param quality_promise: float, a quality value that at least one domain
    element is assured to reach
    :param approximation: 0 < float < 1. the approximation level of the result
    :param eps: float > 0. privacy parameter
    :param delta: 1 > float > 0. privacy parameter (only passed to A_dist)
    :param intervals_bounding: function L, called here as
    intervals_bounding(data, extended_range_max, j)
    :param max_in_interval: function u(data, interval) that returns the maximum
    of quality_function(data, j) for j in the interval; it is called with the
    extra keyword argument interval_length
    :param use_exponential: the original version uses the A_dist mechanism. for
    utility reasons the exponential mechanism is the default. set to False to
    use A_dist instead
    :return: an element of the domain with approximately maximum value of the
    quality function
    :raises ValueError: when both interval selections return a string, which
    this function treats as a failed selection
    """
    # xrange exists only on Python 2; on Python 3 range is already lazy.
    try:
        lazy_range = xrange
    except NameError:
        lazy_range = range

    # step 2: extend the range to the next power of two
    log_of_range = int(math.ceil(math.log(range_max_value, 2)))
    range_max_value_tag = 2**log_of_range

    def extended_quality_function(data_base, j):
        # Elements that exist only in the extended range never score above 0.
        if range_max_value < j <= range_max_value_tag:
            return min(0, quality_function(data_base, range_max_value))
        return quality_function(data_base, j)

    # step 4
    def recursive_quality_function(data_base, j):
        return min(
            intervals_bounding(data_base, range_max_value_tag, j) -
            (1 - approximation) * quality_promise, quality_promise -
            intervals_bounding(data_base, range_max_value_tag, j + 1))

    # step 6: privately choose the exponent of the interval length
    recursion_returned = basicdp.exponential_mechanism_big(
        data, range(log_of_range + 1), recursive_quality_function, eps)

    good_interval = 8 * (2**recursion_returned)

    # step 7: two interval partitions; the second is requested with an extra
    # True flag.
    first_intervals = __build_intervals_set__(data, good_interval, 0,
                                              range_max_value_tag)
    second_intervals = __build_intervals_set__(data, good_interval, 0,
                                               range_max_value_tag, True)
    max_quality = partial(max_in_interval, interval_length=good_interval)

    # step 9 ( using 'dist' algorithm )
    # TODO should I add switch for sparse?
    # TODO make sure it is still generic
    if use_exponential:
        # NOTE(review): these domains stop at range_max_value while the
        # interval sets above run up to range_max_value_tag - confirm that
        # the asymmetry is intended.
        first_full_domain = lazy_range(0, range_max_value, good_interval)
        # Floor division keeps the start an integer even under true division.
        second_full_domain = lazy_range(good_interval // 2, range_max_value,
                                        good_interval)
        first_chosen_interval = basicdp.sparse_domain(
            basicdp.exponential_mechanism_big, data, first_full_domain,
            first_intervals, max_quality, eps)
        second_chosen_interval = basicdp.sparse_domain(
            basicdp.exponential_mechanism_big, data, second_full_domain,
            second_intervals, max_quality, eps)
    else:
        first_chosen_interval = basicdp.a_dist(data, first_intervals,
                                               max_quality, eps, delta)
        second_chosen_interval = basicdp.a_dist(data, second_intervals,
                                                max_quality, eps, delta)

    # A string result is treated as "no interval selected".
    first_failed = isinstance(first_chosen_interval, str)
    second_failed = isinstance(second_chosen_interval, str)
    if first_failed and second_failed:
        raise ValueError("stability problem, try taking more samples!")

    # step 10: gather every element of the successfully chosen intervals.
    # Built as an explicit list because range objects cannot be concatenated
    # with + on Python 3.
    candidates = []
    if not first_failed:
        candidates.extend(
            range(first_chosen_interval,
                  first_chosen_interval + good_interval))
    if not second_failed:
        candidates.extend(
            range(second_chosen_interval,
                  second_chosen_interval + good_interval))

    return basicdp.exponential_mechanism_big(data, candidates,
                                             extended_quality_function, eps)
Esempio n. 3
0
def evaluate(
    data,
    range_max_value,
    quality_function,
    quality_promise,
    approximation,
    eps,
    delta,
    intervals_bounding,
    max_in_interval,
    use_exponential=True,
):
    """
    RecConcave algorithm for the specific case of N=2
    :param data: the main data-set
    :param range_max_value: maximum possible output (the minimum output is 0)
    :param quality_function: function called as quality_function(data, j) that returns the quality (as float)
    of the domain element j
    :param quality_promise: float, a quality value that at least one domain element is assured to reach
    :param approximation: 0 < float < 1. the approximation level of the result
    :param eps: float > 0. privacy parameter
    :param delta: 1 > float > 0. privacy parameter (only passed to A_dist)
    :param intervals_bounding: function L, called here as intervals_bounding(data, extended_range_max, j)
    :param max_in_interval: function u(data, interval) that returns the maximum of quality_function(data, j)
    for j in the interval; it is called with the extra keyword argument interval_length
    :param use_exponential: the original version uses the A_dist mechanism. for utility reasons the
    exponential mechanism is the default. set to False to use A_dist instead
    :return: an element of the domain with approximately maximum value of the quality function
    :raises ValueError: when both interval selections return a string, which this function treats as a
    failed selection
    """
    # xrange exists only on Python 2; on Python 3 range is already lazy.
    try:
        lazy_range = xrange
    except NameError:
        lazy_range = range

    # step 2: extend the range to the next power of two
    log_of_range = int(math.ceil(math.log(range_max_value, 2)))
    range_max_value_tag = 2 ** log_of_range

    def extended_quality_function(data_base, j):
        # Elements that exist only in the extended range never score above 0.
        if range_max_value < j <= range_max_value_tag:
            return min(0, quality_function(data_base, range_max_value))
        return quality_function(data_base, j)

    # step 4
    def recursive_quality_function(data_base, j):
        return min(
            intervals_bounding(data_base, range_max_value_tag, j) - (1 - approximation) * quality_promise,
            quality_promise - intervals_bounding(data_base, range_max_value_tag, j + 1),
        )

    # step 6: privately choose the exponent of the interval length
    recursion_returned = basicdp.exponential_mechanism_big(
        data, range(log_of_range + 1), recursive_quality_function, eps
    )

    good_interval = 8 * (2 ** recursion_returned)

    # step 7: two interval partitions; the second is requested with an extra True flag.
    first_intervals = __build_intervals_set__(data, good_interval, 0, range_max_value_tag)
    second_intervals = __build_intervals_set__(data, good_interval, 0, range_max_value_tag, True)
    max_quality = partial(max_in_interval, interval_length=good_interval)

    # step 9 ( using 'dist' algorithm )
    # TODO should I add switch for sparse?
    # TODO make sure it is still generic
    if use_exponential:
        # NOTE(review): these domains stop at range_max_value while the interval sets above run up to
        # range_max_value_tag - confirm that the asymmetry is intended.
        first_full_domain = lazy_range(0, range_max_value, good_interval)
        # Floor division keeps the start an integer even under true division.
        second_full_domain = lazy_range(good_interval // 2, range_max_value, good_interval)
        first_chosen_interval = basicdp.sparse_domain(
            basicdp.exponential_mechanism_big, data, first_full_domain, first_intervals, max_quality, eps
        )
        second_chosen_interval = basicdp.sparse_domain(
            basicdp.exponential_mechanism_big, data, second_full_domain, second_intervals, max_quality, eps
        )
    else:
        first_chosen_interval = basicdp.a_dist(data, first_intervals, max_quality, eps, delta)
        second_chosen_interval = basicdp.a_dist(data, second_intervals, max_quality, eps, delta)

    # A string result is treated as "no interval selected".
    chosen_starts = (first_chosen_interval, second_chosen_interval)
    if all(isinstance(start, str) for start in chosen_starts):
        raise ValueError("stability problem, try taking more samples!")

    # step 10: gather every element of the successfully chosen intervals, first interval before second.
    # Built as an explicit list because range objects cannot be concatenated with + on Python 3.
    candidates = []
    for start in chosen_starts:
        if not isinstance(start, str):
            candidates.extend(range(start, start + good_interval))

    return basicdp.exponential_mechanism_big(data, candidates, extended_quality_function, eps)