Example #1
0
def find(data, domain, goal_number, failure, eps, sparse=True):
    """
    Privately estimate the radius of a small ball holding roughly goal_number points.

    Based on "Locating a Small Cluster Privately" by Kobbi Nissim, Uri Stemmer, and Salil Vadhan. PODS 2016.
    :param data: points to search; data.shape[1] is read as the dimension
    :param domain: tuple(absolute value of domain's end as int, minimum intervals in domain as float)
    :param goal_number: the number of desired points in the resulting cluster
    :param failure: 0 < float < 1. chances that the procedure will fail to return an answer
    :param eps: float > 0. privacy parameter
    :param sparse: truthy to build the candidate domain with __sparse_domain__,
        falsy to build it with __create_regular_domain__
    :return: 0 when the noisy zero-radius test passes, otherwise whatever
        exponential_mechanism_big picks from the candidate domain
    """
    pairwise = distances(data)
    # Original author's note: this value needs to be greater than
    # log(domain[0] / failure) / eps.
    slack = 2 * log(domain[0] / failure) / eps
    zero_radius_bar = goal_number - slack - log(1 / failure) / eps
    # TODO verify that the noise addition is correct
    noisy_zero_score = (__max_average_ball__(0, pairwise, goal_number)
                        + laplace(0, 1 / eps, 1))
    if noisy_zero_score > zero_radius_bar:
        # Radius 0 already scores above the bar, so no search is needed.
        return 0

    dimension = data.shape[1]
    # TODO maybe a little less sparse?
    build_domain = __sparse_domain__ if sparse else __create_regular_domain__
    candidate_domain = build_domain(domain, dimension)

    def quality(d, r):
        # d is not used; the score depends only on the candidate radius r.
        shortfall = goal_number - __max_average_ball__(r / 2, pairwise, goal_number)
        surplus = __max_average_ball__(r, pairwise, goal_number) - goal_number + 2 * slack
        return min(shortfall, surplus) / 2

    # Only half of eps is handed to the selection step.
    return exponential_mechanism_big(data, candidate_domain, quality, eps / 2)
Example #2
0
def find_cluster(data_set, k):
    """
    Pick the point whose k-th smallest pairwise distance is the smallest.

    :param data_set: iterable of points; row i of distances(data_set) is
        assumed to hold the distances from the i-th point (TODO confirm
        against distances)
    :param k: number of desired points in cluster; used as a zero-based index
        into each point's sorted distance row
    :return: tuple (r, c) where r is the smallest k-th distance found and c is
        the point it belongs to; ties keep the earliest point
    :raises ValueError: if data_set contains no points
    """
    distance = distances(data_set)
    best_radius, best_center = None, None
    # enumerate gives each point's own row index. Looking the index up with
    # where(data_set == point) compares element-wise, so it could return an
    # earlier row that merely shares one coordinate with the point.
    for point_index, point in enumerate(data_set):
        near_k = np.sort(distance[point_index])[k]
        if best_radius is None or near_k < best_radius:
            best_radius, best_center = near_k, point
    if best_radius is None:
        raise ValueError('data_set must contain at least one point')
    return best_radius, best_center
Example #3
0
def find_cluster(data_set, k):
    """
    Pick the point whose k-th smallest pairwise distance is the smallest.

    :param data_set: iterable of points; row i of distances(data_set) is
        assumed to hold the distances from the i-th point (TODO confirm
        against distances)
    :param k: number of desired points in cluster; used as a zero-based index
        into each point's sorted distance row
    :return: tuple (r, c) where r is the smallest k-th distance found and c is
        the point it belongs to; ties keep the earliest point
    :raises ValueError: if data_set contains no points
    """
    distance = distances(data_set)
    best_radius, best_center = None, None
    # enumerate gives each point's own row index. Looking the index up with
    # where(data_set == point) compares element-wise, so it could return an
    # earlier row that merely shares one coordinate with the point.
    for point_index, point in enumerate(data_set):
        near_k = np.sort(distance[point_index])[k]
        if best_radius is None or near_k < best_radius:
            best_radius, best_center = near_k, point
    if best_radius is None:
        raise ValueError('data_set must contain at least one point')
    return best_radius, best_center
Example #4
0
def find(data, goal_number, failure, eps, delta, promise=-1):
    """
    Search for a radius whose best ball holds roughly goal_number points.

    :param data: array accepted by np.max / np.min with axis=0 and by distances
    :param goal_number: target point count, forwarded to __max_average_ball__
    :param failure: used in the zero-radius threshold and forwarded to
        __promise__ (presumably a failure probability; confirm with callers)
    :param eps: scales the Laplace noise (4/eps) and is forwarded to
        __promise__ and evaluate
    :param delta: forwarded to __promise__ and evaluate
    :param promise: precomputed promise value; the default -1 means it is
        computed here with __promise__
    :return: 0 when the noisy zero-radius test passes, otherwise the result
        of evaluate
    """
    highest = max(np.max(data, axis=0))
    lowest = min(np.min(data, axis=0))
    domain = abs(highest - lowest)
    if promise == -1:
        promise = __promise__(data, domain, eps, delta, failure)
    pairwise = distances(data)

    noisy_zero_score = (__max_average_ball__(0, pairwise, goal_number)
                        + laplace(0, 4/eps, 1))
    zero_radius_bar = goal_number - 2*promise - 4/eps*log(2/failure)
    if noisy_zero_score > zero_radius_bar:
        return 0

    # Round the domain up to the next power of two.
    extended_domain = 2**int(ceil(log2(domain)))
    # Entry i holds the score for radius i * 0.5.
    max_averages_by_radius = []
    for half_step_radius in arange(0, extended_domain, 0.5):
        max_averages_by_radius.append(
            __max_average_ball__(half_step_radius, pairwise, goal_number))

    def quality(d, r):
        # d is not used. Index r is radius r / 2 and index 2 * r is radius r.
        try:
            shortfall = goal_number - max_averages_by_radius[r]
            surplus = max_averages_by_radius[2 * r] - goal_number + 4 * promise
            return min(shortfall, surplus) / 2
        except IndexError:
            raise IndexError('error while trying to qualify %f' % r)

    # TODO must complete those two
    def radius_interval_bounding(data_set, domain_end, j):
        # Best, over every window of 2**j consecutive radii, of that window's
        # worst quality.
        width = 2**j
        return max(
            min(quality(data_set, radius)
                for radius in xrange(start, start + width))
            for start in xrange(domain_end - width))

    def max_radius_in_interval(data_set, i):
        # Despite the name, this returns the highest quality over the radii
        # in i, not a radius.
        return max(quality(data_set, radius) for radius in i)

    return evaluate(data, domain, quality, promise, 0.5, eps, delta,
                    radius_interval_bounding, max_radius_in_interval)