Example #1
def get_opt_privacy_allocation(epsilon,
                               delta,
                               stats_parameters,
                               excess_noise_ratio,
                               sigma_tol=DEFAULT_SIGMA_TOLERANCE,
                               epsilon_tol=DEFAULT_EPSILON_TOLERANCE,
                               sigma_ratio_tol=DEFAULT_SIGMA_RATIO_TOLERANCE):
    '''
    search for sigma ratio (and resulting epsilon allocation) that just
    consumes epsilon budget
    '''
    # get allocation that is optimal for approximate sigmas to get sigma ratio bounds
    approx_epsilons, approx_sigmas = get_approximate_privacy_allocation(
        epsilon, delta, stats_parameters, sigma_tol=sigma_tol)
    # ratios of sigma to expected value
    min_sigma_ratio = None
    max_sigma_ratio = None
    for param, (sensitivity, val) in stats_parameters.iteritems():
        ratio = get_expected_noise_ratio(excess_noise_ratio,
                                         approx_sigmas[param], val)
        if (min_sigma_ratio is None) or (ratio < min_sigma_ratio):
            min_sigma_ratio = ratio
        if (max_sigma_ratio is None) or (ratio > max_sigma_ratio):
            max_sigma_ratio = ratio
    # get optimal sigma ratio
    opt_sigma_ratio = interval_boolean_binary_search(
        lambda x: sum(get_epsilon_consumed(stats_parameters,
                                           excess_noise_ratio, x, delta,
                                           tol=epsilon_tol).itervalues()) <= epsilon,
        min_sigma_ratio, max_sigma_ratio, sigma_ratio_tol, return_true=True)
    # compute epsilon allocation that achieves optimal sigma ratio
    opt_epsilons = get_epsilon_consumed(stats_parameters,
                                        excess_noise_ratio,
                                        opt_sigma_ratio,
                                        delta,
                                        tol=epsilon_tol)
    # turn opt sigma ratio into per-parameter sigmas
    opt_sigmas = dict()
    zero_sigmas = []
    low_sigmas = []
    for param, (sensitivity, val) in stats_parameters.iteritems():
        opt_sigma = get_sigma(excess_noise_ratio, opt_sigma_ratio, val)
        # Check if the sigma is too small
        if param != DEFAULT_DUMMY_COUNTER_NAME:
            if opt_sigma == 0.0:
                zero_sigmas.append(param)
            elif opt_sigma < sigma_tol:
                low_sigmas.append(param)
        opt_sigmas[param] = opt_sigma

    if len(zero_sigmas) > 0:
        logging.error(
            "sigmas for {} are zero, this provides no differential privacy for these statistics"
            .format(summarise_list(zero_sigmas)))

    if len(low_sigmas) > 0:
        logging.warning(
            "sigmas for {} are less than the sigma tolerance {}, their calculated values may be inaccurate and may vary each time they are calculated"
            .format(summarise_list(low_sigmas), sigma_tol))

    return (opt_epsilons, opt_sigmas, opt_sigma_ratio)
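
interval_boolean_binary_search and get_epsilon_consumed are assumed helpers. A minimal sketch of the interval search under the orientation used above (predicate False at the lower bound, True at the upper bound); hypothetical, the real helper may be more general:

def interval_boolean_binary_search(predicate, low, high, tolerance,
                                   return_true=True):
    # assumes predicate is monotone over [low, high]: False at low,
    # True at high; bisect until the interval is narrower than tolerance
    while high - low > tolerance:
        midpoint = (low + high) / 2.0
        if predicate(midpoint):
            high = midpoint
        else:
            low = midpoint
    # return the True side (high) or the False side (low) of the boundary
    return high if return_true else low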
Example #2
def exact_match_validate_item(exact_obj,
                              search_string,
                              original_list,
                              match_onion_md5=False):
    '''
    Search exact_obj for search_string.

    If match_onion_md5 is True, also try to match:
        hashlib.md5(search_string + '.onion').hexdigest()
    If search_string is a string, it is lowercased before hashing.

    If the search fails, log a warning using original_list, and raise an
    exception.
    '''
    try:
        assert exact_match(exact_obj,
                           search_string,
                           match_onion_md5=match_onion_md5)
    except:
        onion_match = " (and onion md5)" if match_onion_md5 else ""
        logging.warning(
            "Validating exact {}{} failed:\nOriginal:\n{}\nSet:\n{}".format(
                search_string, onion_match, summarise_list(original_list),
                summarise_list(exact_obj)))
        logging.debug(
            "Validating exact {}{} failed:\nOriginal (full):\n{}\nSet (full):\n{}"
            .format(search_string, onion_match, original_list, exact_obj))
        raise
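
The onion-md5 form described in the docstring can be computed as follows. This onion_md5 helper is hypothetical, shown only to make the hashing rule concrete:

import hashlib

def onion_md5(label):
    # lowercase string labels, append '.onion', then take the md5
    # hexdigest, mirroring the docstring above (hypothetical helper)
    if hasattr(label, 'lower'):
        label = label.lower()
    return hashlib.md5((label + '.onion').encode('utf-8')).hexdigest()

# e.g. onion_md5('FacebookCoreWWWi') == onion_md5('facebookcorewwwi')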
Example #3
def exact_match_prepare_collection(exact_collection,
                                   existing_exacts=None,
                                   validate=True):
    '''
    Prepare a hashable object collection for efficient exact matching.
    If the objects in the collection are strings, lowercases them.

    existing_exacts is a list of previously prepared collections.

    If existing_exacts is not None, append the new collection to
    existing_exacts, as well as returning the prepared collection.

    If multiple lists are prepared using the same existing_exacts, then
    the final lists are disjoint. Any duplicate items are ignored, and a
    warning is logged. (An item that appears in multiple input lists is
    output in the earliest list it appears in.)

    If validate is True, checks that exact_match() returns True for each
    item in exact_collection.

    Returns an object that can be passed to exact_match().
    This object must be treated as opaque and read-only.
    '''
    assert exact_collection is not None
    # Set matching uses a hash table, so it's more efficient
    exact_collection_lower = [
        lower_if_hasattr(obj) for obj in exact_collection
    ]
    exact_set = frozenset(exact_collection_lower)
    # Log a message if there were any duplicates
    # Finding each duplicate takes a lot longer
    if len(exact_collection) != len(exact_set):
        dups = [
            obj for obj in exact_set
            if exact_collection_lower.count(lower_if_hasattr(obj)) > 1
        ]
        logging.warning("Removing {} duplicates within this collection".format(
            summarise_list(dups)))
    # the encoded json measures transmission size, not RAM size
    logging.info("Exact match prepared {} items ({})".format(
        len(exact_set), format_bytes(len(json_serialise(list(exact_set))))))

    # Check that each item actually matches the list
    if validate:
        for item in exact_collection:
            exact_match_validate_item(exact_set, item, exact_collection)

    if existing_exacts is None:
        return exact_set
    else:
        # Remove any items that appear in earlier lists
        disjoint_exact_set = exact_set.difference(*existing_exacts)
        duplicate_exact_set = exact_set.difference(disjoint_exact_set)
        if len(duplicate_exact_set) > 0:
            logging.warning(
                "Removing {} duplicates that are also in an earlier collection"
                .format(summarise_list(duplicate_exact_set)))
        existing_exacts.append(disjoint_exact_set)
        return disjoint_exact_set
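
A usage sketch, assuming the function above is importable (the list values here are made up). Preparing two lists against the same existing_exacts keeps the duplicate in the earlier collection:

existing_exacts = []
first = exact_match_prepare_collection(['A.example', 'b.example'],
                                       existing_exacts=existing_exacts)
second = exact_match_prepare_collection(['B.example', 'c.example'],
                                        existing_exacts=existing_exacts)
assert 'b.example' in first
assert 'b.example' not in second  # removed as a cross-list duplicate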
Example #4
def exact_match_validate_item(exact_obj, search_string, original_list):
    '''
    Search exact_obj for search_string.

    If the search fails, log a warning using original_list, and raise an
    exception.
    '''
    try:
        assert exact_match(exact_obj, search_string)
    except:
        logging.warning(
            "Validating exact {} failed:\nOriginal:\n{}\nSet:\n{}".format(
                search_string, summarise_list(original_list),
                summarise_list(exact_obj)))
        logging.debug(
            "Validating exact {} failed:\nOriginal (full):\n{}\nSet (full):\n{}"
            .format(search_string, original_list, exact_obj))
        raise
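
exact_match itself is an assumed helper. A minimal sketch consistent with the prepared frozenset (hypothetical; the real helper may do more):

def exact_match(exact_obj, search_string):
    # lowercase string search values, then test membership in the
    # prepared frozenset (hypothetical minimal form of the helper)
    if hasattr(search_string, 'lower'):
        search_string = search_string.lower()
    return search_string in exact_obj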
Example #5
def summarise_match_maps(deepcopied_start_config, match_map_key):
    '''
    If deepcopied_start_config contains match_map_key, and it contains
    any match maps, summarise them.
    You must deep-copy start_config before calling this function.
    '''
    for k in deepcopied_start_config.get(match_map_key, {}):
        match_map = deepcopied_start_config[match_map_key][k]
        short_string = summarise_list(
            match_map.splitlines(),
            max_obj_str_len=PrivCountNode.MAX_MATCH_LEN,
            sort_output=False)
        deepcopied_start_config[match_map_key][k] = short_string
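
A usage sketch with a hypothetical 'match_maps' key, assuming summarise_list and PrivCountNode are available. Note the config must be deep-copied first, since the map values are overwritten in place:

import copy

start_config = {'match_maps': {'country': 'US 1\nDE 2\nFR 3'}}
summary_config = copy.deepcopy(start_config)
summarise_match_maps(summary_config, 'match_maps')
# summary_config['match_maps']['country'] is now a short summary string;
# start_config still holds the full match map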
Example #6
def summarise_match_lists(deepcopied_start_config, match_list_key):
    '''
    If deepcopied_start_config contains match_list_key, and it contains
    any match lists, summarise them.
    You must deep-copy start_config before calling this function.
    '''
    for i in xrange(len(deepcopied_start_config.get(match_list_key, []))):
        match_list = deepcopied_start_config[match_list_key][i]
        short_string = summarise_list(
            match_list,
            max_obj_str_len=PrivCountNode.MAX_MATCH_LEN,
            sort_output=False)
        deepcopied_start_config[match_list_key][i] = short_string
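
summarise_list is an assumed helper used throughout these examples. A minimal sketch of its apparent contract (hypothetical; the real helper takes further keyword arguments and may summarise differently):

def summarise_list(items, max_obj_str_len=100, sort_output=True):
    # truncate each item's string form to max_obj_str_len, optionally
    # sort, and join into a single line (hypothetical minimal form)
    item_strs = [str(obj)[:max_obj_str_len] for obj in items]
    if sort_output:
        item_strs.sort()
    return ', '.join(item_strs)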
Example #7
def suffix_match_validate_item(suffix_obj,
                               search_string,
                               original_list,
                               separator="",
                               expected_collection_tag=-1,
                               reject_overlapping_lists=True):
    '''
    Search suffix_obj for search_string using separator.
    If reject_overlapping_lists is True, make sure it yields
    expected_collection_tag. Otherwise, make sure it yields a collection tag
    that is not None.

    If the search fails, log a warning using original_list, and raise an
    exception.
    '''
    try:
        found_collection_tag = suffix_match(suffix_obj,
                                            search_string,
                                            separator=separator)
        if reject_overlapping_lists:
            assert found_collection_tag == expected_collection_tag
        else:
            assert found_collection_tag is not None
    except:
        logging.warning(
            "Validating suffix {} -> {} found {} ({}):\nOriginal:\n{}\nTree:\n{}"
            .format(
                search_string, expected_collection_tag, found_collection_tag,
                "rejected overlaps" if reject_overlapping_lists else
                "allowed overlaps", summarise_list(original_list),
                summarise_list(suffix_obj.keys())))
        logging.debug(
            "Validating suffix {} -> {} found {} ({}):\nOriginal (full):\n{}\nTree (full):\n{}"
            .format(
                search_string, expected_collection_tag, found_collection_tag,
                "rejected overlaps" if reject_overlapping_lists else
                "allowed overlaps", original_list, suffix_obj))
        raise
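
suffix_match and the tree layout it walks are assumed here. A minimal sketch, assuming components are stored last-component-first and stored suffixes terminate in a non-dict collection tag (hypothetical; the real helper uses suffix_match_split):

def suffix_match(suffix_obj, search_string, separator=""):
    # walk the nested dict from the last component towards the first;
    # reaching a non-dict node means a stored suffix matched, so return
    # its collection tag (hypothetical minimal form of the helper)
    components = (search_string.split(separator) if separator
                  else [search_string])
    node = suffix_obj
    for component in reversed(components):
        node = node.get(component)
        if node is None:
            return None
        if not isinstance(node, dict):
            return node
    return None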
Example #8
def summarise_match_suffixes(deepcopied_start_config, match_suffix_key):
    '''
    If deepcopied_start_config contains match_suffix_key, and it contains
    any match suffixes, summarise them.
    You must deep-copy start_config before calling this function.
    '''
    # this is not a very good summary, but it duplicates the lists, so
    # that's ok
    match_suffixes = deepcopied_start_config.get(match_suffix_key, {})
    if len(match_suffixes) > 0:
        short_string = summarise_list(
            match_suffixes.keys(),
            max_obj_str_len=PrivCountNode.MAX_MATCH_LEN,
            sort_output=False)
        deepcopied_start_config[match_suffix_key] = short_string
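
A usage sketch with a hypothetical 'match_suffixes' key. Unlike the per-entry map and list summaries above, the entire suffix tree is replaced by a single summary string:

import copy

start_config = {'match_suffixes': {'com': {'example': -1}, 'net': -1}}
summary_config = copy.deepcopy(start_config)
summarise_match_suffixes(summary_config, 'match_suffixes')
# summary_config['match_suffixes'] is now one summary string built from
# the top-level keys ('com', 'net'); start_config keeps the full tree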
Example #9
def get_noise_allocation(noise_parameters,
                         sanity_check=DEFAULT_DUMMY_COUNTER_NAME,
                         circuit_sample_rate=1.0):
    '''
    An adapter which wraps get_opt_privacy_allocation, extracting the
    parameters from the noise_parameters data structure, and updating
    noise_parameters with the calculated values.
    If sanity_check is not None, adds a sanity check counter to the result,
    with the counter name supplied in sanity_check, and values created using
    get_sanity_check_counter().
    Scales expected circuit counter values by circuit_sample_rate before
    allocating noise.
    Returns a data structure containing the results on success.
    Raises a ValueError on failure.
    The format of noise_parameters is:
    privacy:
        epsilon: float (in)
        delta: float (in)
        excess_noise_ratio: float (in)
        sigma_tolerance: float (in, optional, default 1e-6)
        epsilon_tolerance: float (in, optional, default 1e-15)
        sigma_ratio_tolerance: float (in, optional, default 1e-6)
        sigma_ratio: float (out)
    counters:
        'CounterName': (multiple counters)
            bins: (optional, unused)
                - [float, float, optional long] (multiple bins, unused)
            sensitivity: float (in)
            estimated_value: float (in)
            sigma: float (out)
            epsilon: float (out)
            expected_noise_ratio: float (out)
    The expected noise ratio should be identical for each counter, except for
    floating-point inaccuracies.
    '''
    assert circuit_sample_rate >= 0.0
    assert circuit_sample_rate <= 1.0
    # extract the top-level structures
    noise = noise_parameters['privacy']
    counters = noise_parameters['counters']
    excess_noise_ratio = noise['excess_noise_ratio']
    # rearrange the counter values, and produce the parameter-only structure
    stats_parameters = {}
    zero_sensitivities = []
    for stat in counters:
        sensitivity = counters[stat]['sensitivity']
        estimated_value = counters[stat]['estimated_value']
        if is_circuit_sample_counter(stat):
            estimated_value *= circuit_sample_rate
        if sensitivity == 0 and stat != DEFAULT_DUMMY_COUNTER_NAME:
            zero_sensitivities.append(stat)
        statistics = (sensitivity, estimated_value)
        stats_parameters[stat] = statistics

    if len(zero_sensitivities) > 0:
        # If you want a counter with no noise, try using 1e-6 instead
        logging.error(
            "sensitivity for {} is zero; calculated sigmas will be zero for these statistics"
            .format(summarise_list(zero_sensitivities)))

    # calculate the noise allocations
    # and update the structure with defaults, if not already present
    epsilons, sigmas, sigma_ratio = \
        get_opt_privacy_allocation(noise['epsilon'],
                                   noise['delta'],
                                   stats_parameters,
                                   excess_noise_ratio,
                                   sigma_tol=noise.setdefault(
                                                 'sigma_tolerance',
                                                 DEFAULT_SIGMA_TOLERANCE),
                                   epsilon_tol=noise.setdefault(
                                                 'epsilon_tolerance',
                                                 DEFAULT_EPSILON_TOLERANCE),
                                   sigma_ratio_tol=noise.setdefault(
                                                 'sigma_ratio_tolerance',
                                                 DEFAULT_SIGMA_RATIO_TOLERANCE)
                               )
    # update the structure with the results
    noise['sigma_ratio'] = sigma_ratio
    for stat in counters:
        counters[stat]['epsilon'] = epsilons[stat]
        counters[stat]['sigma'] = sigmas[stat]
        noise_ratio = get_expected_noise_ratio(
            excess_noise_ratio, counters[stat]['sigma'],
            counters[stat]['estimated_value'])
        counters[stat]['expected_noise_ratio'] = noise_ratio
    if sanity_check is not None:
        counters[sanity_check] = get_sanity_check_counter()
    return noise_parameters
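
A usage sketch with made-up counter names, budgets, and estimates (hypothetical values; real PrivCount configurations carry many more counters):

noise_parameters = {
    'privacy': {
        'epsilon': 0.3,
        'delta': 1e-3,
        'excess_noise_ratio': 2.0,
    },
    'counters': {
        'ExitCircuitCount': {
            'sensitivity': 2.0,
            'estimated_value': 100000.0,
        },
        'ExitCellCount': {
            'sensitivity': 1000.0,
            'estimated_value': 50000000.0,
        },
    },
}
result = get_noise_allocation(noise_parameters)
print(result['privacy']['sigma_ratio'])
print(result['counters']['ExitCircuitCount']['sigma'])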
Example #10
def suffix_match_prepare_collection(suffix_collection,
                                    separator="",
                                    existing_suffixes=None,
                                    collection_tag=-1,
                                    validate=True):
    '''
    Prepare a collection of strings for efficient suffix matching.
    If specified, the separator is also required before the suffix.
    For example, domain suffixes use "." as a separator between components.

    existing_suffixes is a previously prepared suffix data structure.

    If existing_suffixes is not None, add the new suffixes to
    existing_suffixes, and return existing_suffixes. Each suffix is terminated
    with collection_tag, which can be used to distinguish between suffixes
    from different lists. collection_tag must be a non-dict type that is not
    None.

    If multiple lists are prepared using the same existing_suffixes, then the
    final suffix data structure is disjoint. Any duplicate or longer
    suffixes are eliminated, and a warning is logged. (A suffix that appears
    in multiple input lists is output in the earliest list it appears in.
    A shorter suffix in a later list replaces any longer suffixes in earlier
    lists.)

    If validate is True, checks that suffix_match() returns True for each
    item in suffix_collection.

    Returns a tuple containing an object that can be passed to suffix_match(),
    and a boolean that is True if any duplicate or longer suffixes were found.
    The object must be treated as opaque and read-only.
    '''
    assert suffix_collection is not None
    assert is_collection_tag_valid(collection_tag)
    #assert type(existing_suffixes) == dict
    # Create a tree of suffix components using nested python dicts
    # the terminal object is an empty dict
    if existing_suffixes is None:
        suffix_obj = {}
    else:
        suffix_obj = existing_suffixes
    longer_suffix_list = []
    duplicates = False
    for insert_string in suffix_collection:
        #assert type(suffix_obj) == dict
        insert_list = suffix_match_split(insert_string, separator=separator)
        prev_suffix_node = None
        suffix_node = suffix_obj
        # did we terminate the loop early due to a longer suffix?
        has_longer_suffix = False
        for insert_component in insert_list:
            # since we have stripped periods from the start and end, a double
            # dot is almost certainly a typo
            assert len(insert_component) > 0

            # we are allowed to add any child to the root
            # but we cannot add a child to an existing terminal object
            # because the existing tree contains a shorter suffix of
            # the string we are inserting
            #assert type(suffix_node) == dict
            next_suffix_node = suffix_node.get(insert_component)
            if (is_collection_tag_valid(next_suffix_node)):
                # there is an existing suffix that terminates here, and we are
                # about to insert a longer suffix. Instead, ignore the longer
                # suffix
                has_longer_suffix = True
                longer_suffix_list.append(insert_string)
                break

            # walk the tree, adding an entry for this suffix
            prev_suffix_node = suffix_node
            suffix_node = (next_suffix_node if next_suffix_node is not None
                           else suffix_node.setdefault(insert_component, {}))

        # we cannot have children in our terminal object
        # because the existing tree contains longer strings, and we are
        # a shorter suffix of all those existing strings
        if (not has_longer_suffix and not is_collection_tag_valid(suffix_node)
                and len(suffix_node) > 0):
            duplicates = True
            child_summary = summarise_list(suffix_node.keys())
            child_all = " ".join(suffix_node.keys())
            logging.warning(
                "Adding shorter suffix {} for collection {}, pruning existing children {}"
                .format(insert_string, collection_tag, child_summary))
            logging.debug(
                "Adding shorter suffix {} for collection {}, pruning existing children {}"
                .format(insert_string, collection_tag, child_all))

        # now, place (or replace) the end of the domain with the collection tag
        if not has_longer_suffix:
            #assert prev_suffix_node is not None
            prev_suffix_node[insert_component] = collection_tag

        # Now check that each item actually matches one of the lists
        # Allow the lists to have overlaps; we'll check that later
        if validate:
            suffix_match_validate_item(suffix_obj,
                                       insert_string,
                                       suffix_collection,
                                       separator=separator,
                                       expected_collection_tag=collection_tag,
                                       reject_overlapping_lists=False)

    if len(longer_suffix_list) > 0:
        duplicates = True
        suffix_summary = summarise_list(longer_suffix_list)
        suffix_all = " ".join(longer_suffix_list)
        logging.warning(
            "Suffix match for {} ignored longer suffixes {}".format(
                collection_tag, suffix_summary))
        logging.debug("Suffix match for {} ignored longer suffixes {}".format(
            collection_tag, suffix_all))

    # the encoded json measures transmission size, not RAM size
    logging.info("Suffix match {} prepared {} items ({})".format(
        collection_tag, len(suffix_collection),
        format_bytes(len(json_serialise(suffix_obj)))))

    return (suffix_obj, duplicates)
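
A usage sketch, assuming the helpers above are importable. The longer 'www.example.com' is dropped in favour of the shorter suffix, so the duplicates flag is True:

suffix_obj, duplicates = suffix_match_prepare_collection(
    ['example.com', 'www.example.com'],
    separator='.',
    collection_tag=0)
assert duplicates
# any subdomain of example.com now matches and yields the collection tag
assert suffix_match(suffix_obj, 'mail.example.com', separator='.') == 0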
Example #11
def exact_match_prepare_collection(exact_collection,
                                   existing_exacts=None,
                                   validate=True,
                                   match_onion_md5=False):
    '''
    Prepare a hashable object collection for efficient exact matching.
    If the objects in the collection are strings, lowercases them.

    existing_exacts is a list of previously prepared collections.

    If existing_exacts is not None, append the new collection to
    existing_exacts, as well as returning the prepared collection.

    If multiple lists are prepared using the same existing_exacts, then
    the final lists are disjoint. Any duplicate items are ignored, and a
    warning is logged. (An item that appears in multiple input lists is
    output in the earliest list it appears in.)

    If validate is True, checks that exact_match() returns True for each
    item in exact_collection.

    If match_onion_md5 is True, also try to validate matches for:
        hashlib.md5(item + '.onion').hexdigest()
    If item is a string, it is lowercased before hashing.

    Returns an object that can be passed to exact_match().
    This object must be treated as opaque and read-only.
    '''
    assert exact_collection is not None
    onion_match = " (and onion md5)" if match_onion_md5 else ""
    # Set matching uses a hash table, so it's more efficient
    exact_collection_lower = [
        lower_if_hasattr(obj) for obj in exact_collection
    ]
    exact_set = frozenset(exact_collection_lower)
    # Log a message if there were any duplicates
    # Finding each duplicate takes a lot longer
    if len(exact_collection) != len(exact_set):
        dups = [
            obj for obj in exact_set
            if exact_collection_lower.count(lower_if_hasattr(obj)) > 1
        ]
        logging.warning(
            "Removing {} duplicates{} within this collection".format(
                summarise_list(dups), onion_match))
    # the encoded json measures transmission size, not RAM size
    logging.info("Exact match prepared {}{} items ({})".format(
        len(exact_set), onion_match,
        format_bytes(len(json_serialise(list(exact_set))))))

    # Check that each item actually matches the list
    if validate:
        for item in exact_collection:
            exact_match_validate_item(exact_set,
                                      item,
                                      exact_collection,
                                      match_onion_md5=match_onion_md5)

    if existing_exacts is None:
        return exact_set
    else:
        # Remove any items that appear in earlier lists
        #
        # If an onion address is in a different format in an earlier list,
        # this code won't remove it.
        #
        # Depending on the exact configuration:
        # - the tally server will fail to validate the lists, or
        # - the data collectors will count events for the duplicate onion
        #   address against the first list that contains the address
        #   (in any format).
        disjoint_exact_set = exact_set.difference(*existing_exacts)
        duplicate_exact_set = exact_set.difference(disjoint_exact_set)
        if len(duplicate_exact_set) > 0:
            logging.warning(
                "Removing {} duplicates{} that are also in an earlier collection"
                .format(summarise_list(duplicate_exact_set), onion_match))
        existing_exacts.append(disjoint_exact_set)
        return disjoint_exact_set
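
A usage sketch of the onion-md5 flow, assuming exact_match accepts the same match_onion_md5 keyword that the validator in Example #2 passes through. The prepared list holds pre-hashed entries, so a plain onion label matches via its md5 form:

import hashlib

address = 'facebookcorewwwi'
hashed = hashlib.md5((address + '.onion').encode('utf-8')).hexdigest()
exact_set = exact_match_prepare_collection([hashed],
                                           match_onion_md5=True)
assert exact_match(exact_set, address, match_onion_md5=True)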