def get_opt_privacy_allocation(epsilon,
                               delta,
                               stats_parameters,
                               excess_noise_ratio,
                               sigma_tol=DEFAULT_SIGMA_TOLERANCE,
                               epsilon_tol=DEFAULT_EPSILON_TOLERANCE,
                               sigma_ratio_tol=DEFAULT_SIGMA_RATIO_TOLERANCE):
    '''
    Search for the sigma ratio (and the resulting epsilon allocation) that
    just consumes the epsilon budget.
    '''
    # get an allocation that is optimal for the approximate sigmas, to
    # obtain bounds on the sigma ratio
    approx_epsilons, approx_sigmas = get_approximate_privacy_allocation(
        epsilon, delta, stats_parameters, sigma_tol=sigma_tol)
    # ratios of sigma to expected value
    min_sigma_ratio = None
    max_sigma_ratio = None
    for param, (sensitivity, val) in stats_parameters.iteritems():
        ratio = get_expected_noise_ratio(excess_noise_ratio,
                                         approx_sigmas[param],
                                         val)
        if (min_sigma_ratio is None) or (ratio < min_sigma_ratio):
            min_sigma_ratio = ratio
        if (max_sigma_ratio is None) or (ratio > max_sigma_ratio):
            max_sigma_ratio = ratio
    # binary-search for the sigma ratio that just consumes the epsilon budget
    opt_sigma_ratio = interval_boolean_binary_search(
        lambda x: sum((get_epsilon_consumed(stats_parameters,
                                            excess_noise_ratio,
                                            x,
                                            delta,
                                            tol=epsilon_tol)).itervalues()) <= epsilon,
        min_sigma_ratio, max_sigma_ratio, sigma_ratio_tol,
        return_true=True)
    # compute the epsilon allocation that achieves the optimal sigma ratio
    opt_epsilons = get_epsilon_consumed(stats_parameters,
                                        excess_noise_ratio,
                                        opt_sigma_ratio,
                                        delta,
                                        tol=epsilon_tol)
    # turn the optimal sigma ratio into per-parameter sigmas
    opt_sigmas = dict()
    zero_sigmas = []
    low_sigmas = []
    for param, (sensitivity, val) in stats_parameters.iteritems():
        opt_sigma = get_sigma(excess_noise_ratio, opt_sigma_ratio, val)
        # check whether the sigma is too small
        # (compare against the supplied tolerance, which is what the
        # warning below reports)
        if param != DEFAULT_DUMMY_COUNTER_NAME:
            if opt_sigma == 0.0:
                zero_sigmas.append(param)
            elif opt_sigma < sigma_tol:
                low_sigmas.append(param)
        opt_sigmas[param] = opt_sigma
    if len(zero_sigmas) > 0:
        logging.error("sigmas for {} are zero, this provides no differential privacy for these statistics"
                      .format(summarise_list(zero_sigmas)))
    if len(low_sigmas) > 0:
        logging.warning("sigmas for {} are less than the sigma tolerance {}, their calculated values may be inaccurate and may vary each time they are calculated"
                        .format(summarise_list(low_sigmas), sigma_tol))
    return (opt_epsilons, opt_sigmas, opt_sigma_ratio)
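
# A hypothetical usage sketch (the counter names, sensitivities, and
# estimated values below are made up, not from a real deployment): each
# entry in stats_parameters maps a counter name to a
# (sensitivity, estimated_value) tuple.
def _example_opt_privacy_allocation():
    stats_parameters = {
        'ExitStreamCount': (2000.0, 1.5e6),
        'EntryConnectionCount': (400.0, 2.0e5),
        }
    epsilons, sigmas, sigma_ratio = get_opt_privacy_allocation(
        0.3, 1e-3, stats_parameters, 2.0)
    # the per-counter epsilons sum to (approximately) the 0.3 budget, and
    # every counter's sigma yields the same expected noise ratio
    return (epsilons, sigmas, sigma_ratio)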
def exact_match_validate_item(exact_obj, search_string, original_list,
                              match_onion_md5=False):
    '''
    Search exact_obj for search_string.
    If match_onion_md5 is True, also try to match:
        hashlib.md5(search_string + '.onion').hexdigest()
    If search_string is a string, it is lowercased before hashing.
    If the search fails, log a warning using original_list, and raise an
    exception.
    '''
    try:
        assert exact_match(exact_obj, search_string,
                           match_onion_md5=match_onion_md5)
    except:
        onion_match = " (and onion md5)" if match_onion_md5 else ""
        logging.warning("Validating exact {}{} failed:\nOriginal:\n{}\nSet:\n{}"
                        .format(search_string, onion_match,
                                summarise_list(original_list),
                                summarise_list(exact_obj)))
        logging.debug("Validating exact {}{} failed:\nOriginal (full):\n{}\nSet (full):\n{}"
                      .format(search_string, onion_match,
                              original_list, exact_obj))
        raise
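
# A hypothetical sketch of the onion md5 case (the onion address is made
# up): per the docstring, exact_match() hashes the lowercased address plus
# '.onion', so a set containing that digest should match the mixed-case
# address.
def _example_validate_onion_md5():
    import hashlib
    digest = hashlib.md5('aaaaaaaaaaaaaaaa.onion'.encode('ascii')).hexdigest()
    # raises (and logs a warning) if the match fails
    exact_match_validate_item(frozenset([digest]),
                              'AAAAAAAAAAAAAAAA',
                              ['AAAAAAAAAAAAAAAA'],
                              match_onion_md5=True)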
def summarise_match_maps(deepcopied_start_config, match_map_key):
    '''
    If deepcopied_start_config contains match_map_key, and it contains any
    match maps, summarise them.
    You must deep-copy start_config before calling this function.
    '''
    for k in deepcopied_start_config.get(match_map_key, {}):
        match_map = deepcopied_start_config[match_map_key][k]
        short_string = summarise_list(match_map.splitlines(),
                                      max_obj_str_len=PrivCountNode.MAX_MATCH_LEN,
                                      sort_output=False)
        deepcopied_start_config[match_map_key][k] = short_string
def summarise_match_lists(deepcopied_start_config, match_list_key):
    '''
    If deepcopied_start_config contains match_list_key, and it contains any
    match lists, summarise them.
    You must deep-copy start_config before calling this function.
    '''
    for i in xrange(len(deepcopied_start_config.get(match_list_key, []))):
        match_list = deepcopied_start_config[match_list_key][i]
        short_string = summarise_list(match_list,
                                      max_obj_str_len=PrivCountNode.MAX_MATCH_LEN,
                                      sort_output=False)
        deepcopied_start_config[match_list_key][i] = short_string
def suffix_match_validate_item(suffix_obj, search_string, original_list,
                               separator="",
                               expected_collection_tag=-1,
                               reject_overlapping_lists=True):
    '''
    Search suffix_obj for search_string using separator.
    If reject_overlapping_lists is True, make sure the search yields
    expected_collection_tag. Otherwise, make sure it yields a collection tag
    that is not None.
    If the search fails, log a warning using original_list, and raise an
    exception.
    '''
    # make sure the tag is defined in the except block, even if
    # suffix_match() itself raises
    found_collection_tag = None
    try:
        found_collection_tag = suffix_match(suffix_obj, search_string,
                                            separator=separator)
        if reject_overlapping_lists:
            assert found_collection_tag == expected_collection_tag
        else:
            assert found_collection_tag is not None
    except:
        overlap_mode = ("rejected overlaps" if reject_overlapping_lists
                        else "allowed overlaps")
        logging.warning("Validating suffix {} -> {} found {} ({}):\nOriginal:\n{}\nTree:\n{}"
                        .format(search_string, expected_collection_tag,
                                found_collection_tag, overlap_mode,
                                summarise_list(original_list),
                                summarise_list(suffix_obj.keys())))
        logging.debug("Validating suffix {} -> {} found {} ({}):\nOriginal (full):\n{}\nTree (full):\n{}"
                      .format(search_string, expected_collection_tag,
                              found_collection_tag, overlap_mode,
                              original_list, suffix_obj))
        raise
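
# A hypothetical sketch (example.com stands in for a real list entry):
# build a one-item suffix tree with suffix_match_prepare_collection()
# (defined below), then check that a subdomain yields the expected tag.
def _example_validate_suffix_match():
    suffix_obj, _ = suffix_match_prepare_collection(['example.com'],
                                                    separator=".",
                                                    collection_tag=0)
    # raises (and logs a warning) if 'www.example.com' does not yield tag 0
    suffix_match_validate_item(suffix_obj, 'www.example.com',
                               ['example.com'],
                               separator=".",
                               expected_collection_tag=0)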
def summarise_match_suffixes(deepcopied_start_config, match_suffix_key):
    '''
    If deepcopied_start_config contains match_suffix_key, and it contains
    any match suffixes, summarise them.
    You must deep-copy start_config before calling this function.
    '''
    # this is not a very good summary, but it duplicates the lists, so
    # that's ok
    match_suffixes = deepcopied_start_config.get(match_suffix_key, {})
    if len(match_suffixes) > 0:
        short_string = summarise_list(match_suffixes.keys(),
                                      max_obj_str_len=PrivCountNode.MAX_MATCH_LEN,
                                      sort_output=False)
        deepcopied_start_config[match_suffix_key] = short_string
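
# A hypothetical sketch of how the three summarise_* helpers fit together
# (the config key names are illustrative, not actual PrivCount keys): copy
# the config, shorten each kind of match data, then log the summary.
def _example_summarise_config_for_logging(start_config):
    from copy import deepcopy
    # the summarise_* helpers modify their argument in place
    config_summary = deepcopy(start_config)
    summarise_match_lists(config_summary, 'domain_lists')
    summarise_match_maps(config_summary, 'domain_maps')
    summarise_match_suffixes(config_summary, 'domain_suffixes')
    logging.debug("start config: {}".format(config_summary))
    return config_summary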
def get_noise_allocation(noise_parameters,
                         sanity_check=DEFAULT_DUMMY_COUNTER_NAME,
                         circuit_sample_rate=1.0):
    '''
    An adapter which wraps get_opt_privacy_allocation, extracting the
    parameters from the noise_parameters data structure, and updating
    noise_parameters with the calculated values.

    If sanity_check is not None, adds a sanity check counter to the result,
    with the counter name supplied in sanity_check, and values created using
    get_sanity_check_counter().

    Scales expected circuit counter values by circuit_sample_rate before
    allocating noise.

    Returns a data structure containing the results on success.
    Raises a ValueError on failure.

    The format of noise_parameters is:
    privacy:
        epsilon: float in
        delta: float in
        excess_noise_ratio: float in
        sigma_tolerance: float in optional default 1e-6
        epsilon_tolerance: float in optional default 1e-15
        sigma_ratio_tolerance: float in optional default 1e-6
        sigma_ratio: float out
    counters:
        'CounterName': multiple
            bins: optional, unused
                - [float, float, long optional] multiple, unused
            sensitivity: float in
            estimated_value: float in
            sigma: float out
            epsilon: float out
            expected_noise_ratio: float out

    The expected noise ratio should be identical for each counter, except
    for floating-point inaccuracies.
    '''
    assert circuit_sample_rate >= 0.0
    assert circuit_sample_rate <= 1.0
    # extract the top-level structures
    noise = noise_parameters['privacy']
    counters = noise_parameters['counters']
    excess_noise_ratio = noise['excess_noise_ratio']
    # rearrange the counter values, and produce the parameter-only structure
    stats_parameters = {}
    zero_sensitivities = []
    for stat in counters:
        sensitivity = counters[stat]['sensitivity']
        estimated_value = counters[stat]['estimated_value']
        if is_circuit_sample_counter(stat):
            estimated_value *= circuit_sample_rate
        if sensitivity == 0 and stat != DEFAULT_DUMMY_COUNTER_NAME:
            zero_sensitivities.append(stat)
        statistics = (sensitivity, estimated_value)
        stats_parameters[stat] = statistics
    if len(zero_sensitivities) > 0:
        # If you want a counter with no noise, try using 1e-6 instead
        logging.error("sensitivity for {} is zero, calculated sigmas will be zero for all statistics"
                      .format(summarise_list(zero_sensitivities)))
    # calculate the noise allocations
    # and update the structure with defaults, if not already present
    epsilons, sigmas, sigma_ratio = \
        get_opt_privacy_allocation(noise['epsilon'],
                                   noise['delta'],
                                   stats_parameters,
                                   excess_noise_ratio,
                                   sigma_tol=noise.setdefault(
                                       'sigma_tolerance',
                                       DEFAULT_SIGMA_TOLERANCE),
                                   epsilon_tol=noise.setdefault(
                                       'epsilon_tolerance',
                                       DEFAULT_EPSILON_TOLERANCE),
                                   sigma_ratio_tol=noise.setdefault(
                                       'sigma_ratio_tolerance',
                                       DEFAULT_SIGMA_RATIO_TOLERANCE))
    # update the structure with the results
    noise['sigma_ratio'] = sigma_ratio
    for stat in counters:
        counters[stat]['epsilon'] = epsilons[stat]
        counters[stat]['sigma'] = sigmas[stat]
        noise_ratio = get_expected_noise_ratio(
            excess_noise_ratio,
            counters[stat]['sigma'],
            counters[stat]['estimated_value'])
        counters[stat]['expected_noise_ratio'] = noise_ratio
    if sanity_check is not None:
        counters[sanity_check] = get_sanity_check_counter()
    return noise_parameters
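
# A hypothetical usage sketch of the structure described in the docstring
# above (the counter names, budget, and values are made up):
def _example_noise_allocation():
    noise_parameters = {
        'privacy': {
            'epsilon': 0.3,
            'delta': 1e-3,
            'excess_noise_ratio': 2.0,
            # the *_tolerance keys are optional: get_noise_allocation()
            # fills in the defaults
            },
        'counters': {
            'ExitStreamCount': {
                'sensitivity': 2000.0,
                'estimated_value': 1.5e6,
                },
            'EntryConnectionCount': {
                'sensitivity': 400.0,
                'estimated_value': 2.0e5,
                },
            },
        }
    result = get_noise_allocation(noise_parameters)
    # each counter now also has 'epsilon', 'sigma' and
    # 'expected_noise_ratio', and a sanity check counter has been added
    # under DEFAULT_DUMMY_COUNTER_NAME
    return result['privacy']['sigma_ratio']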
def suffix_match_prepare_collection(suffix_collection, separator="",
                                    existing_suffixes=None,
                                    collection_tag=-1,
                                    validate=True):
    '''
    Prepare a collection of strings for efficient suffix matching.
    If specified, the separator is also required before the suffix.
    For example, domain suffixes use "." as a separator between components.

    existing_suffixes is a previously prepared suffix data structure.
    If existing_suffixes is not None, add the new suffixes to
    existing_suffixes, and return existing_suffixes.

    Each suffix is terminated with collection_tag, which can be used to
    distinguish between suffixes from different lists. collection_tag must
    be a non-dict type that is not None.

    If multiple lists are prepared using the same existing_suffixes, then
    the final suffix data structure is disjoint. Any duplicate or longer
    suffixes are eliminated, and a warning is logged. (A suffix that appears
    in multiple input lists is output in the earliest list it appears in. A
    shorter suffix in a later list replaces any longer suffixes in earlier
    lists.)

    If validate is True, checks that suffix_match() finds each item in
    suffix_collection.

    Returns a tuple containing an object that can be passed to
    suffix_match(), and a boolean that is True if any duplicate domains were
    found. The object must be treated as opaque and read-only.
    '''
    assert suffix_collection is not None
    assert is_collection_tag_valid(collection_tag)
    #assert type(existing_suffixes) == dict
    # Create a tree of suffix components using nested python dicts
    # the terminal object is an empty dict
    if existing_suffixes is None:
        suffix_obj = {}
    else:
        suffix_obj = existing_suffixes
    longer_suffix_list = []
    duplicates = False
    for insert_string in suffix_collection:
        #assert type(suffix_obj) == dict
        insert_list = suffix_match_split(insert_string, separator=separator)
        prev_suffix_node = None
        suffix_node = suffix_obj
        # did we terminate the loop early due to a longer suffix?
        has_longer_suffix = False
        for insert_component in insert_list:
            # since we have stripped periods from the start and end, a
            # double dot is almost certainly a typo
            assert len(insert_component) > 0
            # we are allowed to add any child to the root
            # but we cannot add a child to an existing terminal object
            # because the existing tree contains a shorter suffix of
            # the string we are inserting
            #assert type(suffix_node) == dict
            next_suffix_node = suffix_node.get(insert_component)
            if is_collection_tag_valid(next_suffix_node):
                # there is an existing suffix that terminates here, and we
                # are about to insert a longer suffix. Instead, ignore the
                # longer suffix
                has_longer_suffix = True
                longer_suffix_list.append(insert_string)
                break
            # walk the tree, adding an entry for this suffix
            prev_suffix_node = suffix_node
            suffix_node = (next_suffix_node if next_suffix_node is not None
                           else suffix_node.setdefault(insert_component, {}))
        # we cannot have children in our terminal object
        # because the existing tree contains longer strings, and we are
        # a shorter suffix of all those existing strings
        if (not has_longer_suffix and
                not is_collection_tag_valid(suffix_node) and
                len(suffix_node) > 0):
            duplicates = True
            child_summary = summarise_list(suffix_node.keys())
            child_all = " ".join(suffix_node.keys())
            logging.warning("Adding shorter suffix {} for collection {}, pruning existing children {}"
                            .format(insert_string, collection_tag,
                                    child_summary))
            logging.debug("Adding shorter suffix {} for collection {}, pruning existing children {}"
                          .format(insert_string, collection_tag, child_all))
        # now, place (or replace) the end of the domain with the collection tag
        if not has_longer_suffix:
            #assert prev_suffix_node is not None
            prev_suffix_node[insert_component] = collection_tag
        # Now check that each item actually matches one of the lists
        # Allow the lists to have overlaps, we'll check that later
        if validate:
            suffix_match_validate_item(suffix_obj, insert_string,
                                       suffix_collection,
                                       separator=separator,
                                       expected_collection_tag=collection_tag,
                                       reject_overlapping_lists=False)
    if len(longer_suffix_list) > 0:
        duplicates = True
        suffix_summary = summarise_list(longer_suffix_list)
        suffix_all = " ".join(longer_suffix_list)
        logging.warning("Suffix match for {} ignored longer suffixes {}"
                        .format(collection_tag, suffix_summary))
        logging.debug("Suffix match for {} ignored longer suffixes {}"
                      .format(collection_tag, suffix_all))
    # the encoded json measures transmission size, not RAM size
    logging.info("Suffix match {} prepared {} items ({})"
                 .format(collection_tag, len(suffix_collection),
                         format_bytes(len(json_serialise(suffix_obj)))))
    return (suffix_obj, duplicates)
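
# A hypothetical sketch of the resulting tree (assuming, as the pruning
# logic above implies, that suffix_match_split() stores domain components
# in reverse order):
def _example_suffix_tree():
    suffix_obj, duplicates = suffix_match_prepare_collection(
        ['example.com', 'torproject.org'], separator=".", collection_tag=0)
    # suffix_obj is a nested dict along the lines of:
    #   {'com': {'example': 0}, 'org': {'torproject': 0}}
    # adding 'www.example.com' to a later list that shares this tree would
    # be ignored and logged, because 'example.com' already matches it
    return (suffix_obj, duplicates)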
def exact_match_prepare_collection(exact_collection, existing_exacts=None,
                                   validate=True, match_onion_md5=False):
    '''
    Prepare a hashable object collection for efficient exact matching.
    If the objects in the collection are strings, lowercases them.

    existing_exacts is a list of previously prepared collections.
    If existing_exacts is not None, append the new collection to
    existing_exacts, as well as returning the prepared collection.

    If multiple lists are prepared using the same existing_exacts, then the
    final lists are disjoint. Any duplicate items are ignored, and a warning
    is logged. (An item that appears in multiple input lists is output in
    the earliest list it appears in.)

    If validate is True, checks that exact_match() returns True for each
    item in exact_collection.

    If match_onion_md5 is True, also try to validate matches for:
        hashlib.md5(item + '.onion').hexdigest()
    If item is a string, it is lowercased before hashing.

    Returns an object that can be passed to exact_match().
    This object must be treated as opaque and read-only.
    '''
    assert exact_collection is not None
    onion_match = " (and onion md5)" if match_onion_md5 else ""
    # Set matching uses a hash table, so it's more efficient
    exact_collection_lower = [lower_if_hasattr(obj)
                              for obj in exact_collection]
    exact_set = frozenset(exact_collection_lower)
    # Log a message if there were any duplicates
    # Finding each duplicate takes a lot longer
    if len(exact_collection) != len(exact_set):
        dups = [obj for obj in exact_set
                if exact_collection_lower.count(lower_if_hasattr(obj)) > 1]
        logging.warning("Removing {} duplicates{} within this collection"
                        .format(summarise_list(dups), onion_match))
    # the encoded json measures transmission size, not RAM size
    logging.info("Exact match prepared {} items{} ({})"
                 .format(len(exact_set), onion_match,
                         format_bytes(len(json_serialise(list(exact_set))))))
    # Check that each item actually matches the list
    if validate:
        for item in exact_collection:
            exact_match_validate_item(exact_set, item, exact_collection,
                                      match_onion_md5=match_onion_md5)
    if existing_exacts is None:
        return exact_set
    else:
        # Remove any items that appear in earlier lists
        #
        # If an onion address is in a different format in an earlier list,
        # this code won't remove it.
        #
        # Depending on the exact configuration:
        # - the tally server will fail to validate the lists, or
        # - the data collectors will count events for the duplicate onion
        #   address against the first list that contains the address
        #   (in any format).
        disjoint_exact_set = exact_set.difference(*existing_exacts)
        duplicate_exact_set = exact_set.difference(disjoint_exact_set)
        if len(duplicate_exact_set) > 0:
            logging.warning("Removing {} duplicates{} that are also in an earlier collection"
                            .format(summarise_list(duplicate_exact_set),
                                    onion_match))
        existing_exacts.append(disjoint_exact_set)
        return disjoint_exact_set
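
# A hypothetical sketch of disjoint list preparation (the items are made
# up): an item that appears in an earlier list is dropped, with a warning,
# from later lists.
def _example_disjoint_exact_lists():
    existing_exacts = []
    first = exact_match_prepare_collection(['A.Example', 'b.example'],
                                           existing_exacts=existing_exacts)
    second = exact_match_prepare_collection(['B.Example', 'c.example'],
                                            existing_exacts=existing_exacts)
    # first is frozenset(['a.example', 'b.example']); second contains only
    # 'c.example', because 'b.example' already appeared in the first list
    return (first, second)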