Code example #1
    def test_float_64_larger_than_2_pow_64(self):
        secure_sum_f = secure.SecureSumFactory(
            upper_bound_threshold=np.array(2**66, dtype=np.float64))
        process = secure_sum_f.create(computation_types.to_type(tf.float64))
        client_data = [
            np.array(2**65, np.float64),
            np.array(2**65, np.float64),
            np.array(2**66, np.float64)
        ]

        state = process.initialize()
        output = process.next(state, client_data)
        self.assertAllClose(np.array(2**67, np.float64), output.result)
        self._check_measurements(
            output.measurements,
            expected_secure_upper_clipped_count=0,
            expected_secure_lower_clipped_count=0,
            expected_secure_upper_threshold=np.array(2**66, np.float64),
            expected_secure_lower_threshold=np.array(-2**66, np.float64))
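A companion sketch (an assumption, mirroring the harness above: `_check_measurements` and the imports are as in the surrounding test class, and a single float `upper_bound_threshold` implies a symmetric lower bound, as in the test above) showing the behavior the measurements report: a client value above the upper threshold is clipped before summation and counted in `secure_upper_clipped_count`.

    def test_float_clipped_above_upper_bound(self):
        secure_sum_f = secure.SecureSumFactory(
            upper_bound_threshold=np.array(1.0, dtype=np.float32))
        process = secure_sum_f.create(computation_types.to_type(tf.float32))
        client_data = [
            np.array(0.5, np.float32),
            np.array(2.0, np.float32)  # Exceeds the bound; clipped to 1.0.
        ]

        state = process.initialize()
        output = process.next(state, client_data)
        # Sum after clipping: 0.5 + 1.0 = 1.5.
        self.assertAllClose(np.array(1.5, np.float32), output.result)
        self._check_measurements(
            output.measurements,
            expected_secure_upper_clipped_count=1,
            expected_secure_lower_clipped_count=0,
            expected_secure_upper_threshold=np.array(1.0, np.float32),
            expected_secure_lower_threshold=np.array(-1.0, np.float32))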
Code example #2
    def _build_auto_l2_clip_process(self, target_quantile, learning_rate,
                                    clip_count_stddev):
        """Builds a `tff.templates.EstimationProcess` for adaptive L2 clipping.

    Specifically, we use the private quantile estimation algorithm described in
    https://arxiv.org/abs/1905.03871 for choosing the adaptive L2 clip norm.
    The default noise level for the procedure follows the paper and the
    implementation of `tff.aggregators.DifferentiallyPrivateFactory`.

    Note that for consistency with the use of secure aggregation for the client
    values, the binary flags as part of the quantile estimation procedure
    indicating whether client L2 norms are below the current estimate are also
    securely aggregated.

    Args:
      target_quantile: See `auto_l2_target_quantile` at __init__ docstring.
      learning_rate: See `auto_l2_lr` at __init__ docstring.
      clip_count_stddev: See `auto_l2_clip_count_stddev` at __init__ docstring.

    Returns:
      The `EstimationProcess` for adaptive L2 clipping and the required noise
      multiplier for the record aggregation.
    """
        value_noise_mult, clip_count_stddev = (
            differential_privacy.adaptive_clip_noise_params(
                self._noise_multiplier, self._num_clients, clip_count_stddev))

        estimator_query = tfp.QuantileEstimatorQuery(
            initial_estimate=self._initial_l2_clip,
            target_quantile=target_quantile,
            learning_rate=learning_rate,
            below_estimate_stddev=clip_count_stddev,
            expected_num_records=self._num_clients,
            geometric_update=True)
        # Note also that according to https://arxiv.org/abs/1905.03871, the binary
        # flags for quantile estimation are shifted from [0, 1] to [-0.5, 0.5], so
        # we set the SecAgg input bounds accordingly.
        estimator_process = quantile_estimation.PrivateQuantileEstimationProcess(
            quantile_estimator_query=estimator_query,
            record_aggregation_factory=secure.SecureSumFactory(
                upper_bound_threshold=0.5, lower_bound_threshold=-0.5))

        return estimator_process, value_noise_mult
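How the returned process is driven is not shown above. A minimal sketch of the intended loop, under the assumption that the result follows the standard `tff.templates.EstimationProcess` interface (`initialize`/`next`/`report`); `agg`, the argument values, and `client_below_estimate_flags` are hypothetical stand-ins:

# Hypothetical driver (sketch): estimate, aggregate flags, update.
clip_process, value_noise_mult = agg._build_auto_l2_clip_process(
    target_quantile=0.5, learning_rate=0.2, clip_count_stddev=100.0)
state = clip_process.initialize()
current_l2_clip = clip_process.report(state)  # Current clip norm estimate.
# `client_below_estimate_flags` stands in for the securely summed
# {-0.5, +0.5} indicator flags from clients.
state = clip_process.next(state, client_below_estimate_flags)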
Code example #3
def _default_clipping(
        inner_factory: factory.AggregationFactory,
        secure_estimation: bool = False) -> factory.AggregationFactory:
    """The default adaptive clipping wrapper."""

    # Adapts relatively quickly to a moderately high norm.
    clipping_norm = quantile_estimation.PrivateQuantileEstimationProcess.no_noise(
        initial_estimate=1.0,
        target_quantile=0.8,
        learning_rate=0.2,
        secure_estimation=secure_estimation)
    if secure_estimation:
        secure_count_factory = secure.SecureSumFactory(upper_bound_threshold=1,
                                                       lower_bound_threshold=0)
        return robust.clipping_factory(
            clipping_norm,
            inner_factory,
            clipped_count_sum_factory=secure_count_factory)
    else:
        return robust.clipping_factory(clipping_norm, inner_factory)
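A short usage sketch (an assumption about the caller's setup: `mean.UnweightedMeanFactory` from `tff.aggregators` is used as the inner factory): with `secure_estimation=True`, both the quantile estimate and the clipped-count metric avoid unsecure aggregation.

# Usage sketch: wrap an unweighted mean in the default adaptive clipping.
aggregator = _default_clipping(
    mean.UnweightedMeanFactory(), secure_estimation=True)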
Code example #4
    def test_type_properties_adaptive_bounds(self, value_type, dtype):
        upper_bound_process = _test_estimation_process(1)
        lower_bound_process = _test_estimation_process(-1)
        secure_sum_f = secure.SecureSumFactory(
            upper_bound_threshold=upper_bound_process,
            lower_bound_threshold=lower_bound_process)
        self.assertIsInstance(secure_sum_f,
                              factory.UnweightedAggregationFactory)
        value_type = computation_types.to_type(value_type)
        process = secure_sum_f.create(value_type)
        self.assertIsInstance(process, aggregation_process.AggregationProcess)

        threshold_type = upper_bound_process.report.type_signature.result.member
        expected_state_type = computation_types.at_server(
            computation_types.to_type((threshold_type, threshold_type)))
        expected_measurements_type = _measurements_type(dtype)

        expected_initialize_type = computation_types.FunctionType(
            parameter=None, result=expected_state_type)
        self.assertTrue(
            process.initialize.type_signature.is_equivalent_to(
                expected_initialize_type))

        expected_next_type = computation_types.FunctionType(
            parameter=collections.OrderedDict(
                state=expected_state_type,
                value=computation_types.at_clients(value_type)),
            result=measured_process.MeasuredProcessOutput(
                state=expected_state_type,
                result=computation_types.at_server(value_type),
                measurements=expected_measurements_type))
        self.assertTrue(
            process.next.type_signature.is_equivalent_to(expected_next_type))
        try:
            static_assert.assert_not_contains_unsecure_aggregation(
                process.next)
        except:  # pylint: disable=bare-except
            self.fail('Factory returned an AggregationProcess containing '
                      'non-secure aggregation.')
Code example #5
def _default_zeroing(
        inner_factory: factory.AggregationFactory,
        secure_estimation: bool = False) -> factory.AggregationFactory:
    """The default adaptive zeroing wrapper."""

    # Adapts very quickly to a value somewhat higher than the highest values so
    # far seen.
    zeroing_norm = quantile_estimation.PrivateQuantileEstimationProcess.no_noise(
        initial_estimate=10.0,
        target_quantile=0.98,
        learning_rate=math.log(10.0),
        multiplier=2.0,
        increment=1.0,
        secure_estimation=secure_estimation)
    if secure_estimation:
        secure_count_factory = secure.SecureSumFactory(upper_bound_threshold=1,
                                                       lower_bound_threshold=0)
        return robust.zeroing_factory(
            zeroing_norm,
            inner_factory,
            zeroed_count_sum_factory=secure_count_factory)
    else:
        return robust.zeroing_factory(zeroing_norm, inner_factory)
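The `multiplier` and `increment` arguments deserve a note: assuming `no_noise` post-processes its quantile estimate affinely (threshold = multiplier * estimate + increment, as the argument names suggest), the zeroing norm sits deliberately above the estimated 98th percentile. A tiny worked sketch:

# Sketch of the assumed affine post-processing of the quantile estimate.
def zeroing_threshold(estimate, multiplier=2.0, increment=1.0):
    return multiplier * estimate + increment

# An estimate of 10.0 yields a zeroing norm of 2.0 * 10.0 + 1.0 = 21.0,
# comfortably above the values that produced the estimate.
assert zeroing_threshold(10.0) == 21.0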
Code example #6
def secure_aggregator(
        zeroing: bool = True,
        clipping: bool = True) -> factory.WeightedAggregationFactory:
    """Creates secure aggregator with adaptive zeroing and clipping.

  Zeroes out extremely large values for robustness to data corruption on
  clients, clips to moderately high norm for robustness to outliers. After
  weighting in mean, the weighted values are summed using cryptographic protocol
  ensuring that the server cannot see individual updates until sufficient number
  of updates have been added together. For details, see Bonawitz et al. (2017)
  https://dl.acm.org/doi/abs/10.1145/3133956.3133982. In TFF, this is realized
  using the `tff.federated_secure_sum` operator.

  Args:
    zeroing: Whether to enable adaptive zeroing for data corruption mitigation.
    clipping: Whether to enable adaptive clipping in the L2 norm for robustness.
      Note this clipping is performed prior to the per-coordinate clipping
      required for secure aggregation.

  Returns:
    A `tff.aggregators.WeightedAggregationFactory`.
  """
    secure_clip_bound = quantile_estimation.PrivateQuantileEstimationProcess.no_noise(
        initial_estimate=50.0,
        target_quantile=0.95,
        learning_rate=1.0,
        multiplier=2.0)
    factory_ = mean.MeanFactory(secure.SecureSumFactory(secure_clip_bound))

    if clipping:
        factory_ = _default_clipping(factory_)

    if zeroing:
        factory_ = _default_zeroing(factory_)

    return factory_
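A hypothetical end-to-end placement (a sketch; `model_fn` and the surrounding `tff.learning.algorithms.build_weighted_fed_avg` call are assumptions about the caller's setup, not part of this file):

# Hypothetical usage (sketch): plug the secure aggregator into federated
# averaging as the model-update aggregator.
learning_process = tff.learning.algorithms.build_weighted_fed_avg(
    model_fn=model_fn,
    client_optimizer_fn=lambda: tf.keras.optimizers.SGD(learning_rate=0.1),
    model_aggregator=secure_aggregator(zeroing=True, clipping=True))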
Code example #7
    def test_incorrect_value_type_raises(self, bad_value_type):
        secure_sum_f = secure.SecureSumFactory(1.0, -1.0)
        with self.assertRaises(TypeError):
            secure_sum_f.create(bad_value_type)
Code example #8
    def test_value_type_incompatible_with_config_mode_raises_two_processes(
            self):
        secure_sum_f = secure.SecureSumFactory(_test_estimation_process(1),
                                               _test_estimation_process(-1))
        with self.assertRaises(TypeError):
            secure_sum_f.create(computation_types.TensorType(tf.int32))
Code example #9
    def test_value_type_incompatible_with_config_mode_raises_float(
            self, upper, lower):
        secure_sum_f = secure.SecureSumFactory(upper, lower)
        with self.assertRaises(TypeError):
            secure_sum_f.create(computation_types.TensorType(tf.int32))
Code example #10
    def test_int_ranges_beyond_2_pow_32(self):
        secure_sum_f = secure.SecureSumFactory(2**33, -2**33)
        # Bounds this large should be provided only with tf.int64 value_type.
        process = secure_sum_f.create(computation_types.TensorType(tf.int64))
        self.assertEqual(
            process.next.type_signature.result.result.member.dtype, tf.int64)
Code example #11
    def test_upper_bound_not_larger_than_lower_bound_raises(
            self, upper, lower):
        with self.assertRaises(ValueError):
            secure.SecureSumFactory(upper, lower)
Code example #12
def secure_sum_then_finalize(
    metric_finalizers: model_lib.MetricFinalizersType,
    local_unfinalized_metrics_type: computation_types.StructWithPythonType,
    metric_value_ranges: Optional[MetricValueRangeDict] = None
) -> computation_base.Computation:
    """Creates a TFF computation that aggregates metrics using secure summation.

  The returned federated TFF computation has the following type signature:

  ```
  (local_unfinalized_metrics@CLIENTS ->
   <aggregated_metrics@SERVER, secure_sum_measurements@SERVER)
  ```

  where the input is given by
  `tff.learning.Model.report_local_unfinalized_metrics()` at `CLIENTS`, and the
  first output (`aggregated_metrics`) is computed by first securely summing the
  unfinalized metrics from `CLIENTS`, followed by applying the finalizers at
  `SERVER`. The second output (`secure_sum_measurements`) is an `OrderedDict`
  that maps from `factory_key`s to the secure summation measurements (e.g. the
  number of clients gets clipped. See `tff.aggregators.SecureSumFactory` for
  details). A `factory_key` is uniquely defined by three scalars: lower bound,
  upper bound, and tensor dtype (denoted as datatype enum). Metric values of the
  same `factory_key` are grouped and aggegrated together (and hence, the
  `secure_sum_measurements` are also computed at a group level).

  Since secure summation works in fixed-point arithmetic space, floating point
  numbers must be encoding using integer quantization. By default, each tensor
  in `local_unfinalized_metrics_type` will be clipped to `[0, 2**20 - 1]` and
  encoded to integers inside `tff.aggregators.SecureSumFactory`. Callers can
  change this range by setting `metric_value_ranges`, which may be a partial
  tree matching the structure of `local_unfinalized_metrics_type`.

  Example partial value range specification:

  >>> finalizers = ...
  >>> metrics_type = tff.to_type(collections.OrderedDict(
      a=tff.types.TensorType(tf.int32),
      b=tff.types.TensorType(tf.float32),
      c=[tff.types.TensorType(tf.float32), tff.types.TensorType(tf.float32)])
  >>> value_ranges = collections.OrderedDict(
      b=(0.0, 1.0),
      c=[None, (0.0, 1.0)])
  >>> aggregator = tff.learning.metrics.secure_sum_then_finalize(
      finalizers, metrics_type, value_ranges)

  This sets the range of the *second* tensor of `b` in the dictionary, using the
  range for the first tensor, and the `a` tensor.

  Args:
    metric_finalizers: An `OrderedDict` of `string` metric names to finalizer
      functions returned by `tff.learning.Model.metric_finalizers()`. It should
      have the same keys (i.e., metric names) as the `OrderedDict` returned by
      `tff.learning.Model.report_local_unfinalized_metrics()`. A finalizer is a
      callable (typically `tf.function` or `tff.tf_computation` decoreated
      function) that takes in a metric's unfinalized values, and returns the
      finalized values.
    local_unfinalized_metrics_type: A `tff.types.StructWithPythonType` (with
      `OrderedDict` as the Python container) of a client's local unfinalized
      metrics. Let `local_unfinalized_metrics` be the output of
      `tff.learning.Model.report_local_unfinalized_metrics()`. Its type can be
      obtained by `tff.framework.type_from_tensors(local_unfinalized_metrics)`.
    metric_value_ranges: A `collections.OrderedDict` that matches the structure
      of `local_unfinalized_metrics_type` (a value for each
      `tff.types.TensorType` in the type tree). Each leaf in the tree should
      have a 2-tuple that defines the range of expected values for that variable
      in the metric. If the entire structure is `None`, a default range of
      `[0.0, 2.0**20 - 1]` will be applied to all variables. Each leaf may also
      be `None`, which will also get the default range; allowing partial user
      sepcialization. At runtime, values that fall outside the ranges specified
      at the leaves, those values will be clipped to within the range.

  Returns:
    A federated TFF computation that securely sums the unfinalized metrics from
    `CLIENTS`, and applies the correponding finalizers at `SERVER`.

  Raises:
    TypeError: If the inputs are of the wrong types.
    ValueError: If the keys (i.e., metric names) in `metric_finalizers` are not
      the same as those expected by `local_unfinalized_metrics_type`.
  """
    check_metric_finalizers(metric_finalizers)
    check_local_unfinalzied_metrics_type(local_unfinalized_metrics_type)
    check_finalizers_matches_unfinalized_metrics(
        metric_finalizers, local_unfinalized_metrics_type)

    default_metric_value_ranges = create_default_secure_sum_quantization_ranges(
        local_unfinalized_metrics_type)
    if metric_value_ranges is None:
        metric_value_ranges = default_metric_value_ranges

    # Walk the incoming `metric_value_ranges` and `default_metric_value_ranges`
    # and fill in any missing ranges using the defaults.
    def fill_missing_values_with_defaults(default_values, user_values):
        if isinstance(default_values, collections.abc.Mapping):
            if user_values is None:
                user_values = {}
            return type(default_values)(
                (key,
                 fill_missing_values_with_defaults(default_value,
                                                   user_values.get(key)))
                for key, default_value in default_values.items())
        elif isinstance(default_values, list):
            if user_values is None:
                user_values = [None] * len(default_values)
            return [
                fill_missing_values_with_defaults(default_value,
                                                  user_values[idx])
                for idx, default_value in enumerate(default_values)
            ]
        elif user_values is None:
            return _MetricRange(*default_values)
        else:
            _check_range(user_values)
            return _MetricRange(*user_values)

    try:
        metric_value_ranges = fill_missing_values_with_defaults(
            default_metric_value_ranges, metric_value_ranges)
    except TypeError as e:
        raise TypeError('Failed to create encoding value range from: '
                        f'{metric_value_ranges}') from e

    # Create an aggregator factory for each unique value range, rather than each
    # leaf tensor (which could introduce a lot of duplication).
    aggregator_factories = {
        value_range: secure.SecureSumFactory(value_range.upper,
                                             value_range.lower)
        for value_range in set(tree.flatten(metric_value_ranges))
    }
    # Construct a python container of `tff.TensorType` so we can traverse it in
    # parallel with the value ranges during AggregationProcess construction.
    # Otherwise we would have a `tff.Type` while `metric_value_ranges` is a
    # Python container; the two are difficult to traverse in parallel.
    structure_of_tensor_types = type_conversions.structure_from_tensor_type_tree(
        lambda t: t, local_unfinalized_metrics_type)

    # We will construct groups of tensors with the same dtype and quantization
    # value range so that we can construct fewer aggregations-of-structures,
    # rather than a large structure-of-aggregations. Without this, the TFF
    # compiler pipeline results in large slow downs (see b/218312198).
    factory_key_by_path = collections.OrderedDict()
    value_range_by_factory_key = collections.OrderedDict()
    path_list_by_factory_key = collections.defaultdict(list)
    # Maintain a flattened list of paths. This is useful to flatten the aggregated
    # values, which will then be used by `tf.nest.pack_sequence_as`.
    flattened_path_list = []
    for (path, tensor_spec), (_, value_range) in zip(
            tree.flatten_with_path(structure_of_tensor_types),
            tree.flatten_with_path(metric_value_ranges)):
        factory_key = _create_factory_key(value_range.lower, value_range.upper,
                                          tensor_spec.dtype)
        factory_key_by_path[path] = factory_key
        value_range_by_factory_key[factory_key] = value_range
        path_list_by_factory_key[factory_key].append(path)
        flattened_path_list.append(path)

    @tensorflow_computation.tf_computation(local_unfinalized_metrics_type)
    def group_value_by_factory_key(local_unfinalized_metrics):
        """Groups client local metrics into a map of `factory_key` to value list."""
        # We cannot use `collections.defaultdict(list)` here because its result is
        # incompatible with `structure_from_tensor_type_tree`.
        value_list_by_factory_key = collections.OrderedDict()
        for path, value in tree.flatten_with_path(local_unfinalized_metrics):
            factory_key = factory_key_by_path[path]
            if factory_key in value_list_by_factory_key:
                value_list_by_factory_key[factory_key].append(value)
            else:
                value_list_by_factory_key[factory_key] = [value]
        return value_list_by_factory_key

    def flatten_grouped_values(value_list_by_factory_key):
        """Flatten the values in the same order as in `flattened_path_list`."""
        value_by_path = collections.OrderedDict()
        for factory_key in value_list_by_factory_key:
            path_list = path_list_by_factory_key[factory_key]
            value_list = value_list_by_factory_key[factory_key]
            for path, value in zip(path_list, value_list):
                value_by_path[path] = value
        flattened_value_list = [
            value_by_path[path] for path in flattened_path_list
        ]
        return flattened_value_list

    # Create an aggregation process for each factory key.
    aggregation_process_by_factory_key = collections.OrderedDict()
    # Construct a python container of `tff.TensorType` so we can traverse it and
    # create aggregation processes from the factories.
    tensor_type_list_by_factory_key = (
        type_conversions.structure_from_tensor_type_tree(
            lambda t: t, group_value_by_factory_key.type_signature.result))
    for factory_key, tensor_type_list in tensor_type_list_by_factory_key.items(
    ):
        value_range = value_range_by_factory_key[factory_key]
        aggregation_process_by_factory_key[
            factory_key] = aggregator_factories.get(value_range).create(
                computation_types.to_type(tensor_type_list))

    @federated_computation.federated_computation(
        computation_types.at_clients(local_unfinalized_metrics_type))
    def aggregator_computation(client_local_unfinalized_metrics):
        unused_state = intrinsics.federated_value((), placements.SERVER)

        client_local_grouped_unfinalized_metrics = intrinsics.federated_map(
            group_value_by_factory_key, client_local_unfinalized_metrics)
        metrics_aggregation_output = collections.OrderedDict()
        for factory_key, process in aggregation_process_by_factory_key.items():
            metrics_aggregation_output[factory_key] = process.next(
                unused_state,
                client_local_grouped_unfinalized_metrics[factory_key])

        metrics_aggregation_output = intrinsics.federated_zip(
            metrics_aggregation_output)

        @tensorflow_computation.tf_computation(
            metrics_aggregation_output.type_signature.member)
        def finalizer_computation(grouped_aggregation_output):

            # One minor downside of grouping the aggregation processes is that the
            # SecAgg measurements (e.g., clipped_count) are computed at a group level
            # (a group means all metric values belonging to the same `factory_key`).
            secure_sum_measurements = collections.OrderedDict(
                (factory_key, output.measurements)
                for factory_key, output in grouped_aggregation_output.items())
            finalized_metrics = collections.OrderedDict(
                secure_sum_measurements=secure_sum_measurements)
            grouped_unfinalized_metrics = collections.OrderedDict(
                (factory_key, output.result)
                for factory_key, output in grouped_aggregation_output.items())
            flattened_unfinalized_metrics_list = flatten_grouped_values(
                grouped_unfinalized_metrics)
            unfinalized_metrics = tf.nest.pack_sequence_as(
                structure_of_tensor_types, flattened_unfinalized_metrics_list)
            for metric_name, metric_finalizer in metric_finalizers.items():
                finalized_metrics[metric_name] = metric_finalizer(
                    unfinalized_metrics[metric_name])
            return finalized_metrics

        return intrinsics.federated_map(finalizer_computation,
                                        metrics_aggregation_output)

    return aggregator_computation
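The grouping that `_create_factory_key` enables can be illustrated standalone. A minimal sketch (the real key format is not shown above, so the `make_key` helper and the ranges here are hypothetical): leaves sharing (lower, upper, dtype) collapse into one group, so one `SecureSumFactory` serves many metric tensors.

import collections

# Hypothetical stand-in for `_create_factory_key`.
def make_key(lower, upper, dtype_name):
    return f'{lower}_{upper}_{dtype_name}'

ranges = collections.OrderedDict(
    a=(0.0, 2.0**20 - 1), b=(0.0, 1.0), c=(0.0, 1.0))
groups = collections.defaultdict(list)
for name, (lower, upper) in ranges.items():
    groups[make_key(lower, upper, 'float32')].append(name)
# Two groups: {'0.0_1048575.0_float32': ['a'], '0.0_1.0_float32': ['b', 'c']}
# => two aggregation processes instead of three.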
Code example #13
class IbltFactoryTest(tf.test.TestCase, parameterized.TestCase):
    def setUp(self):
        super().setUp()
        execution_contexts.set_test_python_execution_context()

    def test_capacity_validation(self):
        with self.assertRaisesRegex(ValueError, 'capacity'):
            iblt_factory.IbltFactory(capacity=0,
                                     string_max_length=10,
                                     repetitions=3,
                                     seed=0)
        with self.assertRaisesRegex(ValueError, 'capacity'):
            iblt_factory.IbltFactory(capacity=-1,
                                     string_max_length=10,
                                     repetitions=3,
                                     seed=0)
        # Should not raise
        iblt_factory.IbltFactory(capacity=1,
                                 string_max_length=10,
                                 repetitions=3,
                                 seed=0)

    def test_string_max_length_validation(self):
        with self.assertRaisesRegex(ValueError, 'string_max_length'):
            iblt_factory.IbltFactory(string_max_length=0,
                                     capacity=10,
                                     repetitions=3,
                                     seed=0)
        with self.assertRaisesRegex(ValueError, 'string_max_length'):
            iblt_factory.IbltFactory(string_max_length=-1,
                                     capacity=10,
                                     repetitions=3,
                                     seed=0)
        # Should not raise
        iblt_factory.IbltFactory(string_max_length=1,
                                 capacity=10,
                                 repetitions=3,
                                 seed=0)

    def test_repetitions_validation(self):
        with self.assertRaisesRegex(ValueError, 'repetitions'):
            iblt_factory.IbltFactory(repetitions=0,
                                     capacity=10,
                                     string_max_length=10,
                                     seed=0)
        with self.assertRaisesRegex(ValueError, 'repetitions'):
            iblt_factory.IbltFactory(repetitions=2,
                                     capacity=10,
                                     string_max_length=10,
                                     seed=0)
        # Should not raise
        iblt_factory.IbltFactory(repetitions=3,
                                 capacity=10,
                                 string_max_length=10,
                                 seed=0)

    @parameterized.named_parameters(
        ('scalar',
         computation_types.SequenceType(
             computation_types.TensorType(shape=(), dtype=tf.int64))),
        ('list',
         computation_types.SequenceType(
             computation_types.TensorType(shape=(3, ), dtype=tf.int64))),
        ('dict_wrong_key',
         computation_types.SequenceType(
             collections.OrderedDict([
                 ('foo', tf.int64),
                 (iblt_factory.DATASET_VALUE,
                  computation_types.TensorType(shape=(1, ), dtype=tf.int64)),
             ]))),
        ('dict_extra_key',
         computation_types.SequenceType(
             collections.OrderedDict([
                 ('bar', tf.int64),
                 (iblt_factory.DATASET_KEY, tf.int64),
                 (iblt_factory.DATASET_VALUE,
                  computation_types.TensorType(shape=(1, ), dtype=tf.int64)),
             ]))),
        ('dict_int64_int64',
         computation_types.SequenceType(
             collections.OrderedDict([
                 (iblt_factory.DATASET_KEY, tf.int64),
                 (iblt_factory.DATASET_VALUE,
                  computation_types.TensorType(shape=(1, ), dtype=tf.int64)),
             ]))),
        ('dict_string_int32',
         computation_types.SequenceType(
             collections.OrderedDict([
                 (iblt_factory.DATASET_KEY, tf.string),
                 (iblt_factory.DATASET_VALUE,
                  computation_types.TensorType(shape=(1, ), dtype=tf.int32)),
             ]))),
    )
    def test_value_type_validation(self, value_type):
        iblt_agg_factory = iblt_factory.IbltFactory(capacity=10,
                                                    string_max_length=5,
                                                    repetitions=3,
                                                    seed=0)
        with self.assertRaises(ValueError):
            iblt_agg_factory.create(value_type)

    def test_string_max_length_error(self):
        client = collections.OrderedDict([
            (iblt_factory.DATASET_KEY,
             tf.constant(['thisisalongword'], dtype=tf.string)),
            (iblt_factory.DATASET_VALUE, tf.constant([[1]], dtype=tf.int64)),
        ])
        value_type = computation_types.SequenceType(
            collections.OrderedDict(key=tf.string,
                                    value=computation_types.TensorType(
                                        shape=(1, ), dtype=tf.int64)))
        client_data = [tf.data.Dataset.from_tensor_slices(client)]
        iblt_agg_factory = iblt_factory.IbltFactory(capacity=10,
                                                    string_max_length=5,
                                                    repetitions=3,
                                                    seed=0)
        iblt_agg_process = iblt_agg_factory.create(value_type)
        with self.assertRaises(tf.errors.InvalidArgumentError):
            iblt_agg_process.next(iblt_agg_process.initialize(), client_data)

    @parameterized.named_parameters(
        {
            'testcase_name': 'default_factories',
            'capacity': 10,
            'string_max_length': 10,
            'repetitions': 3,
            'seed': 0,
        },
        {
            'testcase_name': 'sketch_secure_factory',
            'sketch_agg_factory': secure.SecureSumFactory(2**32 - 1),
            'capacity': 20,
            'string_max_length': 20,
            'repetitions': 3,
            'seed': 1,
        },
        {
            'testcase_name': 'tensor_value_sum_factory',
            'value_tensor_agg_factory': sum_factory.SumFactory(),
            'capacity': 100,
            'string_max_length': 10,
            'repetitions': 5,
            'seed': 5,
        },
        {
            'testcase_name': 'secure_sum_factories',
            'sketch_agg_factory': secure.SecureSumFactory(2**32 - 1),
            'value_tensor_agg_factory': secure.SecureSumFactory(2**32 - 1),
            'capacity': 10,
            'string_max_length': 10,
            'repetitions': 4,
            'seed': 5,
        },
    )
    def test_iblt_aggregation_as_expected(
        self,
        capacity: int,
        string_max_length: int,
        repetitions: int,
        seed: int,
        sketch_agg_factory: Optional[
            factory.UnweightedAggregationFactory] = None,
        value_tensor_agg_factory: Optional[
            factory.UnweightedAggregationFactory] = None):
        iblt_agg_factory = iblt_factory.IbltFactory(
            sketch_agg_factory=sketch_agg_factory,
            value_tensor_agg_factory=value_tensor_agg_factory,
            capacity=capacity,
            string_max_length=string_max_length,
            repetitions=repetitions,
            seed=seed)
        iblt_agg_process = iblt_agg_factory.create(VALUE_TYPE)
        process_output = iblt_agg_process.next(iblt_agg_process.initialize(),
                                               CLIENT_DATA)
        output_strings = [
            s.decode('utf-8') for s in process_output.result.output_strings
        ]
        string_values = process_output.result.string_values
        result = dict(zip(output_strings, string_values))

        self.assertCountEqual(result, AGGREGATED_DATA)

        expected_measurements = collections.OrderedDict([
            ('num_not_decoded', 0),
            ('sketch', ()),
            ('value_tensor', ()),
        ])
        self.assertCountEqual(process_output.measurements,
                              expected_measurements)
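The module-level constants `VALUE_TYPE`, `CLIENT_DATA`, and `AGGREGATED_DATA` are referenced but not shown. A plausible shape for the first two, inferred from the type-validation cases above (which accept string keys paired with shape-(1,) int64 value tensors; the concrete strings are illustrative only):

VALUE_TYPE = computation_types.SequenceType(
    collections.OrderedDict([
        (iblt_factory.DATASET_KEY, tf.string),
        (iblt_factory.DATASET_VALUE,
         computation_types.TensorType(shape=(1,), dtype=tf.int64)),
    ]))
CLIENT_DATA = [
    tf.data.Dataset.from_tensor_slices(
        collections.OrderedDict([
            (iblt_factory.DATASET_KEY, tf.constant(['a', 'b'])),
            (iblt_factory.DATASET_VALUE,
             tf.constant([[1], [1]], dtype=tf.int64)),
        ])),
]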
Code example #14
def create_hierarchical_histogram_aggregation_factory(
        num_bins: int,
        arity: int = 2,
        clip_mechanism: str = 'sub-sampling',
        max_records_per_user: int = 10,
        dp_mechanism: str = 'no-noise',
        noise_multiplier: float = 0.0,
        expected_clients_per_round: int = 10,
        bits: int = 22,
        enable_secure_sum: bool = True):
    """Creates hierarchical histogram aggregation factory.

  Hierarchical histogram factory is constructed by composing 3 aggregation
  factories.
  (1) The inner-most factory is `SumFactory`.
  (2) The middle factory is `DifferentiallyPrivateFactory` whose inner query is
      `TreeRangeSumQuery`. This factory 1) takes in a clipped histogram,
      constructs the hierarchical histogram and checks the norm bound of the
      hierarchical histogram at clients, 2) adds noise either at clients or at
      server according to `dp_mechanism`.
  (3) The outer-most factory is `HistogramClippingSumFactory` which clips the
      input histogram to bound each user's contribution.

  Args:
    num_bins: An `int` representing the input histogram size.
    arity: An `int` representing the branching factor of the tree. Defaults to
      2.
   clip_mechanism: A `str` representing the clipping mechanism. Currently
     supported mechanisms are
      - 'sub-sampling': (Default) Uniformly sample up to `max_records_per_user`
        records without replacement from the client dataset.
      - 'distinct': Uniquify client dataset and uniformly sample up to
        `max_records_per_user` records without replacement from it.
    max_records_per_user: An `int` representing the maximum of records each user
      can include in their local histogram. Defaults to 10.
    dp_mechanism: A `str` representing the differentially private mechanism to
      use. Currently supported mechanisms are
      - 'no-noise': (Default) Tree aggregation mechanism without noise.
      - 'central-gaussian': Tree aggregation with central Gaussian mechanism.
      - 'distributed-discrete-gaussian': Tree aggregation mechanism with
        distributed discrete Gaussian mechanism in "The Distributed Discrete
        Gaussian Mechanism for Federated Learning with Secure Aggregation. Peter
        Kairouz, Ziyu Liu, Thomas Steinke".
    noise_multiplier: A `float` specifying the noise multiplier (central noise
      stddev / L2 clip norm) for model updates. Only needed when `dp_mechanism`
      is not 'no-noise'. Defaults to 0.0.
    expected_clients_per_round: An `int` specifying the lower bound of the
      expected number of clients. Only needed when `dp_mechanism` is
      'distributed-discrete-gaussian. Defaults to 10.
    bits: A positive integer specifying the communication bit-width B (where
      2**B will be the field size for SecAgg operations). Only needed when
      `dp_mechanism` is 'distributed-discrete-gaussian'. Please read the below
      precautions carefully and set `bits` accordingly. Otherwise, unexpected
      overflow or accuracy degradation might happen. (1) Should be in the
      inclusive range [1, 22] to avoid overflow inside secure aggregation; (2)
      Should be at least as large as `log2(4 * sqrt(expected_clients_per_round)*
      noise_multiplier * l2_norm_bound + expected_clients_per_round *
      max_records_per_user) + 1` to avoid accuracy degradation caused by
      frequent modular clipping; (3) If the number of clients exceed
      `expected_clients_per_round`, overflow might happen.
    enable_secure_sum: Whether to aggregate client's update by secure sum or
      not. Defaults to `True`. When `dp_mechanism` is set to
      `'distributed-discrete-gaussian'`, `enable_secure_sum` must be `True`.

  Returns:
    `tff.aggregators.UnweightedAggregationFactory`.

  Raises:
    TypeError: If arguments have the wrong type(s).
    ValueError: If arguments have invalid value(s).
  """
    _check_positive(num_bins, 'num_bins')
    _check_greater_equal(arity, 2, 'arity')
    _check_membership(clip_mechanism, clipping_factory.CLIP_MECHANISMS,
                      'clip_mechanism')
    _check_positive(max_records_per_user, 'max_records_per_user')
    _check_membership(dp_mechanism, DP_MECHANISMS, 'dp_mechanism')
    _check_non_negative(noise_multiplier, 'noise_multiplier')
    _check_positive(expected_clients_per_round, 'expected_clients_per_round')
    _check_in_range(bits, 'bits', 1, 22)

    # Converts `max_records_per_user` to the corresponding norm bound according to
    # the chosen `clip_mechanism` and `dp_mechanism`.
    if dp_mechanism in ['central-gaussian', 'distributed-discrete-gaussian']:
        if clip_mechanism == 'sub-sampling':
            l2_norm_bound = max_records_per_user * math.sqrt(
                _tree_depth(num_bins, arity))
        elif clip_mechanism == 'distinct':
            # The following code block converts `max_records_per_user` to L2 norm
            # bound of the hierarchical histogram layer by layer. For the bottom
            # layer with only 0s and at most `max_records_per_user` 1s, the L2 norm
            # bound is `sqrt(max_records_per_user)`. For the second layer from bottom,
            # the worst case is only 0s and `max_records_per_user/2` 2s. And so on
            # until the root node. Another natural L2 norm bound on each layer is
            # `max_records_per_user` so we take the minimum between the two bounds.
            square_l2_norm_bound = 0.
            square_layer_l2_norm_bound = max_records_per_user
            for _ in range(_tree_depth(num_bins, arity)):
                square_l2_norm_bound += min(max_records_per_user**2,
                                            square_layer_l2_norm_bound)
                square_layer_l2_norm_bound *= arity
            l2_norm_bound = math.sqrt(square_l2_norm_bound)

    if not enable_secure_sum and dp_mechanism in DISTRIBUTED_DP_MECHANISMS:
        raise ValueError(f'When dp_mechanism is {DISTRIBUTED_DP_MECHANISMS}, '
                         'enable_secure_sum must be set to True to preserve '
                         'distributed DP.')

    # Build the nested aggregation factory from innermost to outermost.
    # 1. Sum factory. The innermost factory that sums the preprocessed records.
    # (1) If `enable_secure_sum` is `False`, this should be `SumFactory`.
    if not enable_secure_sum:
        nested_factory = sum_factory.SumFactory()
    else:
        # (2) If `enable_secure_sum` is `True` and `dp_mechanism` is 'no-noise'
        # or 'central-gaussian', the sum factory should be `SecureSumFactory`,
        # with an `upper_bound_threshold` of `max_records_per_user`. When
        # `dp_mechanism` is 'central-gaussian', use a float `SecureSumFactory`
        # to be compatible with `GaussianSumQuery`.
        if dp_mechanism in ['no-noise']:
            nested_factory = secure.SecureSumFactory(max_records_per_user)
        elif dp_mechanism in ['central-gaussian']:
            nested_factory = secure.SecureSumFactory(
                float(max_records_per_user))
        # (3) If `dp_mechanism` is in `DISTRIBUTED_DP_MECHANISMS`, should be
        #     `SecureSumFactory`. To preserve DP and avoid overflow, we have 4
        #    modular clips from nesting two modular clip aggregators:
        #    #1. outer-client: clips to [-2**(bits-1), 2**(bits-1))
        #        Bounds the client values.
        #    #2. inner-client: clips to [0, 2**bits)
        #        Similar to applying a two's complement to the values such that
        #        frequent values (post-rotation) are now near 0 (representing small
        #        positives) and 2**bits (small negatives). 0 also always maps to 0,
        #        and we do not require another explicit value range shift from
        #        [-2**(bits-1), 2**(bits-1)] to [0, 2**bits] to make sure that
        #        values are compatible with SecAgg's mod m = 2**bits. This can be
        #        reverted at #4.
        #    #3. inner-server: clips to [0, 2**bits)
        #        Ensures the aggregated value range does not grow by
        #        `log2(expected_clients_per_round)`.
        #        NOTE: If underlying SecAgg is implemented using the new
        #        `tff.federated_secure_modular_sum()` operator with the same
        #        modular clipping range, then this would correspond to a no-op.
        #    #4. outer-server: clips to [-2**(bits-1), 2**(bits-1))
        #        Keeps aggregated values centered near 0 out of the logical SecAgg
        #        black box for outer aggregators.
        elif dp_mechanism in ['distributed-discrete-gaussian']:
            # TODO(b/196312838): Please add scaling to the distributed case once we
            # have a stable guideline for setting scaling factor to improve
            # performance and avoid overflow. The below test is to make sure that
            # modular clipping happens with small probability so the accuracy of the
            # result won't be harmed. However, if the number of clients exceeds
            # `expected_clients_per_round`, overflow still might happen. It is the
            # caller's responsibility to carefully choose `bits` according to system
            # details to avoid overflow or performance degradation.
            if bits < math.log2(4 * math.sqrt(expected_clients_per_round) *
                                noise_multiplier * l2_norm_bound +
                                expected_clients_per_round *
                                max_records_per_user) + 1:
                raise ValueError(
                    f'The selected bit-width ({bits}) is too small for the '
                    f'given parameters (expected_clients_per_round = '
                    f'{expected_clients_per_round}, max_records_per_user = '
                    f'{max_records_per_user}, noise_multiplier = '
                    f'{noise_multiplier}) and will harm the accuracy of the '
                    f'result. Please decrease the '
                    f'`expected_clients_per_round` / `max_records_per_user` '
                    f'/ `noise_multiplier`, or increase `bits`.')
            nested_factory = secure.SecureSumFactory(
                upper_bound_threshold=2**bits - 1, lower_bound_threshold=0)
            nested_factory = modular_clipping_factory.ModularClippingSumFactory(
                clip_range_lower=0,
                clip_range_upper=2**bits,
                inner_agg_factory=nested_factory)
            nested_factory = modular_clipping_factory.ModularClippingSumFactory(
                clip_range_lower=-2**(bits - 1),
                clip_range_upper=2**(bits - 1),
                inner_agg_factory=nested_factory)

    # 2. DP operations.
    # Constructs `DifferentiallyPrivateFactory` according to the chosen
    # `dp_mechanism`.
    if dp_mechanism == 'central-gaussian':
        query = tfp.TreeRangeSumQuery.build_central_gaussian_query(
            l2_norm_bound, noise_multiplier * l2_norm_bound, arity)
        # If the inner `DifferentiallyPrivateFactory` uses `GaussianSumQuery`, then
        # the record is cast to `tf.float32` before feeding to the DP factory.
        cast_to_float = True
    elif dp_mechanism == 'distributed-discrete-gaussian':
        query = tfp.TreeRangeSumQuery.build_distributed_discrete_gaussian_query(
            l2_norm_bound, noise_multiplier * l2_norm_bound /
            math.sqrt(expected_clients_per_round), arity)
        # If the inner `DifferentiallyPrivateFactory` uses
        # `DistributedDiscreteGaussianQuery`, then the record is kept as `tf.int32`
        # before feeding to the DP factory.
        cast_to_float = False
    elif dp_mechanism == 'no-noise':
        inner_query = tfp.NoPrivacySumQuery()
        query = tfp.TreeRangeSumQuery(arity=arity, inner_query=inner_query)
        # If the inner `DifferentiallyPrivateFactory` uses `NoPrivacyQuery`, then
        # the record is kept as `tf.int32` before feeding to the DP factory.
        cast_to_float = False
    else:
        raise ValueError('Unexpected dp_mechanism.')
    nested_factory = differential_privacy.DifferentiallyPrivateFactory(
        query, nested_factory)

    # 3. Clip as specified by `clip_mechanism`.
    nested_factory = clipping_factory.HistogramClippingSumFactory(
        clip_mechanism=clip_mechanism,
        max_records_per_user=max_records_per_user,
        inner_agg_factory=nested_factory,
        cast_to_float=cast_to_float)

    return nested_factory
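To make the `bits` lower bound from the docstring concrete, a worked check under assumed default parameters (with noise_multiplier = 0 the first term vanishes, so the bound reduces to log2(expected_clients_per_round * max_records_per_user) + 1):

import math

expected_clients_per_round = 10
max_records_per_user = 10
noise_multiplier = 0.0
l2_norm_bound = 0.0  # Irrelevant when noise_multiplier == 0.
min_bits = math.log2(
    4 * math.sqrt(expected_clients_per_round) * noise_multiplier *
    l2_norm_bound + expected_clients_per_round * max_records_per_user) + 1
# log2(100) + 1 ~= 7.64, so any bits >= 8 (and certainly the default 22)
# avoids frequent modular clipping for these parameters.
assert min_bits < 8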
Code example #15
    def _build_aggregation_factory(self):
        central_stddev = self._value_noise_mult * self._initial_l2_clip
        local_stddev = central_stddev / math.sqrt(self._num_clients)

        # Ensure dim is at least 1 only for computing DDP parameters.
        self._client_dim = max(1, self._client_dim)
        if self._rotation_type == 'hd':
            # Hadamard transform requires dimension to be powers of 2.
            self._padded_dim = 2**math.ceil(math.log2(self._client_dim))
            rotation_factory = rotation.HadamardTransformFactory
        else:
            # DFT pads at most 1 zero.
            self._padded_dim = math.ceil(self._client_dim / 2.0) * 2
            rotation_factory = rotation.DiscreteFourierTransformFactory

        scale = _heuristic_scale_factor(local_stddev, self._initial_l2_clip,
                                        self._bits, self._num_clients,
                                        self._padded_dim,
                                        self._k_stddevs).numpy()

        # Very large scales could lead to overflows and are not as helpful for
        # utility. See comment above for more details.
        scale = min(scale, MAX_SCALE_FACTOR)

        if scale <= 1:
            warnings.warn(
                f'The selected scale_factor {scale} <= 1. This may lead to '
                f'substantial quantization errors. Consider increasing '
                f'the bit-width (currently {self._bits}) or decreasing the '
                f'expected number of clients per round (currently '
                f'{self._num_clients}).')

        # The procedure for obtaining inflated L2 bound assumes eager TF execution
        # and can be rewritten with NumPy if needed.
        inflated_l2 = discretization.inflated_l2_norm_bound(
            l2_norm_bound=self._initial_l2_clip,
            gamma=1.0 / scale,
            beta=self._beta,
            dim=self._padded_dim).numpy()

        # Add small leeway on norm bounds to gracefully allow numerical errors.
        # Specifically, the norm thresholds are computed directly from the specified
        # parameters in Python and will be checked right before noising; on the
        # other hand, the actual norm of the record (to be measured at noising time)
        # can possibly be (negligibly) higher due to the float32 arithmetic after
        # the conditional rounding (thus failing the check). While we have mitigated
        # this by sharing the computation for the inflated norm bound from
        # quantization, adding a leeway makes the execution more robust (it does not
        # need to abort should any precision issues happen) while not affecting the
        # correctness if privacy accounting is done based on the norm bounds at the
        # DPQuery/DPFactory (which incorporates the leeway).
        scaled_inflated_l2 = (inflated_l2 + 1e-5) * scale
        # Since values are scaled and rounded to integers, we have L1 <= L2^2
        # on top of the generic bound L1 <= sqrt(d) * L2.
        scaled_l1 = math.ceil(
            scaled_inflated_l2 *
            min(math.sqrt(self._padded_dim), scaled_inflated_l2))

        # Build the nested aggregation factory.
        # 1. Secure Aggregation. In particular, we have 4 modular clips from
        #    nesting two modular clip aggregators:
        #    #1. outer-client: clips to [-2^(b-1), 2^(b-1)]
        #        Bounds the client values (with limited effect as scaling was
        #        chosen such that `num_clients` is taken into account).
        #    #2. inner-client: clips to [0, 2^b]
        #        Similar to applying a two's complement to the values such that
        #        frequent values (post-rotation) are now near 0 (representing small
        #        positives) and 2^b (small negatives). 0 also always maps to 0, and
        #        we do not require another explicit value range shift from
        #        [-2^(b-1), 2^(b-1)] to [0, 2^b] to make sure that values are
        #        compatible with SecAgg's mod m = 2^b. This can be reverted at #4.
        #    #3. inner-server: clips to [0, 2^b]
        #        Ensures the aggregated value range does not grow by log_2(n).
        #        NOTE: If underlying SecAgg is implemented using the new
        #        `tff.federated_secure_modular_sum()` operator with the same
        #        modular clipping range, then this would correspond to a no-op.
        #    #4. outer-server: clips to [-2^(b-1), 2^(b-1)]
        #        Keeps aggregated values centered near 0 out of the logical SecAgg
        #        black box for outer aggregators.
        #    Note that the scaling factor and the bit-width are chosen such that
        #    the number of clients to aggregate is taken into account.
        nested_factory = secure.SecureSumFactory(
            upper_bound_threshold=2**self._bits - 1, lower_bound_threshold=0)
        nested_factory = modular_clipping.ModularClippingSumFactory(
            clip_range_lower=0,
            clip_range_upper=2**self._bits,
            inner_agg_factory=nested_factory)
        nested_factory = modular_clipping.ModularClippingSumFactory(
            clip_range_lower=-(2**(self._bits - 1)),
            clip_range_upper=2**(self._bits - 1),
            inner_agg_factory=nested_factory)

        # 2. DP operations. DP params are in the scaled domain (post-quantization).
        if self._mechanism == 'distributed_dgauss':
            dp_query = tfp.DistributedDiscreteGaussianSumQuery(
                l2_norm_bound=scaled_inflated_l2,
                local_stddev=local_stddev * scale)
        else:
            dp_query = tfp.DistributedSkellamSumQuery(
                l1_norm_bound=scaled_l1,
                l2_norm_bound=scaled_inflated_l2,
                local_stddev=local_stddev * scale)

        nested_factory = differential_privacy.DifferentiallyPrivateFactory(
            query=dp_query, record_aggregation_factory=nested_factory)

        # 3. Discretization operations. This appropriately quantizes the inputs.
        nested_factory = discretization.DiscretizationFactory(
            inner_agg_factory=nested_factory,
            scale_factor=scale,
            stochastic=True,
            beta=self._beta,
            prior_norm_bound=self._initial_l2_clip)

        # 4. L2 clip, possibly adaptively with a `tff.templates.EstimationProcess`.
        nested_factory = robust.clipping_factory(
            clipping_norm=self._l2_clip,
            inner_agg_factory=nested_factory,
            clipped_count_sum_factory=secure.SecureSumFactory(
                upper_bound_threshold=1, lower_bound_threshold=0))

        # 5. Flattening to improve quantization and reduce modular wrapping.
        nested_factory = rotation_factory(inner_agg_factory=nested_factory)

        # 6. Concat the input structure into a single vector.
        nested_factory = concat.concat_factory(
            inner_agg_factory=nested_factory)
        return nested_factory
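The `scaled_l1` computation above leans on a fact worth spelling out: after scaling and rounding, records are integer vectors, and an integer vector v with ||v||_2 <= L satisfies ||v||_1 <= L^2 (each nonzero entry has |v_i| >= 1, so |v_i| <= v_i^2), alongside the generic ||v||_1 <= sqrt(d) * ||v||_2. A minimal sketch of the resulting bound:

import math

# Mirrors the `scaled_l1` computation: take whichever of the two L1 bounds
# is tighter for the given (scaled, integer-valued) L2 bound and dimension.
def l1_bound(scaled_l2, dim):
    return math.ceil(scaled_l2 * min(math.sqrt(dim), scaled_l2))

assert l1_bound(4.0, 1024) == 16   # L2**2 wins for small norms.
assert l1_bound(100.0, 16) == 400  # sqrt(d) * L2 wins for small dims.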