def test_failure_with_invalid_context_type(self):
  def model_fn():
    return model_examples.LinearRegression(feature_dim=2)

  zero_model_weights = _create_zero_model_weights(model_fn)
  p13n_fn_dict = _create_p13n_fn_dict(learning_rate=1.0)

  with self.assertRaises(TypeError):
    # `tf.int32` is not a `tff.Type`.
    bad_context_tff_type = tf.int32
    federated_p13n_eval = p13n_eval.build_personalization_eval(
        model_fn,
        p13n_fn_dict,
        _evaluate_fn,
        context_tff_type=bad_context_tff_type)

  with self.assertRaises(TypeError):
    # `context_tff_type` is provided, but `context` is not provided.
    context_tff_type = computation_types.to_type(tf.int32)
    federated_p13n_eval = p13n_eval.build_personalization_eval(
        model_fn,
        p13n_fn_dict,
        _evaluate_fn,
        context_tff_type=context_tff_type)
    federated_p13n_eval(zero_model_weights, [
        _create_client_input(train_scale=1.0, test_scale=1.0, context=None),
        _create_client_input(train_scale=1.0, test_scale=2.0, context=None)
    ])
def test_construction_calls_model_fn(self):
  # Assert that building the process does not call `model_fn` too many
  # times. `model_fn` can potentially be expensive (loading weights,
  # processing, etc.).
  mock_model_fn = mock.Mock(side_effect=model_examples.LinearRegression)
  p13n_fn_dict = _create_p13n_fn_dict(learning_rate=1.0)
  p13n_eval.build_personalization_eval(
      mock_model_fn, p13n_fn_dict, _evaluate_fn, max_num_clients=1)
  # TODO(b/186451541): Reduce the number of calls to `model_fn`.
  self.assertEqual(mock_model_fn.call_count, 3)
def test_failure_with_invalid_baseline_eval_fn(self):
  def model_fn():
    return model_examples.LinearRegression(feature_dim=2)

  p13n_fn_dict = _create_p13n_fn_dict(learning_rate=1.0)

  with self.assertRaises(TypeError):
    # `baseline_evaluate_fn` should be a callable.
    bad_baseline_evaluate_fn = 6
    p13n_eval.build_personalization_eval(model_fn, p13n_fn_dict,
                                         bad_baseline_evaluate_fn)
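# For reference, a hedged sketch of what a valid `baseline_evaluate_fn` such
# as the `_evaluate_fn` helper used throughout these tests might look like.
# The tests only rely on it returning an `OrderedDict` with `num_examples`,
# `num_batches`, and `loss` entries and defaulting to a batch size of 1; the
# body below is an assumption written against the
# `tff.learning.Model.forward_pass` API, not the helper's actual code.
def _evaluate_fn_sketch(model, dataset, batch_size=1):
  """Evaluates `model` on an unbatched `dataset`, returning simple metrics."""
  num_examples = tf.constant(0, dtype=tf.int32)
  num_batches = tf.constant(0, dtype=tf.int32)
  loss_sum = tf.constant(0.0, dtype=tf.float32)
  for batch in dataset.batch(batch_size):
    output = model.forward_pass(batch, training=False)
    num_examples += output.num_examples
    num_batches += 1
    # Weight the per-batch loss by batch size to get an example-level average.
    loss_sum += output.loss * tf.cast(output.num_examples, tf.float32)
  return collections.OrderedDict(
      num_examples=num_examples,
      num_batches=num_batches,
      loss=loss_sum / tf.cast(num_examples, tf.float32))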
def test_failure_with_invalid_model_fn(self):
  p13n_fn_dict = _create_p13n_fn_dict(learning_rate=1.0)

  with self.assertRaises(TypeError):
    # `model_fn` should be a callable.
    bad_model_fn = 6
    p13n_eval.build_personalization_eval(bad_model_fn, p13n_fn_dict,
                                         _evaluate_fn)

  with self.assertRaises(TypeError):
    # `model_fn` should be a callable that returns a `tff.learning.Model`.
    bad_model_fn = lambda: 6
    p13n_eval.build_personalization_eval(bad_model_fn, p13n_fn_dict,
                                         _evaluate_fn)
def test_failure_with_invalid_p13n_fns(self):
  def model_fn():
    return model_examples.LinearRegression(feature_dim=2)

  with self.assertRaises(TypeError):
    # `personalize_fn_dict` should be an `OrderedDict`.
    bad_p13n_fn_dict = {'a': 6}
    p13n_eval.build_personalization_eval(model_fn, bad_p13n_fn_dict,
                                         _evaluate_fn)

  with self.assertRaises(TypeError):
    # `personalize_fn_dict` should be an `OrderedDict` that maps a `string`
    # to a `callable`.
    bad_p13n_fn_dict = collections.OrderedDict(a=6)
    p13n_eval.build_personalization_eval(model_fn, bad_p13n_fn_dict,
                                         _evaluate_fn)

  with self.assertRaises(TypeError):
    # `personalize_fn_dict` should be an `OrderedDict` that maps a `string`
    # to a `callable` that, when called, gives another `callable`.
    bad_p13n_fn_dict = collections.OrderedDict(x=lambda: 2)
    p13n_eval.build_personalization_eval(model_fn, bad_p13n_fn_dict,
                                         _evaluate_fn)

  with self.assertRaises(ValueError):
    # `personalize_fn_dict` should not use `baseline_metrics` as a key.
    bad_p13n_fn_dict = collections.OrderedDict(baseline_metrics=lambda: 2)
    p13n_eval.build_personalization_eval(model_fn, bad_p13n_fn_dict,
                                         _evaluate_fn)
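# To make the requirements above concrete, here is a minimal sketch of a
# `personalize_fn_dict` in the shape these tests expect from
# `_create_p13n_fn_dict`: an `OrderedDict` mapping a string name to a
# zero-argument callable that builds and returns a personalization function.
# The fine-tuning loop below is an assumption (the tests do not show it), and
# the `context` handling exercised by `test_success_with_valid_context` is
# deliberately omitted.
def _create_p13n_fn_dict_sketch(learning_rate):
  """Returns two strategies that differ only in the fine-tuning batch size."""

  def build_p13n_fn(batch_size, test_batch_size=3):

    def p13n_fn(model, train_data, test_data, context=None):
      del context  # Extra-example handling is omitted in this sketch.
      optimizer = tf.keras.optimizers.SGD(learning_rate)
      num_examples = 0
      # Fine-tune the model on the client's training data with plain SGD.
      for batch in train_data.batch(batch_size):
        with tf.GradientTape() as tape:
          output = model.forward_pass(batch)
        grads = tape.gradient(output.loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        num_examples += int(output.num_examples)
      # Evaluate the personalized model on the client's test data.
      return collections.OrderedDict(
          num_examples=num_examples,
          test_outputs=_evaluate_fn(model, test_data, test_batch_size))

    return p13n_fn

  return collections.OrderedDict(
      batch_size_1=lambda: build_p13n_fn(batch_size=1),
      batch_size_2=lambda: build_p13n_fn(batch_size=2))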
def test_success_with_valid_context(self):
  def model_fn():
    return model_examples.LinearRegression(feature_dim=2)

  zero_model_weights = _create_zero_model_weights(model_fn)
  p13n_fn_dict = _create_p13n_fn_dict(learning_rate=1.0)

  # Build the p13n eval with an extra `context` argument.
  context_tff_type = computation_types.to_type(tf.int32)
  federated_p13n_eval = p13n_eval.build_personalization_eval(
      model_fn, p13n_fn_dict, _evaluate_fn, context_tff_type=context_tff_type)

  # Perform p13n eval on two clients with different `context` values.
  results = federated_p13n_eval(zero_model_weights, [
      _create_client_input(train_scale=1.0, test_scale=1.0, context=2),
      _create_client_input(train_scale=1.0, test_scale=2.0, context=5)
  ])
  bs1_metrics = results['batch_size_1']
  bs2_metrics = results['batch_size_2']

  # The number of training examples is `3 + context` for both clients.
  # Note: the order is not preserved due to `federated_sample`, but the order
  # should be consistent across different personalization strategies.
  self.assertAllEqual(sorted(bs1_metrics['num_examples']), [5, 8])
  self.assertAllEqual(bs1_metrics['num_examples'],
                      bs2_metrics['num_examples'])
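# A quick sanity check of the expected counts asserted above: each client's
# training set is assumed to hold 3 base examples, and the integer `context`
# is assumed to contribute that many additional examples.
base_examples = 3
assert sorted(base_examples + c for c in (2, 5)) == [5, 8]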
def test_success_with_directly_constructed_model(self):
  def model_fn():
    return model_examples.LinearRegression(feature_dim=2)

  zero_model_weights = _create_zero_model_weights(model_fn)
  p13n_fn_dict = _create_p13n_fn_dict(learning_rate=1.0)
  federated_p13n_eval = p13n_eval.build_personalization_eval(
      model_fn, p13n_fn_dict, _evaluate_fn)

  # Perform p13n eval on two clients: their train data are equivalent, but
  # their test data have different scales.
  results = federated_p13n_eval(zero_model_weights, [
      _create_client_input(train_scale=1.0, test_scale=1.0),
      _create_client_input(train_scale=1.0, test_scale=2.0)
  ])

  # Check that the baseline metrics are correct.
  baseline_metrics = results['baseline_metrics']
  # The number of test examples is 3 for both clients.
  self.assertAllEqual(baseline_metrics['num_examples'], [3, 3])
  # The number of test batches is 3 for both clients, because the function
  # that evaluates the baseline metrics, `_evaluate_fn`, uses a default batch
  # size of 1.
  self.assertAllEqual(sorted(baseline_metrics['num_batches']), [3, 3])
  # The initial weights are all zeros. The average loss can be computed as:
  # Client 1: 0.5 * (1 + 1 + 1) / 3 = 0.5; Client 2: 0.5 * (4 + 4 + 4) / 3 =
  # 2.0. Note: the order is not preserved due to `federated_sample`.
  self.assertAllEqual(sorted(baseline_metrics['loss']), [0.5, 2.0])
  if baseline_metrics['loss'][0] == 0.5:
    client_1_idx, client_2_idx = 0, 1
  else:
    client_1_idx, client_2_idx = 1, 0

  # Check that the metrics of `batch_size_1` are correct.
  bs1_metrics = results['batch_size_1']
  # The number of training examples is 3 for both clients.
  self.assertAllEqual(bs1_metrics['num_examples'], [3, 3])
  bs1_test_outputs = bs1_metrics['test_outputs']
  # The number of test examples is also 3 for both clients.
  self.assertAllEqual(bs1_test_outputs['num_examples'], [3, 3])
  # The number of test batches is 1 for both clients, since the test batch
  # size is 3.
  self.assertAllEqual(bs1_test_outputs['num_batches'], [1, 1])
  # Both clients' weights become [-3, -3, -1] after training, which gives an
  # average loss of 24 for Client 1 and 88.5 for Client 2.
  self.assertAlmostEqual(bs1_test_outputs['loss'][client_1_idx], 24.0)
  self.assertAlmostEqual(bs1_test_outputs['loss'][client_2_idx], 88.5)

  # Check that the metrics of `batch_size_2` are correct.
  bs2_metrics = results['batch_size_2']
  # The number of training examples is 3 for both clients.
  self.assertAllEqual(bs2_metrics['num_examples'], [3, 3])
  bs2_test_outputs = bs2_metrics['test_outputs']
  # The number of test examples is also 3 for both clients.
  self.assertAllEqual(bs2_test_outputs['num_examples'], [3, 3])
  # The number of test batches is 1 for both clients, since the test batch
  # size is 3.
  self.assertAllEqual(bs2_test_outputs['num_batches'], [1, 1])
  # Both clients' weights become [0, 0, 1] after training, which gives an
  # average loss of 0 for Client 1 and 0.5 for Client 2.
  self.assertAlmostEqual(bs2_test_outputs['loss'][client_1_idx], 0.0)
  self.assertAlmostEqual(bs2_test_outputs['loss'][client_2_idx], 0.5)
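# The "initial weights are all zeros" premise above comes from the
# `_create_zero_model_weights` helper. A plausible sketch, assuming the
# helper builds the model once and zeroes out its weights, and that
# `model_utils` (from TFF's learning framework) is imported:
def _create_zero_model_weights_sketch(model_fn):
  whimsy_model = model_fn()
  model_weights = model_utils.ModelWeights.from_model(whimsy_model)
  return tf.nest.map_structure(tf.zeros_like, model_weights)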
def test_failure_with_invalid_sample_size(self):
  def model_fn():
    return model_examples.LinearRegression(feature_dim=2)

  p13n_fn_dict = _create_p13n_fn_dict(learning_rate=1.0)

  with self.assertRaises(TypeError):
    # `max_num_clients` should be an `int`.
    bad_num_clients = 1.0
    p13n_eval.build_personalization_eval(
        model_fn, p13n_fn_dict, _evaluate_fn, max_num_clients=bad_num_clients)

  with self.assertRaises(ValueError):
    # `max_num_clients` should be a positive `int`.
    bad_num_clients = 0
    p13n_eval.build_personalization_eval(
        model_fn, p13n_fn_dict, _evaluate_fn, max_num_clients=bad_num_clients)
def test_success_with_model_constructed_from_keras(self):
  def model_fn():
    inputs = tf.keras.Input(shape=(2,))  # feature dim = 2
    outputs = tf.keras.layers.Dense(1)(inputs)
    keras_model = tf.keras.Model(inputs=inputs, outputs=outputs)
    input_spec = collections.OrderedDict([
        ('x', tf.TensorSpec([None, 2], dtype=tf.float32)),
        ('y', tf.TensorSpec([None, 1], dtype=tf.float32))
    ])
    return keras_utils.from_keras_model(
        keras_model,
        input_spec=input_spec,
        loss=tf.keras.losses.MeanSquaredError())

  zero_model_weights = _create_zero_model_weights(model_fn)
  p13n_fn_dict = _create_p13n_fn_dict(learning_rate=0.5)
  federated_p13n_eval = p13n_eval.build_personalization_eval(
      model_fn, p13n_fn_dict, _evaluate_fn)

  # Perform p13n eval on two clients: their train data are equivalent, but
  # their test data have different scales.
  results = federated_p13n_eval(zero_model_weights, [
      _create_client_input(train_scale=1.0, test_scale=1.0),
      _create_client_input(train_scale=1.0, test_scale=2.0)
  ])
  results = results._asdict(recursive=True)

  # Check that the baseline metrics are correct.
  baseline_metrics = results['baseline_metrics']
  # The initial weights are all zeros. The MeanSquaredError (MSE) is:
  # Client 1: (1 + 1 + 1) / 3 = 1.0; Client 2: (4 + 4 + 4) / 3 = 4.0.
  # Note: the order is not preserved due to `federated_sample`.
  self.assertAllEqual(sorted(baseline_metrics['loss']), [1.0, 4.0])

  # Check that the metrics of `batch_size_1` are correct.
  bs1_metrics = results['batch_size_1']
  # The number of training examples is 3 for both clients.
  self.assertAllEqual(bs1_metrics['num_examples'], [3, 3])
  bs1_test_outputs = bs1_metrics['test_outputs']
  # Both clients' weights become [-3, -3, -1] after training, which gives an
  # MSE of 48 for Client 1 and 177 for Client 2.
  self.assertAllClose(sorted(bs1_test_outputs['loss']), [48.0, 177.0])

  # Check that the metrics of `batch_size_2` are correct.
  bs2_metrics = results['batch_size_2']
  # The number of training examples is 3 for both clients.
  self.assertAllEqual(bs2_metrics['num_examples'], [3, 3])
  bs2_test_outputs = bs2_metrics['test_outputs']
  # Both clients' weights become [0, 0, 1] after training, which gives an MSE
  # of 0 for Client 1 and 1.0 for Client 2.
  self.assertAllClose(sorted(bs2_test_outputs['loss']), [0.0, 1.0])
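# A quick verification of the zero-weight MSE values asserted above: with
# all-zero weights the model predicts 0 everywhere, and each client's 3 test
# targets are assumed to equal its `test_scale`, so the per-client MSE is
# simply `test_scale ** 2`.
for scale, expected_mse in [(1.0, 1.0), (2.0, 4.0)]:
  squared_errors = [(0.0 - scale) ** 2] * 3
  assert sum(squared_errors) / 3 == expected_mse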
def test_failure_with_batched_datasets(self):
  def model_fn():
    return model_examples.LinearRegression(feature_dim=2)

  zero_model_weights = _create_zero_model_weights(model_fn)
  p13n_fn_dict = _create_p13n_fn_dict(learning_rate=1.0)
  federated_p13n_eval = p13n_eval.build_personalization_eval(
      model_fn, p13n_fn_dict, _evaluate_fn)

  with self.assertRaises(TypeError):
    # The client input should not contain batched datasets.
    bad_client_input = collections.OrderedDict(
        train_data=_create_dataset(scale=1.0).batch(1),
        test_data=_create_dataset(scale=1.0).batch(1))
    federated_p13n_eval(zero_model_weights, [bad_client_input])
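# For contrast with the failure case above, a well-formed client input keeps
# both datasets unbatched. A plausible sketch of the `_create_client_input`
# helper used throughout these tests; the optional `context` field mirrors
# the `context_tff_type` tests, and its exact key name is an assumption:
def _create_client_input_sketch(train_scale, test_scale, context=None):
  client_input = collections.OrderedDict(
      train_data=_create_dataset(scale=train_scale),
      test_data=_create_dataset(scale=test_scale))
  if context is not None:
    client_input['context'] = context
  return client_input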
def test_success_with_small_sample_size(self):
  def model_fn():
    return model_examples.LinearRegression(feature_dim=2)

  zero_model_weights = _create_zero_model_weights(model_fn)
  p13n_fn_dict = _create_p13n_fn_dict(learning_rate=1.0)
  federated_p13n_eval = p13n_eval.build_personalization_eval(
      model_fn, p13n_fn_dict, _evaluate_fn, max_num_clients=1)

  # Perform p13n eval on two clients.
  results = federated_p13n_eval(zero_model_weights, [
      _create_client_input(train_scale=1.0, test_scale=1.0),
      _create_client_input(train_scale=1.0, test_scale=2.0)
  ])

  # The results should contain metrics from only one client.
  self.assertAllEqual(len(results['baseline_metrics']['loss']), 1)
  self.assertAllEqual(len(results['batch_size_1']['test_outputs']['loss']), 1)
  self.assertAllEqual(len(results['batch_size_2']['test_outputs']['loss']), 1)