def testInvalidStructureRaisesError(self):
  """Mismatched or non-Dataset nested structures should raise ValueError."""
  # Content of the dataset is irrelevant here; only the structure matters.
  base = tf.data.Dataset.from_tensor_slices(
      (self.original_x, self.original_y)).batch(5)
  valid_struct = {"a": base, "b": base}
  invalid_struct = {"a": base, "b": [base]}

  # A well-formed nested structure builds without complaint.
  _ = input_utils.build_min_diff_dataset(
      sensitive_group_dataset=valid_struct,
      nonsensitive_group_dataset=valid_struct)

  # Bad structure on the sensitive side.
  with self.assertRaisesRegex(
      ValueError, "sensitive_group_dataset.*unnested"
      ".*only elements of type.*Dataset.*Given"):
    _ = input_utils.build_min_diff_dataset(invalid_struct, valid_struct)

  # Bad structure on the nonsensitive side.
  with self.assertRaisesRegex(
      ValueError, "nonsensitive_group_dataset.*unnested.*only elements of "
      "type.*Dataset.*Given"):
    _ = input_utils.build_min_diff_dataset(valid_struct, invalid_struct)

  # Key sets that differ between the two arguments.
  mismatched_struct = {"a": base, "c": base}
  with self.assertRaisesRegex(
      ValueError, "sensitive_group_dataset.*"
      "nonsensitive_group_dataset.*do "
      "not have the same structure(.|\n)*don't have the same set of keys"):
    _ = input_utils.build_min_diff_dataset(valid_struct, mismatched_struct)
def testBuildFromSingleDatasets(self):
  """Min diff batches from two plain (unnested) datasets are formed correctly.

  Verifies x, membership, and weight components of each min_diff batch.
  """
  sensitive_batch_size = 3
  sensitive_dataset = tf.data.Dataset.from_tensor_slices(
      (self.sensitive_x, None, self.sensitive_w)).batch(sensitive_batch_size)

  nonsensitive_batch_size = 1
  nonsensitive_dataset = tf.data.Dataset.from_tensor_slices(
      (self.nonsensitive_x, None,
       self.nonsensitive_w)).batch(nonsensitive_batch_size)

  dataset = input_utils.build_min_diff_dataset(sensitive_dataset,
                                               nonsensitive_dataset)

  # Fix: the built dataset repeats infinitely (see sibling tests), so the
  # iteration must be bounded or this test never terminates. Take the first
  # 20 batches, which corresponds to 2 full epochs of the nonsensitive
  # dataset.
  for batch_ind, min_diff_batch in enumerate(dataset.take(20)):
    # Assert min_diff batch properly formed.
    min_diff_x, min_diff_membership, min_diff_w = min_diff_batch
    self.assertAllClose(
        min_diff_x,
        _get_min_diff_batch(self.sensitive_x, self.nonsensitive_x,
                            sensitive_batch_size, nonsensitive_batch_size,
                            batch_ind))
    self.assertAllClose(
        min_diff_membership,
        _get_min_diff_membership_batch(sensitive_batch_size,
                                       nonsensitive_batch_size))
    self.assertAllClose(
        min_diff_w,
        _get_min_diff_batch(self.sensitive_w, self.nonsensitive_w,
                            sensitive_batch_size, nonsensitive_batch_size,
                            batch_ind))
def testBuildFromSingleDatasets(self):
  """Min diff batches from two plain datasets have the expected contents."""
  sensitive_batch_size = 3
  sensitive_dataset = tf.data.Dataset.from_tensor_slices(
      (self.sensitive_x, None, self.sensitive_w)).batch(sensitive_batch_size)

  nonsensitive_batch_size = 1
  nonsensitive_dataset = tf.data.Dataset.from_tensor_slices(
      (self.nonsensitive_x, None,
       self.nonsensitive_w)).batch(nonsensitive_batch_size)

  dataset = input_utils.build_min_diff_dataset(sensitive_dataset,
                                               nonsensitive_dataset)

  # The resulting dataset repeats infinitely, so only the first 20 batches
  # (2 full epochs of the nonsensitive dataset) are checked.
  for ind, batch in enumerate(dataset.take(20)):
    # Each min_diff batch unpacks into (x, membership, weights).
    x, membership, weights = batch
    expected_x = _get_min_diff_batch(self.sensitive_x, self.nonsensitive_x,
                                     sensitive_batch_size,
                                     nonsensitive_batch_size, ind)
    self.assertAllClose(x, expected_x)

    expected_membership = _get_min_diff_membership_batch(
        sensitive_batch_size, nonsensitive_batch_size)
    self.assertAllClose(membership, expected_membership)

    expected_w = _get_min_diff_batch(self.sensitive_w, self.nonsensitive_w,
                                     sensitive_batch_size,
                                     nonsensitive_batch_size, ind)
    self.assertAllClose(weights, expected_w)
def testDifferentWeightsShapeRaisesError(self):
  """Weights with mismatched ranks across the two datasets must fail."""
  sensitive_batch_size = 3
  # Give the sensitive weights an extra axis so the ranks disagree.
  reshaped_w = self.sensitive_w[:, tf.newaxis, :]
  sensitive_dataset = tf.data.Dataset.from_tensor_slices(
      (self.sensitive_x, None, reshaped_w)).batch(sensitive_batch_size)

  nonsensitive_batch_size = 2
  nonsensitive_dataset = tf.data.Dataset.from_tensor_slices(
      (self.nonsensitive_x, None,
       self.nonsensitive_w)).batch(nonsensitive_batch_size)

  with self.assertRaisesRegex(ValueError, "must be rank.*but is rank"):
    _ = input_utils.build_min_diff_dataset(sensitive_dataset,
                                           nonsensitive_dataset)
def testBuildFromDictsOfDatasets(self):
  """Min diff batches are built per key from dicts of datasets."""

  def _make_datasets(x, w, batch_sizes):
    # One dataset per key, batched with the corresponding size.
    return {
        key: tf.data.Dataset.from_tensor_slices((x, None, w)).batch(size)
        for key, size in zip(["k1", "k2"], batch_sizes)
    }

  sensitive_batch_sizes = [3, 5]
  sensitive_dataset = _make_datasets(self.sensitive_x, self.sensitive_w,
                                     sensitive_batch_sizes)

  nonsensitive_batch_sizes = [1, 2]
  nonsensitive_dataset = _make_datasets(self.nonsensitive_x,
                                        self.nonsensitive_w,
                                        nonsensitive_batch_sizes)

  dataset = input_utils.build_min_diff_dataset(sensitive_dataset,
                                               nonsensitive_dataset)

  # The resulting dataset repeats infinitely, so only the first 20 batches
  # (at least 2 full epochs of the nonsensitive dataset) are checked.
  for ind, batches in enumerate(dataset.take(20)):
    keys = sorted(batches.keys())
    # The per-key structure must be preserved.
    self.assertAllEqual(keys, ["k1", "k2"])
    for key, sens_size, nonsens_size in zip(keys, sensitive_batch_sizes,
                                            nonsensitive_batch_sizes):
      # Each per-key min_diff batch must be properly formed.
      x, membership, weights = batches[key]
      self.assertAllClose(
          x,
          _get_min_diff_batch(self.sensitive_x, self.nonsensitive_x,
                              sens_size, nonsens_size, ind))
      self.assertAllClose(
          membership,
          _get_min_diff_membership_batch(sens_size, nonsens_size))
      self.assertAllClose(
          weights,
          _get_min_diff_batch(self.sensitive_w, self.nonsensitive_w,
                              sens_size, nonsens_size, ind))
def testBuildFromDictsOfDatasets(self):
  """Min diff batches are built per key from dicts of datasets.

  Verifies the key structure and the x/membership/weight components of each
  per-key min_diff batch.
  """
  sensitive_batch_sizes = [3, 5]
  sensitive_dataset = {
      key: tf.data.Dataset.from_tensor_slices(
          (self.sensitive_x, None, self.sensitive_w)).batch(batch_size)
      for key, batch_size in zip(["k1", "k2"], sensitive_batch_sizes)
  }

  nonsensitive_batch_sizes = [1, 2]
  nonsensitive_dataset = {
      key: tf.data.Dataset.from_tensor_slices(
          (self.nonsensitive_x, None, self.nonsensitive_w)).batch(batch_size)
      for key, batch_size in zip(["k1", "k2"], nonsensitive_batch_sizes)
  }

  dataset = input_utils.build_min_diff_dataset(sensitive_dataset,
                                               nonsensitive_dataset)

  # Fix: the built dataset repeats infinitely (see sibling tests), so the
  # iteration must be bounded or this test never terminates. Take the first
  # 20 batches, which corresponds to at least 2 full epochs of the
  # nonsensitive dataset.
  for batch_ind, min_diff_batches in enumerate(dataset.take(20)):
    min_diff_keys = sorted(min_diff_batches.keys())
    # Assert min_diff_batches has the right structure (i.e. set of keys).
    self.assertAllEqual(min_diff_keys, ["k1", "k2"])
    min_diff_batches = [min_diff_batches[key] for key in min_diff_keys]
    for sensitive_batch_size, nonsensitive_batch_size, min_diff_batch in zip(
        sensitive_batch_sizes, nonsensitive_batch_sizes, min_diff_batches):
      # Assert min_diff batch properly formed.
      min_diff_x, min_diff_membership, min_diff_w = min_diff_batch
      self.assertAllClose(
          min_diff_x,
          _get_min_diff_batch(self.sensitive_x, self.nonsensitive_x,
                              sensitive_batch_size, nonsensitive_batch_size,
                              batch_ind))
      self.assertAllClose(
          min_diff_membership,
          _get_min_diff_membership_batch(sensitive_batch_size,
                                         nonsensitive_batch_size))
      self.assertAllClose(
          min_diff_w,
          _get_min_diff_batch(self.sensitive_w, self.nonsensitive_w,
                              sensitive_batch_size, nonsensitive_batch_size,
                              batch_ind))
def testWithBothMinDiffWeightsNone(self):
  """When neither side supplies weights, the min_diff weights stay None."""
  sensitive_batch_size = 3
  sensitive_dataset = tf.data.Dataset.from_tensor_slices(
      (self.sensitive_x, None, None)).batch(sensitive_batch_size)

  nonsensitive_batch_size = 2
  nonsensitive_dataset = tf.data.Dataset.from_tensor_slices(
      (self.nonsensitive_x, None, None)).batch(nonsensitive_batch_size)

  dataset = input_utils.build_min_diff_dataset(sensitive_dataset,
                                               nonsensitive_dataset)

  # The resulting dataset repeats infinitely; 10 batches corresponds to 2
  # full epochs of the nonsensitive dataset.
  for batch in dataset.take(10):
    # Only the weight component matters for this test.
    _, _, weights = tf.keras.utils.unpack_x_y_sample_weight(batch)
    self.assertIsNone(weights)
def testWithOnlySensitiveWeightsNone(self):
  """Missing sensitive weights are filled with 1.0 when the nonsensitive
  dataset supplies weights."""
  sensitive_batch_size = 3
  sensitive_dataset = tf.data.Dataset.from_tensor_slices(
      (self.sensitive_x, None, None)).batch(sensitive_batch_size)

  nonsensitive_batch_size = 2
  nonsensitive_dataset = tf.data.Dataset.from_tensor_slices(
      (self.nonsensitive_x, None,
       self.nonsensitive_w)).batch(nonsensitive_batch_size)

  dataset = input_utils.build_min_diff_dataset(sensitive_dataset,
                                               nonsensitive_dataset)

  # Fix: the built dataset repeats infinitely (see sibling tests), so the
  # iteration must be bounded or this test never terminates. Take the first
  # 10 batches (2 full epochs of the nonsensitive dataset), matching the
  # bound used by the other weight tests with the same batch sizes.
  for batch_ind, min_diff_batch in enumerate(dataset.take(10)):
    # Skip all min_diff_data assertions except for weight.
    _, _, min_diff_w = tf.keras.utils.unpack_x_y_sample_weight(
        min_diff_batch)
    self.assertAllClose(
        min_diff_w,
        _get_min_diff_batch(tf.fill([sensitive_batch_size, 1], 1.0),
                            self.nonsensitive_w, sensitive_batch_size,
                            nonsensitive_batch_size, batch_ind))
def testWithVariableSizeSparseTensors(self):
  """Sparse features may have different dense shapes on the two sides."""
  sensitive_batch_size = 3
  sensitive_dataset = tf.data.Dataset.from_tensor_slices(
      (self.sensitive_x, self.sensitive_w, None)).batch(sensitive_batch_size)

  nonsensitive_batch_size = 2
  # Shallow-copy the fixture inputs so they are left untouched, then widen
  # f2_sparse so its dense shape differs from the sensitive side.
  modified_x = copy.copy(self.nonsensitive_x)
  modified_x["f2_sparse"] = tf.sparse.reset_shape(
      modified_x["f2_sparse"], [10, 5])
  nonsensitive_dataset = tf.data.Dataset.from_tensor_slices(
      (modified_x, None, self.nonsensitive_w)).batch(nonsensitive_batch_size)

  dataset = input_utils.build_min_diff_dataset(sensitive_dataset,
                                               nonsensitive_dataset)

  for batch in dataset.take(10):
    x, _, _ = tf.keras.utils.unpack_x_y_sample_weight(batch)
    # The merged f2_sparse should carry the second dense dimension (5)
    # that was set above on the nonsensitive side.
    self.assertEqual(x["f2_sparse"].dense_shape[1], 5)
def testWithOnlyNonsensitiveWeightsNone(self):
  """Missing nonsensitive weights are filled with 1.0 when the sensitive
  dataset supplies weights."""
  sensitive_batch_size = 3
  sensitive_dataset = tf.data.Dataset.from_tensor_slices(
      (self.sensitive_x, None, self.sensitive_w)).batch(sensitive_batch_size)

  nonsensitive_batch_size = 2
  nonsensitive_dataset = tf.data.Dataset.from_tensor_slices(
      (self.nonsensitive_x, None, None)).batch(nonsensitive_batch_size)

  dataset = input_utils.build_min_diff_dataset(sensitive_dataset,
                                               nonsensitive_dataset)

  # Constant fill-in weights expected for the nonsensitive examples.
  ones = tf.fill([nonsensitive_batch_size, 1], 1.0)

  # The resulting dataset repeats infinitely; 10 batches corresponds to 2
  # full epochs of the nonsensitive dataset.
  for ind, batch in enumerate(dataset.take(10)):
    # Only the weight component is of interest here.
    _, _, weights = tf.keras.utils.unpack_x_y_sample_weight(batch)
    expected = _get_min_diff_batch(self.sensitive_w, ones,
                                   sensitive_batch_size,
                                   nonsensitive_batch_size, ind)
    self.assertAllClose(weights, expected)
def testPackDictsOfDatasets(self):
  """Packing keeps the original batches and the per-key min_diff data."""
  original_batch_size = 5
  original_dataset = tf.data.Dataset.from_tensor_slices(
      (self.original_x, self.original_y,
       self.original_w)).batch(original_batch_size)

  sensitive_batch_sizes = [3, 5]
  sensitive_dataset = {
      key: tf.data.Dataset.from_tensor_slices(
          (self.sensitive_x, None, self.sensitive_w)).batch(size)
      for key, size in zip(["k1", "k2"], sensitive_batch_sizes)
  }

  nonsensitive_batch_sizes = [1, 2]
  nonsensitive_dataset = {
      key: tf.data.Dataset.from_tensor_slices(
          (self.nonsensitive_x, None, self.nonsensitive_w)).batch(size)
      for key, size in zip(["k1", "k2"], nonsensitive_batch_sizes)
  }

  dataset = input_utils.pack_min_diff_data(
      original_dataset,
      min_diff_dataset=input_utils.build_min_diff_dataset(
          sensitive_dataset, nonsensitive_dataset))

  for ind, (packed, y, w) in enumerate(dataset):
    self.assertIsInstance(packed, input_utils.MinDiffPackedInputs)

    # The original batch must pass through unchanged.
    self.assertAllClose(packed.original_inputs,
                        _get_batch(self.original_x, original_batch_size, ind))
    self.assertAllClose(
        y, _get_batch(self.original_y, original_batch_size, ind))
    self.assertAllClose(
        w, _get_batch(self.original_w, original_batch_size, ind))

    keys = sorted(packed.min_diff_data.keys())
    # The per-key structure must be preserved.
    self.assertAllEqual(keys, ["k1", "k2"])
    for key, sens_size, nonsens_size in zip(keys, sensitive_batch_sizes,
                                            nonsensitive_batch_sizes):
      # Each per-key min_diff batch must be properly formed.
      min_diff_x, min_diff_membership, min_diff_w = packed.min_diff_data[key]
      self.assertAllClose(
          min_diff_x,
          _get_min_diff_batch(self.sensitive_x, self.nonsensitive_x,
                              sens_size, nonsens_size, ind))
      self.assertAllClose(
          min_diff_membership,
          _get_min_diff_membership_batch(sens_size, nonsens_size))
      self.assertAllClose(
          min_diff_w,
          _get_min_diff_batch(self.sensitive_w, self.nonsensitive_w,
                              sens_size, nonsens_size, ind))