def test_network_creation(self): hidden_size = 32 sequence_length = 21 num_hidden_instances = 3 embedding_cfg = { "vocab_size": 100, "type_vocab_size": 16, "hidden_size": hidden_size, "seq_length": sequence_length, "max_seq_length": sequence_length, "initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), "dropout_rate": 0.1, } call_list = [] hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), "call_list": call_list } # Create a small EncoderScaffold for testing. test_network = encoder_scaffold.EncoderScaffold( num_hidden_instances=num_hidden_instances, num_output_classes=hidden_size, classification_layer_initializer=tf.keras.initializers. TruncatedNormal(stddev=0.02), hidden_cls=ValidatedTransformerLayer, hidden_cfg=hidden_cfg, embedding_cfg=embedding_cfg) # Create the inputs (note that the first dimension is implicit). word_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) type_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) data, pooled = test_network([word_ids, mask, type_ids]) expected_data_shape = [None, sequence_length, hidden_size] expected_pooled_shape = [None, hidden_size] self.assertAllEqual(expected_data_shape, data.shape.as_list()) self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list()) # The default output dtype is float32. self.assertAllEqual(tf.float32, data.dtype) self.assertAllEqual(tf.float32, pooled.dtype) # If call_list[0] exists and is True, the passed layer class was # instantiated from the given config properly. self.assertNotEmpty(call_list) self.assertTrue(call_list[0], "The passed layer class wasn't instantiated.")
def _get_network(self, vocab_size): sequence_length = 512 hidden_size = 50 embedding_cfg = { 'vocab_size': vocab_size, 'type_vocab_size': 1, 'hidden_size': hidden_size, 'embedding_width': hidden_size, 'max_seq_length': sequence_length, 'initializer': tf.keras.initializers.TruncatedNormal(stddev=0.02), 'dropout_rate': 0.1, } embedding_inst = packed_sequence_embedding.PackedSequenceEmbedding( **embedding_cfg) hidden_cfg = { 'num_attention_heads': 2, 'intermediate_size': 3072, 'intermediate_activation': activations.gelu, 'dropout_rate': 0.1, 'attention_dropout_rate': 0.1, 'kernel_initializer': tf.keras.initializers.TruncatedNormal(stddev=0.02), } return encoder_scaffold.EncoderScaffold( num_hidden_instances=2, pooled_output_dim=hidden_size, embedding_cfg=embedding_cfg, embedding_cls=embedding_inst, hidden_cfg=hidden_cfg, dict_outputs=True)
def test_network_creation_with_float16_dtype(self): tf.keras.mixed_precision.experimental.set_policy("mixed_float16") hidden_size = 32 sequence_length = 21 embedding_cfg = { "vocab_size": 100, "type_vocab_size": 16, "hidden_size": hidden_size, "seq_length": sequence_length, "max_seq_length": sequence_length, "initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), "dropout_rate": 0.1, "dtype": "float16", } hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), "dtype": "float16", } # Create a small EncoderScaffold for testing. test_network = encoder_scaffold.EncoderScaffold( num_hidden_instances=3, num_output_classes=hidden_size, classification_layer_initializer=tf.keras.initializers. TruncatedNormal(stddev=0.02), classification_layer_dtype=tf.float16, hidden_cfg=hidden_cfg, embedding_cfg=embedding_cfg) # Create the inputs (note that the first dimension is implicit). word_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) type_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) data, pooled = test_network([word_ids, mask, type_ids]) expected_data_shape = [None, sequence_length, hidden_size] expected_pooled_shape = [None, hidden_size] self.assertAllEqual(expected_data_shape, data.shape.as_list()) self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list()) # If float_dtype is set to float16, the output should always be float16. self.assertAllEqual(tf.float16, data.dtype) self.assertAllEqual(tf.float16, pooled.dtype)
def test_network_invocation(self): hidden_size = 32 sequence_length = 21 vocab_size = 57 # Build an embedding network to swap in for the default network. This one # will have 2 inputs (mask and word_ids) instead of 3, and won't use # positional embeddings. network = Embeddings(vocab_size, hidden_size) hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), } # Create a small EncoderScaffold for testing. test_network = encoder_scaffold.EncoderScaffold( num_hidden_instances=3, pooled_output_dim=hidden_size, pooler_layer_initializer=tf.keras.initializers.TruncatedNormal( stddev=0.02), hidden_cfg=hidden_cfg, embedding_cls=network) # Create the inputs (note that the first dimension is implicit). word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) data, pooled = test_network([word_ids, mask]) # Create a model based off of this network: model = tf.keras.Model([word_ids, mask], [data, pooled]) # Invoke the model. We can't validate the output data here (the model is too # complex) but this will catch structural runtime errors. batch_size = 3 word_id_data = np.random.randint( vocab_size, size=(batch_size, sequence_length)) mask_data = np.random.randint(2, size=(batch_size, sequence_length)) _ = model.predict([word_id_data, mask_data])
def test_serialize_deserialize(self): # Create a network object that sets all of its config options. hidden_size = 32 sequence_length = 21 embedding_cfg = { "vocab_size": 100, "type_vocab_size": 16, "hidden_size": hidden_size, "seq_length": sequence_length, "max_seq_length": sequence_length, "initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), "dropout_rate": 0.1, } hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), "dtype": "float32", } # Create a small EncoderScaffold for testing. network = encoder_scaffold.EncoderScaffold( num_hidden_instances=3, num_output_classes=hidden_size, classification_layer_initializer=tf.keras.initializers. TruncatedNormal(stddev=0.02), hidden_cfg=hidden_cfg, embedding_cfg=embedding_cfg) # Create another network object from the first object's config. new_network = encoder_scaffold.EncoderScaffold.from_config( network.get_config()) # Validate that the config can be forced to JSON. _ = new_network.to_json() # If the serialization was successful, the new config should match the old. self.assertAllEqual(network.get_config(), new_network.get_config())
def test_serialize_deserialize(self, use_hidden_cls_instance): hidden_size = 32 sequence_length = 21 vocab_size = 57 num_types = 7 embedding_cfg = { "vocab_size": vocab_size, "type_vocab_size": num_types, "hidden_size": hidden_size, "seq_length": sequence_length, "max_seq_length": sequence_length, "initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), "dropout_rate": 0.1, } call_list = [] hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), "call_list": call_list, "call_class": TestLayer } mask_call_list = [] mask_cfg = {"call_list": mask_call_list, "call_class": TestLayer} # Create a small EncoderScaffold for testing. This time, we pass an already- # instantiated layer object. kwargs = dict( num_hidden_instances=3, pooled_output_dim=hidden_size, pooler_layer_initializer=tf.keras.initializers.TruncatedNormal( stddev=0.02), embedding_cfg=embedding_cfg) if use_hidden_cls_instance: xformer = ValidatedTransformerLayer(**hidden_cfg) xmask = ValidatedMaskLayer(**mask_cfg) test_network = encoder_scaffold.EncoderScaffold( hidden_cls=xformer, mask_cls=xmask, **kwargs) else: test_network = encoder_scaffold.EncoderScaffold( hidden_cls=ValidatedTransformerLayer, hidden_cfg=hidden_cfg, mask_cls=ValidatedMaskLayer, mask_cfg=mask_cfg, **kwargs) # Create another network object from the first object's config. new_network = encoder_scaffold.EncoderScaffold.from_config( test_network.get_config()) # Validate that the config can be forced to JSON. _ = new_network.to_json() # If the serialization was successful, the new config should match the old. self.assertAllEqual(test_network.get_config(), new_network.get_config()) # Create a model based off of the old and new networks: word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) data, pooled = new_network([word_ids, mask, type_ids]) new_model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled]) data, pooled = test_network([word_ids, mask, type_ids]) model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled]) # Copy the weights between models. new_model.set_weights(model.get_weights()) # Invoke the models. batch_size = 3 word_id_data = np.random.randint( vocab_size, size=(batch_size, sequence_length)) mask_data = np.random.randint(2, size=(batch_size, sequence_length)) type_id_data = np.random.randint( num_types, size=(batch_size, sequence_length)) data, cls = model.predict([word_id_data, mask_data, type_id_data]) new_data, new_cls = new_model.predict( [word_id_data, mask_data, type_id_data]) # The output should be equal. self.assertAllEqual(data, new_data) self.assertAllEqual(cls, new_cls)
def test_hidden_cls_list(self): hidden_size = 32 sequence_length = 10 vocab_size = 57 embedding_network = Embeddings(vocab_size, hidden_size) call_list = [] hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), "call_list": call_list } mask_call_list = [] mask_cfg = { "call_list": mask_call_list } # Create a small EncoderScaffold for testing. This time, we pass an already- # instantiated layer object. xformer = ValidatedTransformerLayer(**hidden_cfg) xmask = ValidatedMaskLayer(**mask_cfg) test_network_a = encoder_scaffold.EncoderScaffold( num_hidden_instances=3, pooled_output_dim=hidden_size, pooler_layer_initializer=tf.keras.initializers.TruncatedNormal( stddev=0.02), hidden_cls=xformer, mask_cls=xmask, embedding_cls=embedding_network) # Create a network b with same embedding and hidden layers as network a. test_network_b = encoder_scaffold.EncoderScaffold( num_hidden_instances=3, pooled_output_dim=hidden_size, pooler_layer_initializer=tf.keras.initializers.TruncatedNormal( stddev=0.02), mask_cls=xmask, embedding_cls=test_network_a.embedding_network, hidden_cls=test_network_a.hidden_layers) # Create a network c with same embedding but fewer hidden layers compared to # network a and b. hidden_layers = test_network_a.hidden_layers hidden_layers.pop() test_network_c = encoder_scaffold.EncoderScaffold( num_hidden_instances=2, pooled_output_dim=hidden_size, pooler_layer_initializer=tf.keras.initializers.TruncatedNormal( stddev=0.02), mask_cls=xmask, embedding_cls=test_network_a.embedding_network, hidden_cls=hidden_layers) # Create the inputs (note that the first dimension is implicit). word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) # Create model based off of network a: data_a, pooled_a = test_network_a([word_ids, mask]) model_a = tf.keras.Model([word_ids, mask], [data_a, pooled_a]) # Create model based off of network b: data_b, pooled_b = test_network_b([word_ids, mask]) model_b = tf.keras.Model([word_ids, mask], [data_b, pooled_b]) # Create model based off of network b: data_c, pooled_c = test_network_c([word_ids, mask]) model_c = tf.keras.Model([word_ids, mask], [data_c, pooled_c]) batch_size = 3 word_id_data = np.random.randint( vocab_size, size=(batch_size, sequence_length)) mask_data = np.random.randint(2, size=(batch_size, sequence_length)) output_a, _ = model_a.predict([word_id_data, mask_data]) output_b, _ = model_b.predict([word_id_data, mask_data]) output_c, _ = model_c.predict([word_id_data, mask_data]) # Outputs from model a and b should be the same since they share the same # embedding and hidden layers. self.assertAllEqual(output_a, output_b) # Outputs from model a and c shouldn't be the same since they share the same # embedding layer but different number of hidden layers. self.assertNotAllEqual(output_a, output_c)
def test_network_invocation(self): hidden_size = 32 sequence_length = 21 vocab_size = 57 num_types = 7 embedding_cfg = { "vocab_size": vocab_size, "type_vocab_size": num_types, "hidden_size": hidden_size, "seq_length": sequence_length, "max_seq_length": sequence_length, "initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), "dropout_rate": 0.1, } call_list = [] hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), "call_list": call_list } mask_call_list = [] mask_cfg = { "call_list": mask_call_list } # Create a small EncoderScaffold for testing. This time, we pass an already- # instantiated layer object. xformer = ValidatedTransformerLayer(**hidden_cfg) xmask = ValidatedMaskLayer(**mask_cfg) test_network = encoder_scaffold.EncoderScaffold( num_hidden_instances=3, pooled_output_dim=hidden_size, pooler_layer_initializer=tf.keras.initializers.TruncatedNormal( stddev=0.02), hidden_cls=xformer, mask_cls=xmask, embedding_cfg=embedding_cfg) # Create the inputs (note that the first dimension is implicit). word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) data, pooled = test_network([word_ids, mask, type_ids]) # Create a model based off of this network: model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled]) # Invoke the model. We can't validate the output data here (the model is too # complex) but this will catch structural runtime errors. batch_size = 3 word_id_data = np.random.randint( vocab_size, size=(batch_size, sequence_length)) mask_data = np.random.randint(2, size=(batch_size, sequence_length)) type_id_data = np.random.randint( num_types, size=(batch_size, sequence_length)) _ = model.predict([word_id_data, mask_data, type_id_data]) # If call_list[0] exists and is True, the passed layer class was # called as part of the graph creation. self.assertNotEmpty(call_list) self.assertTrue(call_list[0], "The passed layer class wasn't instantiated.")
def test_serialize_deserialize(self): hidden_size = 32 sequence_length = 21 vocab_size = 57 # Build an embedding network to swap in for the default network. This one # will have 2 inputs (mask and word_ids) instead of 3, and won't use # positional embeddings. word_ids = tf.keras.layers.Input( shape=(sequence_length,), dtype=tf.int32, name="input_word_ids") mask = tf.keras.layers.Input( shape=(sequence_length,), dtype=tf.int32, name="input_mask") embedding_layer = layers.OnDeviceEmbedding( vocab_size=vocab_size, embedding_width=hidden_size, initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), name="word_embeddings") word_embeddings = embedding_layer(word_ids) attention_mask = layers.SelfAttentionMask()([word_embeddings, mask]) network = tf.keras.Model([word_ids, mask], [word_embeddings, attention_mask]) hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), } # Create a small EncoderScaffold for testing. test_network = encoder_scaffold.EncoderScaffold( num_hidden_instances=3, pooled_output_dim=hidden_size, pooler_layer_initializer=tf.keras.initializers.TruncatedNormal( stddev=0.02), hidden_cfg=hidden_cfg, embedding_cls=network, embedding_data=embedding_layer.embeddings) # Create another network object from the first object's config. new_network = encoder_scaffold.EncoderScaffold.from_config( test_network.get_config()) # Validate that the config can be forced to JSON. _ = new_network.to_json() # If the serialization was successful, the new config should match the old. self.assertAllEqual(test_network.get_config(), new_network.get_config()) # Create a model based off of the old and new networks: word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) data, pooled = new_network([word_ids, mask]) new_model = tf.keras.Model([word_ids, mask], [data, pooled]) data, pooled = test_network([word_ids, mask]) model = tf.keras.Model([word_ids, mask], [data, pooled]) # Copy the weights between models. new_model.set_weights(model.get_weights()) # Invoke the models. batch_size = 3 word_id_data = np.random.randint( vocab_size, size=(batch_size, sequence_length)) mask_data = np.random.randint(2, size=(batch_size, sequence_length)) data, cls = model.predict([word_id_data, mask_data]) new_data, new_cls = new_model.predict([word_id_data, mask_data]) # The output should be equal. self.assertAllEqual(data, new_data) self.assertAllEqual(cls, new_cls) # We should not be able to get a reference to the embedding data. with self.assertRaisesRegex(RuntimeError, ".*does not have a reference.*"): new_network.get_embedding_table()
def test_network_invocation(self): hidden_size = 32 sequence_length = 21 vocab_size = 57 num_types = 7 embedding_cfg = { "vocab_size": vocab_size, "type_vocab_size": num_types, "hidden_size": hidden_size, "seq_length": sequence_length, "max_seq_length": sequence_length, "initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), "dropout_rate": 0.1, } hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), } # Create a small EncoderScaffold for testing. test_network = encoder_scaffold.EncoderScaffold( num_hidden_instances=3, pooled_output_dim=hidden_size, pooler_layer_initializer=tf.keras.initializers.TruncatedNormal( stddev=0.02), hidden_cfg=hidden_cfg, embedding_cfg=embedding_cfg, dict_outputs=True) # Create the inputs (note that the first dimension is implicit). word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) outputs = test_network([word_ids, mask, type_ids]) # Create a model based off of this network: model = tf.keras.Model([word_ids, mask, type_ids], outputs) # Invoke the model. We can't validate the output data here (the model is too # complex) but this will catch structural runtime errors. batch_size = 3 word_id_data = np.random.randint( vocab_size, size=(batch_size, sequence_length)) mask_data = np.random.randint(2, size=(batch_size, sequence_length)) type_id_data = np.random.randint( num_types, size=(batch_size, sequence_length)) preds = model.predict([word_id_data, mask_data, type_id_data]) self.assertEqual(preds["pooled_output"].shape, (3, hidden_size)) # Creates a EncoderScaffold with max_sequence_length != sequence_length num_types = 7 embedding_cfg = { "vocab_size": vocab_size, "type_vocab_size": num_types, "hidden_size": hidden_size, "seq_length": sequence_length, "max_seq_length": sequence_length * 2, "initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), "dropout_rate": 0.1, } hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), } # Create a small EncoderScaffold for testing. test_network = encoder_scaffold.EncoderScaffold( num_hidden_instances=3, pooled_output_dim=hidden_size, pooler_layer_initializer=tf.keras.initializers.TruncatedNormal( stddev=0.02), hidden_cfg=hidden_cfg, embedding_cfg=embedding_cfg) outputs = test_network([word_ids, mask, type_ids]) model = tf.keras.Model([word_ids, mask, type_ids], outputs) _ = model.predict([word_id_data, mask_data, type_id_data])
def test_network_invocation(self): hidden_size = 32 sequence_length = 21 vocab_size = 57 # Build an embedding network to swap in for the default network. This one # will have 2 inputs (mask and word_ids) instead of 3, and won't use # positional embeddings. word_ids = tf.keras.layers.Input(shape=(sequence_length, ), dtype=tf.int32, name="input_word_ids") mask = tf.keras.layers.Input(shape=(sequence_length, ), dtype=tf.int32, name="input_mask") embedding_layer = layers.OnDeviceEmbedding( vocab_size=vocab_size, embedding_width=hidden_size, initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), name="word_embeddings") word_embeddings = embedding_layer(word_ids) network = tf.keras.Model([word_ids, mask], [word_embeddings, mask]) hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), } # Create a small EncoderScaffold for testing. test_network = encoder_scaffold.EncoderScaffold( num_hidden_instances=3, num_output_classes=hidden_size, classification_layer_initializer=tf.keras.initializers. TruncatedNormal(stddev=0.02), hidden_cfg=hidden_cfg, embedding_cls=network, embedding_data=embedding_layer.embeddings) # Create the inputs (note that the first dimension is implicit). word_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) data, pooled = test_network([word_ids, mask]) # Create a model based off of this network: model = tf.keras.Model([word_ids, mask], [data, pooled]) # Invoke the model. We can't validate the output data here (the model is too # complex) but this will catch structural runtime errors. batch_size = 3 word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length)) mask_data = np.random.randint(2, size=(batch_size, sequence_length)) _ = model.predict([word_id_data, mask_data]) # Test that we can get the embedding data that we passed to the object. This # is necessary to support standard language model training. self.assertIs(embedding_layer.embeddings, test_network.get_embedding_table())