def test_network_invocation(self, output_range, out_seq_len): hidden_size = 32 sequence_length = 21 vocab_size = 57 num_types = 7 # Create a small TransformerEncoder for testing. test_network = transformer_encoder.TransformerEncoder( vocab_size=vocab_size, hidden_size=hidden_size, sequence_length=sequence_length, num_attention_heads=2, num_layers=3, type_vocab_size=num_types, output_range=output_range) self.assertTrue( test_network._position_embedding_layer._use_dynamic_slicing) # Create the inputs (note that the first dimension is implicit). word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) data, pooled = test_network([word_ids, mask, type_ids]) # Create a model based off of this network: model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled]) # Invoke the model. We can't validate the output data here (the model is too # complex) but this will catch structural runtime errors. batch_size = 3 word_id_data = np.random.randint( vocab_size, size=(batch_size, sequence_length)) mask_data = np.random.randint(2, size=(batch_size, sequence_length)) type_id_data = np.random.randint( num_types, size=(batch_size, sequence_length)) _ = model.predict([word_id_data, mask_data, type_id_data]) # Creates a TransformerEncoder with max_sequence_length != sequence_length max_sequence_length = 128 test_network = transformer_encoder.TransformerEncoder( vocab_size=vocab_size, hidden_size=hidden_size, sequence_length=sequence_length, max_sequence_length=max_sequence_length, num_attention_heads=2, num_layers=3, type_vocab_size=num_types) self.assertTrue(test_network._position_embedding_layer._use_dynamic_slicing) model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled]) outputs = model.predict([word_id_data, mask_data, type_id_data]) self.assertEqual(outputs[0].shape[1], out_seq_len)
def test_network_creation_with_float16_dtype(self): hidden_size = 32 sequence_length = 21 tf.keras.mixed_precision.experimental.set_policy("mixed_float16") # Create a small TransformerEncoder for testing. test_network = transformer_encoder.TransformerEncoder( vocab_size=100, hidden_size=hidden_size, sequence_length=sequence_length, num_attention_heads=2, num_layers=3) # Create the inputs (note that the first dimension is implicit). word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) data, pooled = test_network([word_ids, mask, type_ids]) expected_data_shape = [None, sequence_length, hidden_size] expected_pooled_shape = [None, hidden_size] self.assertAllEqual(expected_data_shape, data.shape.as_list()) self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list()) # If float_dtype is set to float16, the data output is float32 (from a layer # norm) and pool output should be float16. self.assertAllEqual(tf.float32, data.dtype) self.assertAllEqual(tf.float16, pooled.dtype)
def test_all_encoder_outputs_network_creation(self): hidden_size = 32 sequence_length = 21 # Create a small TransformerEncoder for testing. test_network = transformer_encoder.TransformerEncoder( vocab_size=100, hidden_size=hidden_size, sequence_length=sequence_length, num_attention_heads=2, num_layers=3, return_all_encoder_outputs=True) # Create the inputs (note that the first dimension is implicit). word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) all_encoder_outputs, pooled = test_network([word_ids, mask, type_ids]) expected_data_shape = [None, sequence_length, hidden_size] expected_pooled_shape = [None, hidden_size] self.assertLen(all_encoder_outputs, 3) for data in all_encoder_outputs: self.assertAllEqual(expected_data_shape, data.shape.as_list()) self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list()) # The default output dtype is float32. self.assertAllEqual(tf.float32, all_encoder_outputs[-1].dtype) self.assertAllEqual(tf.float32, pooled.dtype)
def create_network(self, vocab_size, sequence_length, hidden_size, num_predictions, output='predictions', xformer_stack=None): # First, create a transformer stack that we can use to get the LM's # vocabulary weight. if xformer_stack is None: xformer_stack = transformer_encoder.TransformerEncoder( vocab_size=vocab_size, num_layers=1, sequence_length=sequence_length, hidden_size=hidden_size, num_attention_heads=4, ) word_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) type_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) lm_outputs, _ = xformer_stack([word_ids, mask, type_ids]) # Create a maskedLM from the transformer stack. test_network = masked_lm.MaskedLM(num_predictions=num_predictions, input_width=lm_outputs.shape[-1], source_network=xformer_stack, output=output) return test_network
def test_serialize_deserialize(self): tf.keras.mixed_precision.experimental.set_policy("mixed_float16") # Create a network object that sets all of its config options. kwargs = dict( vocab_size=100, hidden_size=32, num_layers=3, num_attention_heads=2, sequence_length=21, max_sequence_length=21, type_vocab_size=12, intermediate_size=1223, activation="relu", dropout_rate=0.05, attention_dropout_rate=0.22, initializer="glorot_uniform", return_all_encoder_outputs=False, output_range=-1) network = transformer_encoder.TransformerEncoder(**kwargs) expected_config = dict(kwargs) expected_config["activation"] = tf.keras.activations.serialize( tf.keras.activations.get(expected_config["activation"])) expected_config["initializer"] = tf.keras.initializers.serialize( tf.keras.initializers.get(expected_config["initializer"])) self.assertEqual(network.get_config(), expected_config) # Create another network object from the first object's config. new_network = transformer_encoder.TransformerEncoder.from_config( network.get_config()) # Validate that the config can be forced to JSON. _ = new_network.to_json() # If the serialization was successful, the new config should match the old. self.assertAllEqual(network.get_config(), new_network.get_config())
def test_network_invocation_with_external_logits(self): vocab_size = 100 sequence_length = 32 hidden_size = 64 num_predictions = 21 xformer_stack = transformer_encoder.TransformerEncoder( vocab_size=vocab_size, num_layers=1, sequence_length=sequence_length, hidden_size=hidden_size, num_attention_heads=4, ) test_network = self.create_network(vocab_size=vocab_size, sequence_length=sequence_length, hidden_size=hidden_size, num_predictions=num_predictions, xformer_stack=xformer_stack, output='predictions') logit_network = self.create_network(vocab_size=vocab_size, sequence_length=sequence_length, hidden_size=hidden_size, num_predictions=num_predictions, xformer_stack=xformer_stack, output='logits') logit_network.set_weights(test_network.get_weights()) # Create a model from the masked LM layer. lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size)) masked_lm_positions = tf.keras.Input(shape=(num_predictions, ), dtype=tf.int32) output = test_network([lm_input_tensor, masked_lm_positions]) logit_output = logit_network([lm_input_tensor, masked_lm_positions]) model = tf.keras.Model([lm_input_tensor, masked_lm_positions], output) logits_model = tf.keras.Model(([lm_input_tensor, masked_lm_positions]), logit_output) # Invoke the masked LM on some fake data to make sure there are no runtime # errors in the code. batch_size = 3 lm_input_data = 10 * np.random.random_sample( (batch_size, sequence_length, hidden_size)) masked_position_data = np.random.randint(2, size=(batch_size, num_predictions)) outputs = model.predict([lm_input_data, masked_position_data]) logits = logits_model.predict([lm_input_data, masked_position_data]) # Ensure that the tensor shapes are correct. expected_output_shape = (batch_size, num_predictions, vocab_size) self.assertEqual(expected_output_shape, outputs.shape) self.assertEqual(expected_output_shape, logits.shape) # Ensure that the logits, when softmaxed, create the outputs. input_tensor = tf.keras.Input(expected_output_shape[1:]) output_tensor = tf.keras.layers.Activation( tf.nn.log_softmax)(input_tensor) softmax_model = tf.keras.Model(input_tensor, output_tensor) calculated_softmax = softmax_model.predict(logits) self.assertAllClose(outputs, calculated_softmax)