def test_residual_block_ordering(self):
  inputs = tf.constant([[1.0, -1.0], [0.5, -1.5]])

  inner_layer = tf.keras.layers.ReLU()
  normalization_layer = tf.keras.layers.Lambda(lambda x: 2 * x)

  # Default order: output = normalization(inputs + inner(inputs)).
  residual_block_default_order = etc_layers.ResidualBlock(
      inner_layer=inner_layer,
      normalization_layer=normalization_layer,
      use_pre_activation_order=False)
  default_order_result = residual_block_default_order(inputs)

  # Pre-activation order: output = inputs + inner(normalization(inputs)).
  residual_block_pre_act_order = etc_layers.ResidualBlock(
      inner_layer=inner_layer,
      normalization_layer=normalization_layer,
      use_pre_activation_order=True)
  pre_act_order_result = residual_block_pre_act_order(inputs)

  self.evaluate(tf.compat.v1.global_variables_initializer())

  self.assertAllClose([[4.0, -2.0], [2.0, -3.0]], default_order_result)
  self.assertAllClose([[3.0, -1.0], [1.5, -1.5]], pre_act_order_result)
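# The tests below take a `use_pre_activation_order` argument. This is an
# assumption about the enclosing test class: the argument is presumably
# supplied by a parameterized-test decorator along the lines of this
# hypothetical sketch, running each test once per ordering mode:
#
# @parameterized.named_parameters(
#     ('default_order', False),
#     ('pre_activation_order', True),
# )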
def test_residual_block_with_relative_attention(self,
                                                use_pre_activation_order):
  np.random.seed(1234)

  batch_size = 2
  seq_len = 4
  hidden_size = 10

  inputs = tf.constant(
      np.random.normal(size=[batch_size, seq_len, hidden_size]), tf.float32)
  att_mask = tf.stack([
      # Force each element in the first example to only attend to itself.
      tf.eye(seq_len, dtype=tf.int32),
      # The second example can attend everywhere.
      tf.ones([seq_len, seq_len], dtype=tf.int32)
  ])

  inner_layer = etc_layers.RelativeAttention(
      hidden_size=hidden_size,
      num_heads=2,
      relative_vocab_size=2,
      initializer=tf.keras.initializers.Identity())

  residual_block = etc_layers.ResidualBlock(
      inner_layer=inner_layer,
      normalization_layer=tf.keras.layers.Lambda(lambda x: x),
      dropout_probability=0.0,
      use_pre_activation_order=use_pre_activation_order)

  relative_att_ids1 = tf.zeros([batch_size, seq_len, seq_len], dtype=tf.int32)
  result1 = residual_block(
      inputs, att_mask=att_mask, relative_att_ids=relative_att_ids1)

  self.evaluate(tf.compat.v1.global_variables_initializer())

  # The first example attends only to itself, so with identity-initialized
  # attention its output is just the input plus itself.
  self.assertAllClose(2 * inputs[0], result1[0])

  # Relative ids only change attention scores, so the first example (which
  # attends to a single position) is unaffected, while the second changes.
  relative_att_ids2 = tf.tile([[[0, 1, 0, 1]]], [batch_size, seq_len, 1])
  result2 = residual_block(
      inputs, att_mask=att_mask, relative_att_ids=relative_att_ids2)

  self.assertAllClose(result1[0], result2[0])
  self.assertNotAllClose(result1[1], result2[1])
def test_residual_block_training_vs_inference_dropout(
    self, use_pre_activation_order):
  tf.compat.v1.random.set_random_seed(1234)
  np.random.seed(1234)

  batch_size = 3
  input_size = 10

  inputs = tf.constant(np.random.normal(size=[batch_size, input_size]))

  residual_block = etc_layers.ResidualBlock(
      dropout_probability=0.5,
      use_pre_activation_order=use_pre_activation_order)

  # Dropout is inactive during inference, so repeated calls are deterministic.
  inference_output1 = residual_block(inputs, training=False)
  inference_output2 = residual_block(inputs, training=False)

  self.evaluate(tf.compat.v1.global_variables_initializer())

  self.assertAllClose(inference_output1, inference_output2)

  # Dropout makes this non-deterministic.
  training_output1 = residual_block(inputs, training=True)
  training_output2 = residual_block(inputs, training=True)
  self.assertNotAllClose(training_output1, training_output2)
def test_residual_block_training_vs_inference_normalization_layer(
    self, use_pre_activation_order):
  np.random.seed(1234)

  batch_size = 3
  input_size = 10

  inputs = tf.constant(np.random.normal(size=[batch_size, input_size]))

  residual_block = etc_layers.ResidualBlock(
      normalization_layer=tf.keras.layers.BatchNormalization(),
      dropout_probability=0.0,
      use_pre_activation_order=use_pre_activation_order)

  inference_output1 = residual_block(inputs, training=False)
  inference_output2 = residual_block(inputs, training=False)

  self.evaluate(tf.compat.v1.global_variables_initializer())

  self.assertAllClose(inference_output1, inference_output2)

  training_output1 = residual_block(inputs, training=True)
  training_output2 = residual_block(inputs, training=True)
  self.assertAllClose(training_output1, training_output2)

  # Batch normalization gives different results for training vs. inference.
  self.assertNotAllClose(inference_output1, training_output1)