def testTransformerStackFPropWithPackedInputs(self):
  # batch = 2, time = 2, depth = 2
  with self.session(use_gpu=True) as sess:
    with tf.variable_scope('packing_test', reuse=tf.AUTO_REUSE):
      params = self._TransformerParams()
      xformer = mt_layers.TransformerStack(params)
      packed_params = params.Copy()
      packed_params.packed_input = True
      xformer_packed = mt_layers.TransformerStack(packed_params)

      input_arr = np.array([[[0, 1], [1, -1]], [[1, 2], [-2, -1]]], dtype=int)
      paddings_arr = np.array([[0, 0], [0, 0]], dtype=int)
      seg_id_arr = np.array([[0, 1, 0, 1]], dtype=int)

      inputs = tf.constant(input_arr.tolist(), dtype=tf.float32)
      paddings = tf.constant(paddings_arr.tolist(), dtype=tf.float32)
      # Pack the [time=2, batch=2] inputs into a single [time=4, batch=1]
      # sequence. seg_id records which original sequence each packed
      # position came from, so attention cannot cross segment boundaries.
      inputs_packed = tf.reshape(inputs, [-1, 1, 2])
      paddings_packed = tf.reshape(paddings, [-1, 1])
      seg_id = tf.transpose(
          tf.constant(seg_id_arr.tolist(), dtype=tf.float32))

      output, _, _ = xformer.FProp(xformer.theta, inputs, paddings)

      output_packed, _, _ = xformer_packed.FProp(
          xformer_packed.theta, inputs_packed, paddings_packed, seg_id)
      output_packed = tf.reshape(output_packed, tf.shape(output))

      tf.global_variables_initializer().run()
      output, output_packed = sess.run([output, output_packed])
      # The packed stack must reproduce the unpacked outputs exactly.
      self.assertAllClose(output_packed, output)
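
# The tests in this section rely on a self._TransformerParams() helper that
# is defined elsewhere in the test class and not shown in this excerpt. The
# sketch below is a hypothetical reconstruction inferred from how the tests
# use it (depth-2 inputs, single-head attention); the exact values, in
# particular hidden_dim and random_seed, are assumptions, so it is left
# commented out rather than presented as the real helper.
#
# def _TransformerParams(self):
#   params = mt_layers.TransformerStack.Params()
#   params.name = 'transformer'
#   params.model_dim = 2  # matches the depth-2 test inputs
#   params.num_transformer_layers = 1
#   params.transformer_tpl.tr_atten_tpl.num_attention_heads = 1
#   params.transformer_tpl.tr_fflayer_tpl.hidden_dim = 2
#   params.random_seed = 12345  # fixed seed keeps golden values stable
#   return params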
def testTransparentTransformerStackEvalFProp(self):
  # time = 2, batch = 1
  with self.session(use_gpu=True) as sess, self.SetEval(True):
    params = self._TransformerParams()
    params.is_transparent = True
    params.num_transparent_outputs = 2
    xformer = mt_layers.TransformerStack(params)

    input_arr = np.array([[[0, 1]], [[1, -1]]], dtype=int)
    paddings_arr = np.array([[0], [0]], dtype=int)
    inputs = tf.constant(input_arr.tolist(), dtype=tf.float32)
    paddings = tf.constant(paddings_arr.tolist(), dtype=tf.float32)

    self.evaluate(tf.global_variables_initializer())
    outputs, _, _ = xformer.FPropDefaultTheta(inputs, paddings)
    out = sess.run(outputs)
    # Check both transparent outputs (slices of the trailing axis); they
    # coincide here and match the same golden values.
    self.assertAllClose([[[1.38054, -1.37836]], [[-0.811525, 1.183977]]],
                        out[:, :, :, 0])
    self.assertAllClose([[[1.38054, -1.37836]], [[-0.811525, 1.183977]]],
                        out[:, :, :, 1])
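
# Note on transparent stacks: with is_transparent=True the stack emits a
# learned weighted combination of every layer's output rather than just the
# top layer's, and num_transparent_outputs controls how many independent
# combinations are produced (e.g. one per consumer). The FProp result
# therefore carries a trailing axis of size num_transparent_outputs, which
# the assertions above index with out[:, :, :, i]. A hypothetical way to
# split the combinations apart inside a graph:
#
#   per_output = tf.unstack(outputs, axis=-1)  # list of [time, batch, dim]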
def testTransformerStackAlternateLayers(self):
  batch = 3
  tf.flags.FLAGS.tpu_compatible = True
  with self.session(use_gpu=False):
    model_dim = 2
    num_transformer_layers = 2
    transformer_tpl = layers_with_attention.TransformerLayer.Params()
    transformer_tpl.tr_atten_tpl.num_attention_heads = 1
    transformer_tpl.tr_fflayer_tpl.hidden_dim = 2
    # Pass a list of per-layer templates instead of a single shared one.
    params = mt_layers.TransformerStack.Params().Set(
        name='transformer',
        model_dim=model_dim,
        num_transformer_layers=num_transformer_layers,
        transformer_tpl=[
            transformer_tpl.Copy() for _ in range(num_transformer_layers)
        ],
        random_seed=123456)
    xformer = mt_layers.TransformerStack(params)

    input_arr = np.array([
        [[0, 1]] * batch,
        [[1, -1]] * batch,
    ], dtype=int)
    paddings_arr = np.array([[0] * batch, [0] * batch], dtype=int)
    inputs = tf.constant(
        input_arr.tolist(), dtype=py_utils.FPropDtype(params))
    paddings = tf.constant(
        paddings_arr.tolist(), dtype=py_utils.FPropDtype(params))

    output, _, _ = xformer.FProp(xformer.theta, inputs, paddings)
    self.evaluate(tf.global_variables_initializer())
    output = self.evaluate(output)
    print(repr(output))  # Handy when regenerating the golden values below.
    self.assertAllCloseAccordingToType(
        np.array([[[-0.940543, 1.479253]] * batch,
                  [[-0.413938, -2.550903]] * batch]), output)
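
# The list-valued transformer_tpl exercised above also allows each layer to
# be configured differently. The sketch below is a hypothetical variant (not
# from the original file) giving the two layers different feed-forward
# widths; the name and dims are illustrative assumptions, and only
# construction is checked since no golden outputs exist for this config.
def testTransformerStackPerLayerTemplatesSketch(self):
  tpls = []
  for hidden_dim in (2, 4):  # a different hidden width per layer (assumed)
    tpl = layers_with_attention.TransformerLayer.Params()
    tpl.tr_atten_tpl.num_attention_heads = 1
    tpl.tr_fflayer_tpl.hidden_dim = hidden_dim
    tpls.append(tpl)
  params = mt_layers.TransformerStack.Params().Set(
      name='transformer_per_layer',
      model_dim=2,
      num_transformer_layers=2,
      transformer_tpl=tpls)
  xformer = mt_layers.TransformerStack(params)
  self.assertIsNotNone(xformer)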