Example #1
    def testTransformerAttentionLayerReference(self):
        depth = 4
        p = layers_with_attention.TransformerAttentionLayer.Params()
        p.name = 'transformer_atten'
        p.source_dim = depth
        p.is_masked = False
        p.num_attention_heads = 2
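        # Fix the attention weight initialization (Gaussian, scale 0.1,
        # seed 12345) so the reference outputs from
        # _ExpectedSingleSourceResults() stay reproducible.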
        p.atten_tpl.params_init = py_utils.WeightInit.Gaussian(0.1, 12345)
        transformer_atten_ref = layers_with_attention.TransformerAttentionLayer(
            p)

        (query_vec, _, aux_vecs,
         aux_paddings) = self._TransformerSingleSourceInputs(depth)

        ctx_ref, probs_ref = transformer_atten_ref.FPropDefaultTheta(
            query_vec, aux_paddings, aux_vecs)

        expected_ctx, expected_probs = self._ExpectedSingleSourceResults()
        with self.session(use_gpu=True) as sess:
            tf.global_variables_initializer().run()
            actual_ctx_ref, actual_probs_ref = sess.run([ctx_ref, probs_ref])
            tf.logging.info(np.array_repr(actual_ctx_ref))
            tf.logging.info(np.array_repr(actual_probs_ref))
            self.assertAllClose(expected_ctx, actual_ctx_ref)
            self.assertAllClose(expected_probs, actual_probs_ref)
Example #2

  def testTransformerAttentionLayerDeterministicDropout(self):
    with self.session(use_gpu=True) as sess:
      # Needed to generate a seed pair.
      py_utils.ResetStepSeed()
      py_utils.GetOrCreateGlobalStep()

      depth = 4
      p = layers_with_attention.TransformerAttentionLayer.Params()
      p.name = 'transformer_atten'
      p.source_dim = depth
      p.is_masked = False
      p.num_attention_heads = 2

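      # Swap in deterministic (seed-driven) dropout on the residual
      # connection so the expected values below are reproducible across runs.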
      p.residual_dropout_tpl = layers.DeterministicDropoutLayer.Params()
      p.residual_dropout_prob = 0.1

      transformer_atten = layers_with_attention.TransformerAttentionLayer(p)

      (source_vecs, source_padding, _,
       _) = self._testTransformerAttentionLayerInputs(depth=depth)

      ctx, probs = transformer_atten.FProp(transformer_atten.theta, source_vecs,
                                           source_padding)

      tf.global_variables_initializer().run()
      actual_ctx, actual_probs = sess.run([ctx, probs])

      # pylint: disable=bad-whitespace
      # pyformat: disable
      print(np.array_repr(actual_ctx))
      expected_ctx = np.array([
          [[-1.45762944,  1.5337404 ,  0.34037334, -0.97208667],
           [-1.35992002, -1.06530988,  1.53705895,  2.79370689]],
          [[ 0.00657134,  1.12030125, -1.32564592, -1.73569465],
           [-0.80793667, -0.10877949, -0.80295694,  2.25494242]],
          [[ 1.76956046, -0.50777751, -1.19745886, -1.46751583],
           [-1.79178905, -0.77374339,  1.31586027,  2.98173356]],
          [[-0.85498607, -0.37413225,  1.25707364, -0.50043333],
           [ 1.62276983,  0.50820369, -1.52967572, -2.02076197]],
          [[-0.66754031, -0.68657839, -0.51643699,  1.96581018],
           [-1.4816376 ,  0.89419198, -0.57226259,  1.90177512]]
      ], dtype=np.float32)

      print(np.array_repr(actual_probs))
      expected_probs = np.array([
          [[ 0.21387868,  0.22080734,  0.        ,  0.        ,  0.56531399],
           [ 0.        ,  0.30584112,  0.24723588,  0.44692296,  0.        ]],
          [[ 0.25358215,  0.50932312,  0.        ,  0.        ,  0.23709476],
           [ 0.        ,  0.56834149,  0.2632803 ,  0.16837817,  0.        ]],
          [[ 0.38519409,  0.55454361,  0.        ,  0.        ,  0.06026226],
           [ 0.        ,  0.33708778,  0.21976741,  0.4431448 ,  0.        ]],
          [[ 0.27139962,  0.12790371,  0.        ,  0.        ,  0.60069668],
           [ 0.        ,  0.31849149,  0.28174096,  0.39976761,  0.        ]],
          [[ 0.16272782,  0.15781289,  0.        ,  0.        ,  0.67945927],
           [ 0.        ,  0.55003977,  0.26049581,  0.18946445,  0.        ]]
      ], dtype=np.float32)
      # pyformat: enable
      # pylint: enable=bad-whitespace
      self.assertAllClose(expected_ctx, actual_ctx, rtol=1e-05, atol=1e-05)
      self.assertAllClose(expected_probs, actual_probs, rtol=1e-05, atol=1e-05)
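
The examples below (and the dropout example above) build their inputs with self._testTransformerAttentionLayerInputs, which is not shown on this page. The sketch below is a hedged reconstruction of such a helper, inferred from the shapes and padding patterns visible in the expected outputs (5 source steps and 7 auxiliary steps over a batch of 2); the actual seed and random values used by the real tests may differ.

    def _testTransformerAttentionLayerInputs(self, depth=4, dtype=tf.float32):
        # Arbitrary fixed seed; the real helper uses its own seed so that the
        # expected values in these tests stay reproducible.
        np.random.seed(12345)
        # Source sequence: 5 time steps, batch of 2, feature size `depth`.
        source_vecs = tf.stack(
            [tf.constant(np.random.rand(2, depth), dtype=dtype) for _ in range(5)])
        # 1.0 marks padded positions; shape [time, batch]. This pattern matches
        # the zeroed columns of expected_probs in the self-attention examples.
        source_padding = tf.transpose(
            tf.constant([[0, 0, 1, 1, 0], [1, 0, 0, 0, 1]], dtype=dtype))
        # Auxiliary (cross-attention) source: 7 time steps, same batch.
        aux_source_vecs = tf.stack(
            [tf.constant(np.random.rand(2, depth), dtype=dtype) for _ in range(7)])
        aux_source_paddings = tf.transpose(
            tf.constant([[0, 1, 0, 1, 0, 1, 0], [1, 0, 1, 0, 1, 0, 1]], dtype=dtype))
        return source_vecs, source_padding, aux_source_vecs, aux_source_paddings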
Example #3
    def testTransformerAttentionLayerCase3(self):
        with self.session(use_gpu=True) as sess:
            depth = 4
            p = layers_with_attention.TransformerAttentionLayer.Params()
            p.name = 'transformer_atten'
            p.source_dim = depth
            p.is_masked = False
            p.num_attention_heads = 2
            transformer_atten = layers_with_attention.TransformerAttentionLayer(
                p)

            (query_vec, _, aux_vecs,
             aux_paddings) = self._testTransformerAttentionLayerInputs(
                 depth=depth)

            ctx, probs = transformer_atten.FPropDefaultTheta(
                query_vec, aux_paddings, aux_vecs)
            tf.global_variables_initializer().run()
            actual_ctx, actual_probs = sess.run([ctx, probs])
            tf.logging.info(np.array_repr(actual_ctx))
            tf.logging.info(np.array_repr(actual_probs))
            # pylint: disable=bad-whitespace
            # pyformat: disable
            expected_ctx = [
                [[-1.42420077, 1.19024372, 1.35146523, 0.85896158],
                 [-0.44974625, -1.00108492, 1.63387251, 1.678146]],
                [[0.1134335, 1.97617495, -0.35918081, 0.26396495],
                 [-0.19688171, -0.71197301, 0.0659425, 2.5417304]],
                [[1.58169425, 0.81259179, -0.58948535, 0.20254248],
                 [-0.84438968, -0.65845209, 1.45584249, 1.87587976]],
                [[-1.01532316, -0.05166581, 2.07901478, 0.97540361],
                 [2.08563352, 0.34328598, -0.23240227, -0.19035631]],
                [[-0.53881919, -0.60117185, 0.29170275, 2.6474514],
                 [-0.88318163, 0.37149727, -0.16098523, 2.3810885]]
            ]
            expected_probs = [
                [[0.32392544, 0., 0.27218491, 0., 0.19574419, 0., 0.20814547],
                 [0., 0.273045, 0., 0.43572819, 0., 0.2912268, 0.]],
                [[0.24094662, 0., 0.23919827, 0., 0.26563686, 0., 0.25421822],
                 [0., 0.21680018, 0., 0.33962148, 0., 0.44357836, 0.]],
                [[0.20083594, 0., 0.20683075, 0., 0.28931937, 0., 0.30301392],
                 [0., 0.24710922, 0., 0.453915, 0., 0.29897571, 0.]],
                [[0.32845193, 0., 0.26491433, 0., 0.18304622, 0., 0.22358747],
                 [0., 0.39426237, 0., 0.19774443, 0., 0.4079932, 0.]],
                [[0.23542665, 0., 0.27910906, 0., 0.30036426, 0., 0.18510005],
                 [0., 0.20147586, 0., 0.37759233, 0., 0.42093182, 0.]]
            ]
            # pyformat: enable
            # pylint: enable=bad-whitespace
            self.assertAllClose(expected_ctx,
                                actual_ctx,
                                rtol=1e-05,
                                atol=1e-05)
            self.assertAllClose(expected_probs,
                                actual_probs,
                                rtol=1e-05,
                                atol=1e-05)
Example #4
    def testTransformerAttentionLayerCase1(self):
        with self.session(use_gpu=True) as sess:
            depth = 4
            p = layers_with_attention.TransformerAttentionLayer.Params()
            p.name = 'transformer_atten'
            p.source_dim = depth
            p.is_masked = False
            p.num_attention_heads = 2
            transformer_atten = layers_with_attention.TransformerAttentionLayer(
                p)

            (source_vecs, source_padding, _,
             _) = self._testTransformerAttentionLayerInputs(depth=depth)

            ctx, probs = transformer_atten.FPropDefaultTheta(
                source_vecs, source_padding)
            tf.global_variables_initializer().run()
            actual_ctx, actual_probs = sess.run([ctx, probs])
            # pylint: disable=bad-whitespace
            # pyformat: disable
            expected_ctx = [
                [[-1.47126436, 1.46579707, 0.39105844, -0.88563323],
                 [-1.29514003, -1.08241224, 1.49894714, 2.5935874]],
                [[-0.00313053, 1.17399275, -1.28071034, -1.6311729],
                 [-0.77028418, -0.18855178, -0.75814998, 2.19872856]],
                [[1.72851753, -0.40323859, -1.19053328, -1.39761829],
                 [-1.72141743, -0.78715289, 1.28404212, 2.78338313]],
                [[-0.8881942, 0.33776048, 1.28791749, -0.45082122],
                 [1.4362365, 0.46009994, -1.45436597, -1.90602148]],
                [[-0.51681399, -0.70075679, -0.48352116, 1.93754733],
                 [-1.44486678, 0.81801879, -1.03079689, 1.86697066]]
            ]
            expected_probs = [[[0.21387868, 0.22080734, 0., 0., 0.56531399],
                               [0., 0.30584112, 0.24723588, 0.44692296, 0.]],
                              [[0.25358215, 0.50932312, 0., 0., 0.23709476],
                               [0., 0.56834149, 0.2632803, 0.16837817, 0.]],
                              [[0.38519409, 0.55454361, 0., 0., 0.06026226],
                               [0., 0.33708778, 0.21976741, 0.4431448, 0.]],
                              [[0.27139962, 0.12790371, 0., 0., 0.60069668],
                               [0., 0.31849149, 0.28174096, 0.39976761, 0.]],
                              [[0.16272782, 0.15781289, 0., 0., 0.67945927],
                               [0., 0.55003977, 0.26049581, 0.18946445, 0.]]]
            # pyformat: enable
            # pylint: enable=bad-whitespace
            self.assertAllClose(expected_ctx,
                                actual_ctx,
                                rtol=1e-05,
                                atol=1e-05)
            self.assertAllClose(expected_probs,
                                actual_probs,
                                rtol=1e-05,
                                atol=1e-05)
Example #5
    def testTransformerAttentionLayerCase2(self):
        with self.session(use_gpu=True) as sess:
            depth = 4
            p = layers_with_attention.TransformerAttentionLayer.Params()
            p.name = 'transformer_atten'
            p.source_dim = depth
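            # Masked (causal) self-attention: each position may only attend to
            # itself and earlier positions, which is why expected_probs below
            # is lower triangular along the time axis.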
            p.is_masked = True
            p.num_attention_heads = 2
            transformer_atten = layers_with_attention.TransformerAttentionLayer(
                p)

            (source_vecs, source_padding, _,
             _) = self._testTransformerAttentionLayerInputs(depth=depth)
            ctx, probs = transformer_atten.FPropDefaultTheta(
                source_vecs, source_padding)
            tf.global_variables_initializer().run()
            actual_ctx, actual_probs = sess.run([ctx, probs])
            tf.logging.info(np.array_repr(actual_ctx))
            tf.logging.info(np.array_repr(actual_probs))
            # pylint: disable=bad-whitespace
            # pyformat: disable
            expected_ctx = [
                [[-0.14429152, 1.15510106, 1.11930299, -1.19245839],
                 [-0.69580591, -0.47006619, 0.82592297, 0.69593251]],
                [[0.24164687, 0.53328454, -1.02119482, -1.49412084],
                 [-0.82601064, 0.024203, -1.11880171, 1.80784416]],
                [[1.7644347, -0.53346401, -1.1461122, -1.42797422],
                 [-0.95326459, 0.39580142, 0.39262164, 0.67513674]],
                [[-0.28252155, -0.95237327, 2.08757687, -0.21231559],
                 [1.4362365, 0.46009994, -1.45436597, -1.90602148]],
                [[-0.51681399, -0.70075679, -0.48352116, 1.93754733],
                 [-1.44486678, 0.81801879, -1.03079689, 1.86697066]]
            ]
            expected_probs = [[[1., 0., 0., 0., 0.], [0.2, 0.2, 0.2, 0.2,
                                                      0.2]],
                              [[0.3966811, 0.60331887, 0., 0., 0.],
                               [0., 1., 0., 0., 0.]],
                              [[0.41050252, 0.58949745, 0., 0., 0.],
                               [0., 0.5245893, 0.4754107, 0., 0.]],
                              [[0.58882225, 0.41117775, 0., 0., 0.],
                               [0., 0.31849149, 0.28174096, 0.39976761, 0.]],
                              [[0.16272782, 0.15781289, 0., 0., 0.67945927],
                               [0., 0.55003977, 0.26049581, 0.18946445, 0.]]]
            # pyformat: enable
            # pylint: enable=bad-whitespace
            self.assertAllClose(expected_ctx, actual_ctx)
            self.assertAllClose(expected_probs, actual_probs)
Example #6
    def testTransformerAttentionLayerStepByStep(self):
        with self.session(use_gpu=True) as sess:
            depth = 4
            p = layers_with_attention.TransformerAttentionLayer.Params()
            p.name = 'transformer_atten'
            p.source_dim = depth
            p.is_masked = True
            p.num_attention_heads = 2
            x_atten = layers_with_attention.TransformerAttentionLayer(p)

            (source_vecs, _, _,
             _) = self._testTransformerAttentionLayerInputs(depth=depth)
            source_padding = tf.zeros([5, 2])

            ctx1, probs1 = x_atten.FPropDefaultTheta(source_vecs,
                                                     source_padding)
            ctx2 = []
            probs2 = []
            cached_source_vecs = tf.zeros([0, 2, 4])
            cached_source_contexts = tf.zeros([0, 2, 4])
            prefix_states = py_utils.NestedMap(key=cached_source_vecs,
                                               value=cached_source_contexts)
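            # Decode one step at a time. ExtendStep consumes a single time
            # step plus the cached prefix (key/value) states and returns the
            # context, the attention probs over the prefix so far, and the
            # updated states; the probs are zero-padded to length 5 so they
            # can be compared against the full-sequence FProp result.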
            for i in range(5):
                ctx, probs, prefix_states = x_atten.ExtendStep(
                    x_atten.theta, source_vecs[i, :, :], prefix_states)
                probs_pad = tf.zeros([2, 5 - i - 1])
                padded_probs = tf.concat([probs, probs_pad], 1)
                ctx2.append(ctx)
                probs2.append(padded_probs)

            ctx2 = tf.stack(ctx2)
            probs2 = tf.stack(probs2)

            tf.global_variables_initializer().run()
            ctx1_v, probs1_v, ctx2_v, probs2_v = sess.run(
                [ctx1, probs1, ctx2, probs2])
            tf.logging.info(np.array_repr(ctx1_v))
            tf.logging.info(np.array_repr(probs1_v))
            tf.logging.info(np.array_repr(ctx2_v))
            tf.logging.info(np.array_repr(probs2_v))
            self.assertAllClose(ctx1_v, ctx2_v)
            self.assertAllClose(probs1_v, probs2_v)