Example #1
    def test_ids_to_embedding_correctlyEmbeds(self):
        seq_len = 6
        vocab_size = 5
        model_size = 2

        vocab_dim = mtf.Dimension('vocab', vocab_size)
        model_dim = mtf.Dimension('model', model_size)
        length_dim = mtf.Dimension('length', seq_len)

        ids = tf.constant([0, 1, 2, 3, 4, 0], dtype=tf.int32)
        mtf_ids = mtf.import_tf_tensor(self.mesh,
                                       ids,
                                       shape=mtf.Shape([length_dim]))

        self.initializer_mock.side_effect = initialize_by_shape({
            (3, 2): [[0, 1], [2, 0], [-1000, -4000]],
            (3, 1): [[1], [2], [3]],
            (1, 2): [[1, 2]],
        })

        vocab_embedding = adaptive_softmax.AdaptiveSoftmaxVocabEmbedding(
            self.mesh,
            vocab_dim,
            output_dim=model_dim,
            variable_dtype=self.variable_dtype,
            name='embedding',
            ensemble_dim=None,
            clusters=[{
                'token_count': 2,
                'embedding_size': 2
            }, {
                'token_count': 3,
                'embedding_size': 1
            }])

        mtf_embedding = vocab_embedding.ids_to_embedding(mtf_ids, context=None)

        mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(shape=[],
                                                              layout={},
                                                              devices=[''])
        lowering = mtf.Lowering(self.graph, {self.mesh: mesh_impl})
        actual_embedding = lowering.export_to_tf_tensor(mtf_embedding)

        self.evaluate(tf.global_variables_initializer())
        self.evaluate(lowering.copy_masters_to_slices())
        actual, = self.evaluate([actual_embedding])

        self.assertAllClose(actual,
                            [[0, 1], [2, 0], [1, 2], [2, 4], [3, 6], [0, 1]])
Example #2
  def test_constructor_projectFactorNotWithinZeroAndOne_raisesValueError(self):
    vocab_dim = mtf.Dimension('vocab', 3)
    model_dim = mtf.Dimension('model', 2)

    with self.assertRaises(ValueError):
      adaptive_softmax.AdaptiveSoftmaxVocabEmbedding(
          self.mesh,
          vocab_dim,
          output_dim=model_dim,
          variable_dtype=self.variable_dtype,
          name='embedding',
          ensemble_dim=None,
          clusters=[{
              'token_count': 3,
              'embedding_size': 2,
              'length_projection_factor': 1.1,
          }])
Example #3
    def test_constructor_tokenCountsDontSumToVocabSize_raisesValueError(self):
        vocab_dim = mtf.Dimension('vocab', 5)
        model_dim = mtf.Dimension('model', 2)

        with self.assertRaises(ValueError):
            adaptive_softmax.AdaptiveSoftmaxVocabEmbedding(
                self.mesh,
                vocab_dim,
                output_dim=model_dim,
                variable_dtype=self.variable_dtype,
                name='embedding',
                ensemble_dim=None,
                clusters=[{
                    'token_count': 3,
                    'embedding_size': 2
                }, {
                    'token_count': 3,
                    'embedding_size': 1
                }])
Example #4
    def test_hidden_to_logits_returnsHiddenDuringTraining(self):
        # Arrange.
        seq_len = 2
        vocab_size = 3
        model_size = 2

        vocab_dim = mtf.Dimension('vocab', vocab_size)
        model_dim = mtf.Dimension('model', model_size)
        length_dim = mtf.Dimension('length', seq_len)

        context = mock.MagicMock()
        context.activation_dtype = tf.float32
        context.mode = tf.estimator.ModeKeys.TRAIN

        embeddings = tf.constant([[1, 0], [0, 2]], dtype=tf.float32)
        mtf_embeddings = mtf.import_tf_tensor(self.mesh,
                                              embeddings,
                                              shape=mtf.Shape(
                                                  [length_dim, model_dim]))

        self.initializer_mock.side_effect = initialize_by_shape({
            (3, 2): [[1, 6], [2, 7], [3, 8]],
        })

        vocab_embedding = adaptive_softmax.AdaptiveSoftmaxVocabEmbedding(
            self.mesh,
            vocab_dim,
            output_dim=model_dim,
            variable_dtype=self.variable_dtype,
            name='embedding',
            ensemble_dim=None,
            clusters=[{
                'token_count': 3,
                'embedding_size': 2
            }])
        mtf_logits = vocab_embedding.hidden_to_logits(mtf_embeddings,
                                                      context=context)

        self.assertEqual(mtf_logits, mtf_embeddings)
Example #5
    def test_adaptive_softmax_loss_fn_tailClustersAllProject_correctlyComputesTheLoss(
            self):
        # Arrange.
        seq_len = 16
        vocab_size = 8
        model_size = 2

        vocab_dim = mtf.Dimension('vocab', vocab_size)
        model_dim = mtf.Dimension('model', model_size)
        length_dim = mtf.Dimension('length', seq_len)

        decoder = mock.MagicMock()
        decoder.z_loss = 0.0
        decoder.loss_denominator = mock.MagicMock()
        decoder.loss_denominator.return_value = 1.0

        # 7 tokens in head cluster
        # 5 tokens in tail cluster 1
        # 4 tokens in tail cluster 2
        targets_array = [2, 4, 4, 6, 2, 5, 7, 5, 2, 1, 6, 7, 0, 0, 3, 2]
        targets = tf.constant(targets_array, dtype=tf.int32)
        hidden = tf.constant(
            [[0, -10], [1, -11], [2, -12], [3, -13], [4, -14], [5, -15],
             [6, -16], [7, -17], [8, -18], [9, -19], [10, -20], [11, -21],
             [12, -22], [13, -23], [14, -24], [15, -25]],
            dtype=tf.float32)

        mtf_targets = mtf.import_tf_tensor(self.mesh,
                                           targets,
                                           shape=mtf.Shape([length_dim]))
        mtf_hidden = mtf.import_tf_tensor(self.mesh,
                                          hidden,
                                          shape=mtf.Shape(
                                              [length_dim, model_dim]))

        self.initializer_mock.side_effect = initialize_by_shape({
            (5, 2): [[1, 6], [2, 7], [3, 8], [4, 9], [5, 10]],
            (3, 2): [[11, 14], [12, 15], [13, 16]],
            (2, 1): [[17], [18]],
            (1, 2): [[19, 20]],
        })

        vocab_embedding = adaptive_softmax.AdaptiveSoftmaxVocabEmbedding(
            self.mesh,
            vocab_dim,
            output_dim=model_dim,
            variable_dtype=self.variable_dtype,
            name='embedding',
            ensemble_dim=None,
            clusters=[{
                'token_count': 3,
                'embedding_size': 2
            }, {
                'token_count': 3,
                'embedding_size': 2,
                'length_projection_factor': 0.5,
            }, {
                'token_count': 2,
                'embedding_size': 1,
                'length_projection_factor': 0.125,
            }])

        context = mock.MagicMock()
        context.activation_dtype = tf.float32
        context.shared_params = {'embedding': vocab_embedding}

        # Act.
        mtf_loss = adaptive_softmax.adaptive_softmax_loss_fn(
            decoder, context, mtf_hidden, mtf_targets, output_vocab_dim=None)
        lowering, loss = self._export_to_tf_tensor(mtf_loss)

        self.evaluate(tf.global_variables_initializer())
        self.evaluate(lowering.copy_masters_to_slices())
        actual_loss, = self.evaluate([loss])

        # Assert.
        def expected_head_loss(position, label):
            factor = model_dim.size**-0.5
            logits = [
                factor * (1 * position - 6 * (10 + position)),
                factor * (2 * position - 7 * (10 + position)),
                factor * (3 * position - 8 * (10 + position)),
                factor * (4 * position - 9 * (10 + position)),
                factor * (5 * position - 10 * (10 + position)),
            ]
            return _softmax_cross_entropy_with_logits(logits, label)

        expected_head_labels = [2, 3, 3, 4, 2, 3, 4, 3, 2, 1, 4, 4, 0, 0, 3, 2]
        expected_head_loss = sum(
            expected_head_loss(position, expected_label)
            for position, expected_label in enumerate(expected_head_labels)
            if expected_label)

        def expected_tail_cluster_1_loss(position):
            factor = model_dim.size**-0.5
            logits = [
                factor * (11 * position - 14 * (10 + position)),
                factor * (12 * position - 15 * (10 + position)),
                factor * (13 * position - 16 * (10 + position)),
            ]
            first_token_in_cluster_id = 3
            return _softmax_cross_entropy_with_logits(
                logits, targets_array[position] - first_token_in_cluster_id)

        expected_tail_cluster_1_loss = sum([
            expected_tail_cluster_1_loss(position=1),
            expected_tail_cluster_1_loss(position=2),
            expected_tail_cluster_1_loss(position=5),
            expected_tail_cluster_1_loss(position=7),
            expected_tail_cluster_1_loss(position=14),
        ])

        def expected_tail_cluster_2_loss(position):
            factor = model_dim.size**-0.5
            logits = [
                factor * (17 * 19 * position - 17 * 20 * (10 + position)),
                factor * (18 * 19 * position - 18 * 20 * (10 + position)),
            ]
            first_token_in_cluster_id = 6
            return _softmax_cross_entropy_with_logits(
                logits, targets_array[position] - first_token_in_cluster_id)

        # Due to the length_projection_factor of 1/8, only 2 tokens will be counted
        # despite there being 4 tokens in this cluster.
        expected_tail_cluster_2_loss = sum([
            expected_tail_cluster_2_loss(position=3),
            expected_tail_cluster_2_loss(position=6),
        ])

        expected_loss = (expected_head_loss + expected_tail_cluster_1_loss +
                         expected_tail_cluster_2_loss)

        self.assertAllClose(actual_loss, expected_loss)
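The loss and logit checks in Examples 5 and 6 also rely on _softmax_cross_entropy_with_logits, _log_softmax, and a self._export_to_tf_tensor method that are not part of this listing. A plausible sketch of those helpers, assuming the export step behaves like the inline lowering in Example 1:

import mesh_tensorflow as mtf
import numpy as np


def _log_softmax(logits):
    # Numerically stable log-softmax over a 1-D array.
    logits = np.asarray(logits, dtype=float)
    shifted = logits - np.max(logits)
    return shifted - np.log(np.sum(np.exp(shifted)))


def _softmax_cross_entropy_with_logits(logits, target):
    # Cross entropy between `logits` and a one-hot label at index `target`.
    return -_log_softmax(logits)[target]


def export_to_tf_tensor(graph, mesh, mtf_tensor):
    # Stand-in for the test's self._export_to_tf_tensor: lowers the Mesh
    # TensorFlow graph on a single-device placement mesh and returns the
    # lowering together with the exported tf.Tensor (mirrors Example 1).
    mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
        shape=[], layout={}, devices=[''])
    lowering = mtf.Lowering(graph, {mesh: mesh_impl})
    return lowering, lowering.export_to_tf_tensor(mtf_tensor)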
Example #6
    def test_hidden_to_logits_returnsCorrectLogitsDuringEval(self):
        # Arrange.
        seq_len = 2
        vocab_size = 8
        model_size = 2

        vocab_dim = mtf.Dimension('vocab', vocab_size)
        model_dim = mtf.Dimension('model', model_size)
        length_dim = mtf.Dimension('length', seq_len)

        context = mock.MagicMock()
        context.activation_dtype = tf.float32
        context.mode = tf.estimator.ModeKeys.EVAL

        embeddings = tf.constant([[1, 0], [0, 2]], dtype=tf.float32)
        mtf_embeddings = mtf.import_tf_tensor(self.mesh,
                                              embeddings,
                                              shape=mtf.Shape(
                                                  [length_dim, model_dim]))

        self.initializer_mock.side_effect = initialize_by_shape({
            (5, 2): [[1, 6], [2, 7], [3, 8], [4, 9], [5, 10]],
            (3, 2): [[11, 14], [12, 15], [13, 16]],
            (2, 1): [[17], [18]],
            (1, 2): [[19, 20]],
        })

        vocab_embedding = adaptive_softmax.AdaptiveSoftmaxVocabEmbedding(
            self.mesh,
            vocab_dim,
            output_dim=model_dim,
            variable_dtype=self.variable_dtype,
            name='embedding',
            ensemble_dim=None,
            clusters=[{
                'token_count': 3,
                'embedding_size': 2
            }, {
                'token_count': 3,
                'embedding_size': 2,
                'length_projection_factor': 0.5,
            }, {
                'token_count': 2,
                'embedding_size': 1,
                'length_projection_factor': 0.125,
            }])

        # Act.
        mtf_logits = vocab_embedding.hidden_to_logits(mtf_embeddings,
                                                      context=context)
        lowering, logits = self._export_to_tf_tensor(mtf_logits)

        self.evaluate(tf.global_variables_initializer())
        self.evaluate(lowering.copy_masters_to_slices())
        actual_logits, = self.evaluate([logits])

        # Assert.
        def scaled_log_softmax(a):
            a = np.array(a, dtype=float) * model_dim.size**-0.5
            return _log_softmax(a)

        head_log_softmax1 = scaled_log_softmax([1, 2, 3, 4, 5])
        head_log_softmax2 = scaled_log_softmax(
            [2 * 6, 2 * 7, 2 * 8, 2 * 9, 2 * 10])

        expected_logits = [
            np.concatenate([
                head_log_softmax1[:3],
                head_log_softmax1[3] + scaled_log_softmax([11, 12, 13]),
                head_log_softmax1[4] + scaled_log_softmax([17 * 19, 18 * 19]),
            ]),
            np.concatenate([
                head_log_softmax2[:3],
                head_log_softmax2[3] +
                scaled_log_softmax([2 * 14, 2 * 15, 2 * 16]),
                head_log_softmax2[4] +
                scaled_log_softmax([2 * 17 * 20, 2 * 18 * 20]),
            ]),
        ]

        self.assertAllClose(actual_logits, expected_logits, atol=5e-5)
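The expected logits above follow the adaptive-softmax factorization: for a tail token, the log-probability is the head's log-probability of routing to that token's cluster plus the within-cluster log-softmax. A small standalone NumPy check of the first row, reusing the same initializer values assumed by the test:

import numpy as np


def log_softmax(a):
    a = np.asarray(a, dtype=float)
    return a - np.log(np.sum(np.exp(a)))


factor = 2 ** -0.5               # model_dim.size ** -0.5
hidden = np.array([1.0, 0.0])    # first row of `embeddings`

head_table = np.array([[1, 6], [2, 7], [3, 8], [4, 9], [5, 10]], dtype=float)
tail1_table = np.array([[11, 14], [12, 15], [13, 16]], dtype=float)
tail2_table = np.array([[17.], [18.]]) @ np.array([[19., 20.]])  # (2, 1) x (1, 2)

head = log_softmax(factor * head_table @ hidden)
tail1 = log_softmax(factor * tail1_table @ hidden)
tail2 = log_softmax(factor * tail2_table @ hidden)

# Head entries 0-2 are real tokens; entries 3 and 4 route to the tail clusters.
first_row = np.concatenate([head[:3], head[3] + tail1, head[4] + tail2])
print(first_row)  # should match expected_logits[0]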