Code Example #1
    def testGPipeTransformerStackTrainTransparentFPropWithEmbeddings(
            self, splits=1, num_micro_batches=1):
        # time = 2,
        batch = 4
        with self.session() as sess:
            params = _TransformerParamsWithEmbeddings(
                splits=splits,
                num_micro_batches=num_micro_batches,
                num_decoder_layers=3,
                num_encoder_layers=1)
            params.is_transparent = True
            params.transparent_merger_dropout_prob = 0.0
            xformer = GPipeTransformerStack(params)

            input_ids, id_paddings, tgt_inputs, tgt_paddings, _, _ = _TransformerRandomInputsIds(
                batch=batch)
            inputs, paddings, _, _ = _TransformerRandomInputsVecs(batch=batch)
            tf.random.set_seed(1234)
            tf.global_variables_initializer().run()
            enc_outputs = xformer.EncoderFPropDefaultTheta(inputs, paddings)
            dec_output = xformer.FProp(xformer.theta, input_ids, id_paddings,
                                       tgt_inputs, tgt_paddings)[2]
            enc_out_1 = sess.run(enc_outputs)
            dec_out = sess.run(dec_output)
            self.assertAllClose(
                [[[0.017581, 0.802863, 0.975554, -1.164572]] * batch,
                 [[-0.549953, 1.196884, 4.910457, -0.102137]] * batch],
                enc_out_1)
            self.assertAllClose(
                [[[-1.122128, 1.111972, 4.642949, -2.14831]] * batch,
                 [[-1.336919, 1.182709, 4.785938, -2.039246]] * batch,
                 [[-1.335168, 1.297679, 4.720459, -2.111006]] * batch],
                dec_out)
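
All of the examples in this section are methods of a test class for GPipeTransformerStack and are shown without their imports. A minimal sketch of the harness they appear to assume follows; the module paths and base class are assumptions based on the lingvo repository layout (which changed across versions, as the mix of tf.random.set_seed and tf.set_random_seed in these snippets suggests):

import lingvo.compat as tf  # lingvo's TF compatibility shim (assumption)
from lingvo.core import py_utils
from lingvo.core import test_utils
from lingvo.core.layers_with_gpipe import GPipeTransformerStack


class GPipeTransformerStackTest(test_utils.TestCase):
    # ... the test methods shown in these examples ...
    pass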
Code Example #2
    def _testGPipeTransformerStackTrainTransparentFProp(
            self, splits=1, num_micro_batches=1):
        # time = 2,
        batch = 4
        with self.session() as sess:
            params = self._TransformerParams(
                splits=splits,
                num_micro_batches=num_micro_batches,
                num_decoder_layers=3,
                num_encoder_layers=1)
            params.is_transparent = True
            params.num_transparent_outputs = 3
            params.transparent_merger_dropout_prob = 0.0
            xformer = GPipeTransformerStack(params)

            inputs, paddings, tgt_inputs, tgt_paddings = self._random_inputs(
                batch=batch)
            py_utils.GetOrCreateGlobalStep()
            tf.set_random_seed(1234)
            tf.global_variables_initializer().run()
            enc_outputs = xformer.EncoderFPropDefaultTheta(inputs, paddings)
            dec_output = xformer.FProp(xformer.theta, inputs, paddings,
                                       tgt_inputs, tgt_paddings)
            enc_out_1, enc_out_2, enc_out_3 = sess.run(enc_outputs)
            dec_out = sess.run(dec_output)
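            # With transparent_merger_dropout_prob=0.0 and identically
            # initialized merger weights, all three transparent encoder
            # outputs are expected to match at initialization.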
            self.assertAllClose(enc_out_1, enc_out_2)
            self.assertAllClose(enc_out_2, enc_out_3)
            self.assertAllClose(enc_out_1,
                                [[[-0.27896273, 1.46589136]] * batch,
                                 [[1.03141928, -0.847896]] * batch])
            self.assertAllClose(dec_out, [[[2.926736, -4.090812]] * batch,
                                          [[-1.69508219, 1.75891459]] * batch,
                                          [[-1.6950829, 1.75891507]] * batch])
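
The leading underscore marks this as a helper rather than a test case in its own right: concrete test methods call it with different splits and num_micro_batches values so the same expected numerics are verified across pipeline configurations. A hypothetical driver (the method name and argument values here are illustrative, not taken from the source):

    def testTransparentFPropTwoSplits(self):
        # Splitting the stack into two pipeline stages with two
        # micro-batches must reproduce the unsplit numerics above.
        self._testGPipeTransformerStackTrainTransparentFProp(
            splits=2, num_micro_batches=2)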
Code Example #3
    def _testGPipeTransformerDecoderStackFProp(self,
                                               splits=1,
                                               num_micro_batches=1):
        batch = 4
        tf.flags.FLAGS.tpu_compatible = True
        with self.session() as sess:
            params = self._TransformerParams(
                num_decoder_layers=4,
                num_encoder_layers=0,
                splits=splits,
                num_micro_batches=num_micro_batches)
            params.dtype = tf.float32
            params.fprop_dtype = tf.float32
            xformer = GPipeTransformerStack(params)

            inputs, paddings, tgt_inputs, tgt_paddings = self._random_inputs(
                batch)

            output = xformer.FProp(xformer.theta, inputs, paddings, tgt_inputs,
                                   tgt_paddings)

            tf.global_variables_initializer().run()
            output_val = sess.run(output)
            self.assertAllCloseAccordingToType(
                [[[1.03550637, -1.3199079]] * batch,
                 [[-3.36382699, -0.74492991]] * batch,
                 [[-3.36382723, -0.74492997]] * batch], output_val)
Code Example #4
    def _testGPipeTransformerStackTrainEncoderTransparentFProp(
            self, splits=1, num_micro_batches=1):
        # time = 2,
        batch = 4
        with self.session() as sess:
            params = self._TransformerParams(
                splits=splits,
                num_micro_batches=num_micro_batches,
                num_decoder_layers=2,
                num_encoder_layers=2)
            params.is_transparent = True
            params.num_transparent_outputs = 1
            params.transparent_merger_dropout_prob = 0.0
            xformer = GPipeTransformerStack(params)

            inputs, paddings, tgt_inputs, tgt_paddings = self._random_inputs(
                batch=batch)
            py_utils.GetOrCreateGlobalStep()
            tf.set_random_seed(1234)
            tf.global_variables_initializer().run()
            enc_output = xformer.EncoderFPropDefaultTheta(inputs, paddings)
            dec_output = xformer.FProp(xformer.theta, inputs, paddings,
                                       tgt_inputs, tgt_paddings)
            enc_out = sess.run(enc_output)
            dec_out = sess.run(dec_output)
            self.assertAllClose(enc_out, [[[-0.118476, 1.031626]] * batch,
                                          [[0.643884, -1.02581167]] * batch])
            self.assertAllClose(dec_out, [[[-2.8764534, 1.00808454]] * batch,
                                          [[1.02129495, -0.78406084]] * batch,
                                          [[1.02129495, -0.78406084]] * batch])
Code Example #5
    def testGPipeTransformerStackTrainTransparentFPropWithEmbeddings(
            self, splits=1, num_micro_batches=1):
        # time = 2,
        batch = 4
        with self.session() as sess:
            params = _TransformerParamsWithEmbeddings(
                splits=splits,
                num_micro_batches=num_micro_batches,
                num_decoder_layers=3,
                num_encoder_layers=1)
            params.is_transparent = True
            params.transparent_merger_dropout_prob = 0.0
            xformer = GPipeTransformerStack(params)

            input_ids, id_paddings, tgt_inputs, tgt_paddings, _, _ = _TransformerRandomInputsIds(
                batch=batch)
            inputs, paddings, _, _ = _TransformerRandomInputsVecs(batch=batch)
            tf.set_random_seed(1234)
            tf.global_variables_initializer().run()
            enc_outputs = xformer.EncoderFPropDefaultTheta(inputs, paddings)
            dec_output = xformer.FProp(xformer.theta, input_ids, id_paddings,
                                       tgt_inputs, tgt_paddings)[2]
            enc_out_1 = sess.run(enc_outputs)
            dec_out = sess.run(dec_output)
            self.assertAllClose(
                [[[0.68660116, 0.947429, 0.78953624, -1.20142817]] * batch,
                 [[0.57919669, 1.12979364, 4.29336643, 0.45106331]] * batch],
                enc_out_1)
            self.assertAllClose(
                [[[-0.46651918, -1.62957835, 1.15657926, 1.08397353]] * batch,
                 [[-0.34674695, -1.65999401, 1.08431196, 1.07384491]] * batch,
                 [[-0.41073492, -1.60431314, 1.04607999, 1.08858371]] * batch],
                dec_out)
Code Example #6
    def _testGPipeTransformerEncoderFPropDefaultTheta(self,
                                                      splits=1,
                                                      num_micro_batches=1):
        batch = 4
        tf.flags.FLAGS.tpu_compatible = True
        with self.session() as sess:
            params = self._TransformerParams(
                num_decoder_layers=4,
                num_encoder_layers=4,
                splits=splits,
                num_micro_batches=num_micro_batches)
            params.dtype = tf.float32
            params.fprop_dtype = tf.float32
            xformer = GPipeTransformerStack(params)

            inputs, paddings, _, _ = self._random_inputs(batch)

            output = xformer.EncoderFPropDefaultTheta(inputs, paddings)

            tf.global_variables_initializer().run()
            output = sess.run(output)

            self.assertAllCloseAccordingToType(
                [[[0.21085747, 0.60925347]] * batch,
                 [[0.21085747, 0.60925347]] * batch], output)
Code Example #7
  def testGPipeTransformerDecoderStackFPropWithEmbeddings(
      self, splits=1, num_micro_batches=1):
    batch = 4
    tf.flags.FLAGS.tpu_compatible = True
    with self.session() as sess:
      params = self._TransformerParamsWithEmbeddings(
          num_decoder_layers=4,
          num_encoder_layers=0,
          splits=splits,
          num_micro_batches=num_micro_batches)
      params.dtype = tf.float32
      xformer = GPipeTransformerStack(params)

      inputs, paddings, tgt_inputs, tgt_paddings = self._random_inputs_ids(
          batch)

      output = xformer.FProp(xformer.theta, inputs, paddings, tgt_inputs,
                             tgt_paddings)

      tf.global_variables_initializer().run()
      output_val = sess.run(output)
      self.assertAllCloseAccordingToType(
          [[[-2.29650807, 0.25992393, 1.81951356, 1.52897644]] * batch,
           [[-2.14101386, 0.32607365, 1.73413348, 1.51806736]] * batch,
           [[-2.18863297, 0.34420109, 1.65913653, 1.58703828]] * batch],
          output_val)
Code Example #8
  def testGPipeTransformerMtModel(self, splits=1, num_micro_batches=1):
    batch = 4
    tf.flags.FLAGS.tpu_compatible = True
    with self.session() as sess:
      with tf.variable_scope('transformer_test', reuse=tf.AUTO_REUSE):
        params = self._TransformerParamsWithEmbeddings(
            splits=splits,
            num_micro_batches=num_micro_batches,
            num_decoder_layers=2,
            has_softmax=True)
        params.state_dtype = tf.float32
      xformer = GPipeTransformerStack(params)

      input_ids, id_paddings, tgt_inputs, tgt_paddings = (
          self._random_inputs_ids(batch=batch))
      # Class ids for the softmax head must be integer-valued.
      labels = tf.ones([3, batch], dtype=tf.int32)
      label_weights = tf.ones([3, batch])
      tf.set_random_seed(1234)
      tf.global_variables_initializer().run()
      xent, logits = xformer.FProp(xformer.theta, input_ids, id_paddings,
                                   tgt_inputs, tgt_paddings, None, None, labels,
                                   label_weights)
      xent_out, logits_out = sess.run([xent, logits])
      print('xent_out={}'.format(xent_out))
      print('logits_out={}'.format(logits_out))
Code Example #9
  def _TransformerParamsWithEmbeddings(self,
                                       num_decoder_layers=0,
                                       num_encoder_layers=4,
                                       splits=1,
                                       num_micro_batches=1):
    model_dim = 4
    params = GPipeTransformerStack.Params()
    params.name = 'transformer'
    params.model_dim = model_dim
    params.num_decoder_layers = num_decoder_layers
    params.decoder_tpl.tr_atten_tpl.num_attention_heads = 1
    params.decoder_tpl.tr_fflayer_tpl.hidden_dim = model_dim
    params.num_encoder_layers = num_encoder_layers
    params.encoder_tpl.tr_atten_tpl.num_attention_heads = 1
    params.encoder_tpl.tr_fflayer_tpl.hidden_dim = model_dim
    params.num_micro_batches = num_micro_batches
    params.use_pipelined_embeddings = True
    params.state_dtype = tf.float32

    emb_params = params.emb_tpl
    # Default config for the token embedding.
    emb_params.token_emb.use_matmul = True
    emb_params.token_emb.use_3d_weight_tensor = False
    emb_params.token_emb.vocab_size = 10
    emb_params.token_emb.embedding_dim = model_dim

    # Default config for the position embedding.
    emb_params.position_emb.embedding_dim = model_dim
    emb_params.position_emb.trainable_scaling = False
    params.splits = splits
    params.random_seed = 0
    return params
Code Example #10
    def testGPipeTransformerStackTrainEncoderTransparentFPropEval(self):
        # time = 2,
        batch = 4
        with self.session() as sess:
            params = self._TransformerParams(num_decoder_layers=3,
                                             num_encoder_layers=3)
            params.is_transparent = True
            params.num_transparent_outputs = 1
            params.is_eval = True

            xformer = GPipeTransformerStack(params)

            inputs, paddings, _, _ = self._random_inputs(batch=batch)

            tf.global_variables_initializer().run()
            enc_outputs = xformer.EncoderFPropDefaultTheta(inputs, paddings)
            enc_out = sess.run(enc_outputs)
            self.assertAllClose(enc_out, [[[0.18823329, 0.71548849]] * batch,
                                          [[0.76032472, -0.82791042]] * batch])
Code Example #11
    def testGPipeTransformerStackTrainTransparentFPropEval(self):
        # time = 2,
        batch = 4
        with self.session() as sess:
            params = self._TransformerParams(num_decoder_layers=3,
                                             num_encoder_layers=1)
            params.is_transparent = True
            params.is_eval = True

            xformer = GPipeTransformerStack(params)

            inputs, paddings, _, _ = self._random_inputs(batch=batch)

            tf.global_variables_initializer().run()
            enc_outputs = xformer.EncoderFPropDefaultTheta(inputs, paddings)
            enc_out = sess.run(enc_outputs)
            self.assertAllClose(
                enc_out, [[[[-0.27896273] * 3, [1.46589136] * 3]] * batch,
                          [[[1.03141928] * 3, [-0.847896] * 3]] * batch])
Code Example #12
  def _testGPipeTransformerFPropPackedInput(self, splits=1):
    batch = 4
    tf.flags.FLAGS.tpu_compatible = True
    with self.session() as sess:
      with tf.variable_scope('transformer_test', reuse=tf.AUTO_REUSE):
        params = self._TransformerParams(splits=splits)
        params.dtype = tf.float32
        params.fprop_dtype = tf.float32
        packed_params = params.Copy()
        packed_params.packed_input = True
        xformer = GPipeTransformerStack(params)
        packed_xformer = GPipeTransformerStack(packed_params)
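        # The packed variant folds all sequences onto the time axis (batch 1)
        # and relies on segment ids to keep the four original sequences from
        # attending to one another; reshaped back, its output must match the
        # unpacked stack exactly.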
        # Prepare inputs
        inputs, paddings, tgt_inputs, tgt_paddings = self._random_inputs(batch)
        packed_inputs = tf.reshape(inputs, [-1, 1, 2])
        packed_tgt_inputs = tf.reshape(tgt_inputs, [-1, 1, 2])
        packed_paddings = tf.reshape(paddings, [-1, 1])
        packed_tg_paddings = tf.reshape(tgt_paddings, [-1, 1])
        segment_ids = tf.transpose(
            tf.constant([[0, 1, 2, 3, 0, 1, 2, 3]], dtype=tf.float32))
        tgt_segment_id = tf.transpose(
            tf.constant([[0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]],
                        dtype=tf.float32))

        output = xformer.FProp(xformer.theta, inputs, paddings, tgt_inputs,
                               tgt_paddings)
        packed_output = packed_xformer.FProp(
            packed_xformer.theta, packed_inputs, packed_paddings,
            packed_tgt_inputs, packed_tg_paddings, segment_ids, tgt_segment_id)
        packed_output = tf.reshape(packed_output, output.shape)

        tf.global_variables_initializer().run()
        output, packed_output = sess.run([output, packed_output])
        self.assertAllClose(output, packed_output)
Code Example #13
  def testGPipeTransformerStackFPropWithEmbeddings(self,
                                                   splits=1,
                                                   num_micro_batches=1):
    batch = 4
    tf.flags.FLAGS.tpu_compatible = True
    with self.session() as sess:
      params = self._TransformerParamsWithEmbeddings(
          splits=splits, num_micro_batches=num_micro_batches)
      params.dtype = tf.float32
      params.fprop_dtype = tf.float32
      xformer = GPipeTransformerStack(params)

      inputs, paddings, _, _ = self._random_inputs_ids(batch)

      output = xformer.FProp(xformer.theta, inputs, paddings)

      tf.global_variables_initializer().run()
      output = sess.run(output)

      self.assertAllCloseAccordingToType(
          [[[-1.67121327, -1.24759686, 1.41572773, 2.42515182]] * batch,
           [[-1.71240354, -1.1253252, 0.23407015, 3.40547156]] * batch], output)
Code Example #14
def _TransformerParamsWithEmbeddings(num_decoder_layers=0,
                                     num_encoder_layers=4,
                                     splits=1,
                                     num_micro_batches=1,
                                     has_softmax=False,
                                     use_task_ids=False):
    model_dim = 4
    params = GPipeTransformerStack.Params()
    params.name = 'transformer'
    params.model_dim = model_dim
    params.num_decoder_layers = num_decoder_layers
    params.decoder_tpl.source_dim = model_dim
    params.decoder_tpl.tr_atten_tpl.num_attention_heads = 1
    params.decoder_tpl.tr_fflayer_tpl.hidden_dim = model_dim
    params.num_encoder_layers = num_encoder_layers
    params.encoder_tpl.source_dim = model_dim
    params.encoder_tpl.tr_atten_tpl.num_attention_heads = 1
    params.encoder_tpl.tr_fflayer_tpl.hidden_dim = model_dim
    params.num_micro_batches = num_micro_batches
    params.state_dtype = tf.float32
    if has_softmax:
        params.softmax_tpl.input_dim = model_dim
        params.softmax_tpl.num_classes = 2
    else:
        params.softmax_tpl = None

    emb_params = params.emb_tpl
    # Default config for the token embedding.
    emb_params.token_emb.use_matmul = True
    emb_params.token_emb.use_3d_weight_tensor = False
    emb_params.token_emb.vocab_size = 10
    emb_params.token_emb.embedding_dim = model_dim

    # Default config for the position embedding.
    emb_params.position_emb.embedding_dim = model_dim
    emb_params.position_emb.trainable_scaling = False

    # Task embeddings.
    if use_task_ids:
        emb_params.enc_task_emb = emb_params.token_emb.Copy()
        emb_params.dec_task_emb = emb_params.token_emb.Copy()
    params.splits = splits
    params.random_seed = 0
    return params
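
For reference, a call to this module-level helper in the spirit of Code Example #8 might look as follows; the exact call site is not among these excerpts, so the argument values are illustrative:

# Illustrative: a 2-decoder-layer stack with a softmax head attached.
params = _TransformerParamsWithEmbeddings(
    num_decoder_layers=2, has_softmax=True)
xformer = GPipeTransformerStack(params)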
Code Example #15
  def _TransformerParams(self,
                         num_decoder_layers=0,
                         num_encoder_layers=4,
                         splits=1,
                         num_micro_batches=1):
    model_dim = 2
    params = GPipeTransformerStack.Params()
    params.name = 'transformer'
    params.model_dim = model_dim
    params.num_decoder_layers = num_decoder_layers
    params.decoder_tpl.tr_atten_tpl.num_attention_heads = 1
    params.decoder_tpl.tr_fflayer_tpl.hidden_dim = model_dim
    params.num_encoder_layers = num_encoder_layers
    params.encoder_tpl.tr_atten_tpl.num_attention_heads = 1
    params.encoder_tpl.tr_fflayer_tpl.hidden_dim = model_dim
    params.num_micro_batches = num_micro_batches
    params.splits = splits
    params.random_seed = 0
    return params
Code Example #16
    def testGPipeTransformerFPropPackedInputWithEmbeddings(self, splits=1):
        batch = 4
        tf.flags.FLAGS.tpu_compatible = True
        with self.session():
            with tf.variable_scope('transformer_test', reuse=tf.AUTO_REUSE):
                params = _TransformerParamsWithEmbeddings(splits=splits,
                                                          num_decoder_layers=2)
                params.dtype = tf.float32
                params.fprop_dtype = tf.float32
                packed_params = params.Copy()
                packed_params.packed_input = True
                xformer = GPipeTransformerStack(params)
                packed_xformer = GPipeTransformerStack(packed_params)
                # Prepare inputs
                inputs, paddings, tgt_inputs, tgt_paddings, _, _ = _TransformerRandomInputsIds(
                    batch)
                packed_inputs = tf.reshape(inputs, [-1, 1])
                packed_tgt_inputs = tf.reshape(tgt_inputs, [-1, 1])
                packed_paddings = tf.reshape(paddings, [-1, 1])
                packed_tg_paddings = tf.reshape(tgt_paddings, [-1, 1])
                segment_ids = tf.transpose(
                    tf.constant([[0, 1, 2, 3, 0, 1, 2, 3]], dtype=tf.float32))
                tgt_segment_id = tf.transpose(
                    tf.constant([[0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]],
                                dtype=tf.float32))
                segment_pos_id = tf.transpose(
                    tf.constant([[0, 0, 0, 0, 1, 1, 1, 1]], dtype=tf.int32))
                tgt_segment_pos_id = tf.transpose(
                    tf.constant([[0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]],
                                dtype=tf.int32))

                output = xformer.FProp(xformer.theta, inputs, paddings,
                                       tgt_inputs, tgt_paddings)[2]
                packed_output = packed_xformer.FProp(
                    packed_xformer.theta, packed_inputs, packed_paddings,
                    packed_tgt_inputs, packed_tg_paddings, segment_ids,
                    tgt_segment_id, None, None, segment_pos_id,
                    tgt_segment_pos_id)[2]
                packed_output = tf.reshape(packed_output, output.shape)

                self.evaluate(tf.global_variables_initializer())
                output, packed_output = self.evaluate([output, packed_output])
                self.assertAllClose(output,
                                    packed_output,
                                    rtol=1e-05,
                                    atol=1e-05)