Example #1
    def test_output_layer(self):
        decoder = TransformerDecoder(vocab_size=self._vocab_size,
                                     output_layer=None)
        self.assertIsInstance(decoder, TransformerDecoder)

        decoder = TransformerDecoder(output_layer=texar.core.identity)
        self.assertIsInstance(decoder, TransformerDecoder)

        tensor = torch.rand(self._vocab_size, self._emb_dim, dtype=torch.float)
        decoder = TransformerDecoder(output_layer=tensor)
        self.assertIsInstance(decoder, TransformerDecoder)
        self.assertEqual(decoder.vocab_size, self._vocab_size)
Example #2
    def test_output_layer(self):
        decoder = TransformerDecoder(vocab_size=self._vocab_size,
                                     output_layer=None)
        self.assertIsInstance(decoder, TransformerDecoder)

        decoder = TransformerDecoder(output_layer=tf.identity)
        self.assertIsInstance(decoder, TransformerDecoder)

        tensor = tf.random_uniform(
            [self._emb_dim, self._vocab_size], maxval=1, dtype=tf.float32
        )
        decoder = TransformerDecoder(output_layer=tensor)
        self.assertIsInstance(decoder, TransformerDecoder)
        self.assertEqual(decoder.vocab_size, self._vocab_size)
Example #3
    def test_beam_search(self):
        """Tests beam_search
        """
        decoder = TransformerDecoder(
            vocab_size=self._vocab_size,
            output_layer=self._output_layer
        )

        outputs = decoder(
            memory=self._memory,
            memory_sequence_length=self._memory_sequence_length,
            memory_attention_bias=None,
            inputs=None,
            embedding=self._embedding_fn,
            beam_width=5,
            start_tokens=self._start_tokens,
            end_token=self._end_token,
            max_decoding_length=self._max_decode_len,
            mode=tf.estimator.ModeKeys.PREDICT
        )

        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            outputs_ = sess.run(outputs)
            self.assertEqual(outputs_['log_prob'].shape,
                             (self._batch_size, 5))
            self.assertEqual(outputs_['sample_id'].shape,
                             (self._batch_size, self._max_decode_len, 5))
Example #4
    def test_infer_greedy_with_context_without_memory(self):
        """Tests infer_greedy with context and without memory
        """
        decoder = TransformerDecoder(
            vocab_size=self._vocab_size,
            output_layer=self._output_layer
        )
        helper = tx_helper.GreedyEmbeddingHelper(
            self._embedding_fn, self._start_tokens, self._end_token)

        outputs, length = decoder(
            memory=None,
            memory_sequence_length=None,
            memory_attention_bias=None,
            inputs=None,
            decoding_strategy='infer_greedy',
            helper=helper,
            context=self._context,
            context_sequence_length=self._context_length,
            end_token=self._end_token,
            max_decoding_length=self._max_decode_len,
            mode=tf.estimator.ModeKeys.PREDICT)
        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            outputs_ = sess.run(outputs)
            self.assertIsInstance(outputs_, TransformerDecoderOutput)
Example #5
    def __init__(self, config_model, config_data):
        ModuleBase.__init__(self)
        self.config_model = config_model
        self.config_data = config_data

        with open(config_data.vocab_file, "rb") as f:
            id2w = pickle.load(f)
        self.id2w = id2w
        self.vocab_size = len(id2w)
        self.pad_token_id, self.bos_token_id = (0, 1)
        self.eos_token_id, self.unk_token_id = (2, 3)

        self.word_embedder = WordEmbedder(vocab_size=self.vocab_size,
                                          hparams=config_model.emb)
        self.pos_embedder = SinusoidsPositionEmbedder(
            position_size=config_data.max_decoding_length,
            hparams=config_model.position_embedder_hparams,
        )

        self.encoder = TransformerEncoder(hparams=config_model.encoder)
        self.decoder = TransformerDecoder(
            vocab_size=self.vocab_size,
            output_layer=self.word_embedder.embedding,
            hparams=config_model.decoder,
        )

        self.smoothed_loss_func = LabelSmoothingLoss(
            label_confidence=self.config_model.loss_label_confidence,
            tgt_vocab_size=self.vocab_size,
            ignore_index=0,
        )
Example #6
    def test_greedy_embedding_helper(self):
        """Tests with tf.contrib.seq2seq.GreedyEmbeddingHelper
        """
        decoder = TransformerDecoder(vocab_size=self._vocab_size,
                                     output_layer=self._output_layer)
        decoder.eval()
        helper = decoder_helpers.GreedyEmbeddingHelper(self._embedding,
                                                       self._start_tokens,
                                                       self._end_token)
        outputs, length = decoder(
            memory=self._memory,
            memory_sequence_length=self._memory_sequence_length,
            memory_attention_bias=None,
            helper=helper,
            max_decoding_length=self._max_decode_len)

        self.assertIsInstance(outputs, TransformerDecoderOutput)
Example #7
    def test_beam_search(self):
        """Tests beam_search
        """
        decoder = TransformerDecoder(vocab_size=self._vocab_size,
                                     output_layer=self._output_layer)
        decoder.eval()
        outputs = decoder(memory=self._memory,
                          memory_sequence_length=self._memory_sequence_length,
                          memory_attention_bias=None,
                          inputs=None,
                          beam_width=5,
                          start_tokens=self._start_tokens,
                          end_token=self._end_token,
                          max_decoding_length=self._max_decode_len)

        self.assertEqual(outputs['log_prob'].shape, (self._batch_size, 5))
        self.assertEqual(outputs['sample_id'].shape,
                         (self._batch_size, self._max_decode_len, 5))
Example #8
    def test_infer_greedy_with_context_without_memory(self):
        """Tests train_greedy with context
        """
        decoder = TransformerDecoder(vocab_size=self._vocab_size,
                                     output_layer=self._output_layer)
        decoder.eval()
        outputs, length = decoder(memory=None,
                                  memory_sequence_length=None,
                                  memory_attention_bias=None,
                                  inputs=None,
                                  decoding_strategy='infer_greedy',
                                  context=self._context,
                                  context_sequence_length=self._context_length,
                                  end_token=self._end_token,
                                  embedding=self._embedding_fn,
                                  max_decoding_length=self._max_decode_len)

        self.assertIsInstance(outputs, TransformerDecoderOutput)
Example #9
    def test_decode_infer_sample(self):
        """Tests infer_sample
        """
        decoder = TransformerDecoder(vocab_size=self._vocab_size,
                                     output_layer=self._output_layer)
        decoder.eval()
        helper = decoder_helpers.SampleEmbeddingHelper(self._embedding_fn,
                                                       self._start_tokens,
                                                       self._end_token)

        outputs, length = decoder(
            memory=self._memory,
            memory_sequence_length=self._memory_sequence_length,
            memory_attention_bias=None,
            inputs=None,
            helper=helper,
            max_decoding_length=self._max_decode_len)

        self.assertIsInstance(outputs, TransformerDecoderOutput)
Example #10
    def test_decode_train(self):
        """Tests train_greedy
        """
        decoder = TransformerDecoder(vocab_size=self._vocab_size,
                                     output_layer=self._output_layer)
        decoder.train()
        # 6 blocks
        # -self multihead_attention: 4 dense without bias + 2 layer norm vars
        # -encdec multihead_attention: 4 dense without bias + 2 layer norm vars
        # -poswise_network: Dense with bias, Dense with bias + 2 layer norm vars
        # 2 layer norm vars
        outputs = decoder(memory=self._memory,
                          memory_sequence_length=self._memory_sequence_length,
                          memory_attention_bias=None,
                          inputs=self._inputs,
                          decoding_strategy='train_greedy')
        # print(decoder)
        # for name, _ in decoder.named_parameters():
        #     print(name)
        self.assertEqual(len(decoder.trainable_variables), 110)
        self.assertIsInstance(outputs, TransformerDecoderOutput)
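The comments above account for the 110 trainable variables asserted in the test. A minimal sketch of that arithmetic (assuming each layer norm contributes two variables, a scale and a bias):

per_block = (
    (4 + 2)    # self-attention: 4 dense weights (no bias) + 2 layer-norm vars
    + (4 + 2)  # encoder-decoder attention: 4 dense weights + 2 layer-norm vars
    + (4 + 2)  # position-wise FFN: two Dense layers with bias (4 vars) + 2 layer-norm vars
)
assert 6 * per_block + 2 == 110  # 6 blocks plus the final layer norm (2 vars)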
Example #11
    def test_beam_search(self):
        """Tests beam_search
        """
        decoder = TransformerDecoder(token_pos_embedder=self._embedding_fn,
                                     vocab_size=self._vocab_size,
                                     output_layer=self._output_layer)
        decoder.eval()
        beam_width = 5
        outputs = decoder(memory=self._memory,
                          memory_sequence_length=self._memory_sequence_length,
                          memory_attention_bias=None,
                          inputs=None,
                          beam_width=beam_width,
                          start_tokens=self._start_tokens,
                          end_token=self._end_token,
                          max_decoding_length=self._max_decode_len)

        self.assertEqual(outputs['log_prob'].size(),
                         (self._batch_size, beam_width))
        self.assertEqual(outputs['sample_id'].size(0), self._batch_size)
        self.assertLessEqual(outputs['sample_id'].size(1),
                             self._max_decode_len)
        self.assertEqual(outputs['sample_id'].size(2), beam_width)
Example #12
    def test_greedy_embedding_helper(self):
        """Tests with tf.contrib.seq2seq.GreedyEmbeddingHelper
        """
        decoder = TransformerDecoder(embedding=self._embedding)
        helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            self._embedding, self._start_tokens, self._end_token)
        outputs, length = decoder(
            memory=self._memory,
            memory_sequence_length=self._memory_sequence_length,
            memory_attention_bias=None,
            helper=helper,
            max_decoding_length=self._max_decode_len,
            mode=tf.estimator.ModeKeys.PREDICT)
        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            outputs_ = sess.run(outputs)
            self.assertIsInstance(outputs_, TransformerDecoderOutput)
Example #13
    def test_infer_greedy(self):
        """Tests infer_greedy
        """
        decoder = TransformerDecoder(embedding=self._embedding)
        outputs, length = decoder(
            memory=self._memory,
            memory_sequence_length=self._memory_sequence_length,
            memory_attention_bias=None,
            inputs=None,
            decoding_strategy='infer_greedy',
            start_tokens=self._start_tokens,
            end_token=self._end_token,
            max_decoding_length=self._max_decode_len,
            mode=tf.estimator.ModeKeys.PREDICT)
        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            outputs_ = sess.run(outputs)
            self.assertIsInstance(outputs_, TransformerDecoderOutput)
Example #14
    def __init__(self, gpt2_config, top_k, temperature):
        super().__init__()
        self.word_embedder = WordEmbedder(vocab_size=gpt2_config.vocab_size,
                                          hparams=gpt2_config.embed)

        self.pos_embedder = PositionEmbedder(
            position_size=gpt2_config.position_size,
            hparams=gpt2_config.pos_embed)

        self.decoder = TransformerDecoder(
            vocab_size=gpt2_config.vocab_size,
            output_layer=self.word_embedder.embedding,
            hparams=gpt2_config.decoder)

        self.top_k = top_k
        self.temperature = temperature

        self._embedding_fn = lambda x, y: (
            self.word_embedder(x) + self.pos_embedder(y))
Example #15
    def test_train(self):
        """Tests train_greedy
        """
        decoder = TransformerDecoder(embedding=self._embedding)
        # 6 blocks
        # -self multihead_attention: 4 dense without bias + 2 layer norm vars
        # -encdec multihead_attention: 4 dense without bias + 2 layer norm vars
        # -poswise_network: Dense with bias, Dense with bias + 2 layer norm vars
        # 2 layer norm vars
        outputs = decoder(memory=self._memory,
                          memory_sequence_length=self._memory_sequence_length,
                          memory_attention_bias=None,
                          inputs=self._inputs,
                          decoding_strategy='train_greedy',
                          mode=tf.estimator.ModeKeys.TRAIN)
        self.assertEqual(len(decoder.trainable_variables), 110)
        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            outputs_ = sess.run(outputs)

            self.assertIsInstance(outputs_, TransformerDecoderOutput)
Example #16
    def test_decode_infer_sample(self):
        """Tests infer_sample
        """
        decoder = TransformerDecoder(
            vocab_size=self._vocab_size,
            output_layer=self._output_layer
        )
        helper = tx_helper.SampleEmbeddingHelper(
            self._embedding_fn, self._start_tokens, self._end_token)

        outputs, length = decoder(
            memory=self._memory,
            memory_sequence_length=self._memory_sequence_length,
            memory_attention_bias=None,
            inputs=None,
            helper=helper,
            max_decoding_length=self._max_decode_len,
            mode=tf.estimator.ModeKeys.PREDICT)
        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            outputs_ = sess.run(outputs)
            self.assertIsInstance(outputs_, TransformerDecoderOutput)
Example #17
    def __init__(self,
                 pretrained_model_name: Optional[str] = None,
                 cache_dir: Optional[str] = None,
                 hparams=None):

        super().__init__(pretrained_model_name=pretrained_model_name,
                         cache_dir=cache_dir,
                         hparams=hparams)

        if self.pretrained_model_dir:
            self._hparams = HParams(self.pretrained_model_hparams,
                                    self._hparams.todict())

        # Word embedding
        self.word_embedder = WordEmbedder(vocab_size=self._hparams.vocab_size,
                                          hparams=self._hparams.embed)

        # Position embedding
        self.position_embedder = PositionEmbedder(
            position_size=self._hparams.position_size,
            hparams=self._hparams.position_embed)

        # The GPT2 decoder (a TransformerDecoder)
        self.decoder = TransformerDecoder(
            vocab_size=self._hparams.vocab_size,
            output_layer=self.word_embedder.embedding,
            hparams=self._hparams.decoder)

        if self.pretrained_model_dir:
            gpt2_utils.init_gpt2_checkpoint(self, self.pretrained_model_dir)
        elif self._hparams.initializer:
            initialize = layers.get_initializer(self._hparams.initializer)
            assert initialize is not None
            # Do not re-initialize LayerNorm modules.
            for name, param in self.named_parameters():
                if name.split(
                        '.')[-1] == 'weight' and 'layer_norm' not in name:
                    initialize(param)
Example #18
    def default_hparams():
        r"""Returns a dictionary of hyperparameters with default values.

        * The decoder architecture is determined by the constructor argument
          :attr:`pretrained_model_name` if it's specified. In this case,
          `hparams` are ignored.
        * Otherwise, the decoder architecture is determined by
          `hparams['pretrained_model_name']` if it's specified. All other
          configurations in `hparams` are ignored.
        * If the above two are `None`, the decoder architecture is defined by
          the configurations in `hparams` and weights are randomly initialized.

        .. code-block:: python

            {
                "name": "gpt2_decoder",
                "pretrained_model_name": "117M",
                "vocab_size": 50257,
                "context_size": 1024,
                "embedding_size": 768,
                "embed": {
                    "dim": 768,
                    "name": "word_embeddings"
                },
                "position_size": 1024,
                "position_embed": {
                    "dim": 768,
                    "name": "position_embeddings"
                },

                # hparams for TransformerDecoder
                "decoder": {
                    "dim": 768,
                    "num_blocks": 12,
                    "use_gpt_config": True,
                    "embedding_dropout": 0,
                    "residual_dropout": 0,
                    "multihead_attention": {
                        "use_bias": True,
                        "num_units": 768,
                        "num_heads": 12,
                        "dropout_rate": 0.0,
                        "output_dim": 768
                    },
                    "initializer": {
                        "type": "variance_scaling_initializer",
                        "kwargs": {
                            "factor": 1.0,
                            "mode": "FAN_AVG",
                            "uniform": True
                        }
                    },
                    "poswise_feedforward": {
                        "layers": [
                            {
                                "type": "Linear",
                                "kwargs": {
                                    "in_features": 768,
                                    "out_features": 3072,
                                    "bias": True
                                }
                            },
                            {
                                "type": "GPTGELU",
                                "kwargs": {}
                            },
                            {
                                "type": "Linear",
                                "kwargs": {
                                    "in_features": 3072,
                                    "out_features": 768,
                                    "bias": True
                                }
                            }
                        ],
                        "name": "ffn"
                    }
                },
            }

        Here:

        The default parameters are values for the 117M GPT2 model.

        `"pretrained_model_name"`: str or None
            The name of the pre-trained GPT2 model. If None, the model
            will be randomly initialized.

        `"embed"`: dict
            Hyperparameters for word embedding layer.

        `"vocab_size"`: int
            The vocabulary size of `inputs` in `GPT2Model`.

        `"position_embed"`: dict
            Hyperparameters for position embedding layer.

        `"position_size"`:  int
            The maximum sequence length that this model might ever be used with.

        `"name"`: str
            Name of the module.
        """
        return {
            **TransformerDecoder.default_hparams(),
            'dim': 768,
            'num_blocks': 12,
            'use_gpt_config': True,
            'embedding_dropout': 0,
            'residual_dropout': 0,
            'multihead_attention': {
                'use_bias': True,
                'num_units': 768,
                'num_heads': 12,
                "dropout_rate": 0.0,
                'output_dim': 768
            },
            'initializer': {
                'type': 'variance_scaling_initializer',
                'kwargs': {
                    'factor': 1.0,
                    'mode': 'FAN_AVG',
                    'uniform': True
                }
            },
            'poswise_feedforward': {
                'layers': [{
                    'type': 'Linear',
                    'kwargs': {
                        'in_features': 768,
                        'out_features': 3072,
                        'bias': True
                    }
                }, {
                    'type': 'GPTGELU',
                    'kwargs': {}
                }, {
                    'type': 'Linear',
                    'kwargs': {
                        'in_features': 3072,
                        'out_features': 768,
                        'bias': True
                    }
                }],
                'name': 'ffn'
            },
            'pretrained_model_name': '117M',
            'vocab_size': 50257,
            'context_size': 1024,
            'embedding_size': 768,
            'embed': {
                'dim': 768,
                'name': 'word_embeddings'
            },
            'position_size': 1024,
            'position_embed': {
                'dim': 768,
                'name': 'position_embeddings'
            },
            'name': 'gpt2_decoder',
            '@no_typecheck': ['pretrained_model_name'],
        }
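The bullet points in the docstring above describe three ways the decoder architecture can be configured. A minimal usage sketch, assuming this method belongs to texar-pytorch's GPT2Decoder (the import path below is an assumption, not taken from the example):

from texar.torch.modules import GPT2Decoder

# 1. Architecture from the constructor argument; `hparams` is ignored.
decoder = GPT2Decoder(pretrained_model_name="117M")

# 2. Architecture from hparams['pretrained_model_name']; other entries ignored.
decoder = GPT2Decoder(hparams={"pretrained_model_name": "117M"})

# 3. Both are None: architecture comes from `hparams`, weights are random.
decoder = GPT2Decoder(hparams={"pretrained_model_name": None,
                               "vocab_size": 50257})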
Example #19
def main(_):
    """
    Builds the model and runs text generation.
    """
    np.random.seed(FLAGS.seed)
    tf.set_random_seed(FLAGS.seed)

    nsamples = FLAGS.nsamples
    batch_size = FLAGS.batch_size
    max_decoding_length = FLAGS.max_decoding_length

    ckpt_path = FLAGS.checkpoint
    # Load GPT-2 model configuration
    if FLAGS.config_type == "json":
        gpt2_config = model_utils.transform_gpt2_to_texar_config(
            FLAGS.config_model)
    elif FLAGS.config_type == 'texar':
        gpt2_config = importlib.import_module(FLAGS.config_model)
    else:
        raise ValueError('Unknown config_type.')

    assert max_decoding_length <= gpt2_config.decoder["position_size"], (
        "max_decoding_length should not exceed position_size")
    assert nsamples % batch_size == 0, (
        "nsamples must be divisible by batch_size")

    # Create a data pre-processor for, e.g., BPE encoding
    proc = processor.get_encoder("gpt2_pretrained_models/model_117M")

    context = tf.placeholder(tf.int32, [batch_size, None])
    context_length = tf.placeholder(tf.int32, [batch_size])

    end_token = proc.encoder['<|endoftext|>']
    if FLAGS.is_interactive:
        start_tokens = context[:, 0]
    else:
        start_tokens = tf.fill([batch_size], end_token)

    # Build the GPT-2 model
    embedder = tx.modules.WordEmbedder(vocab_size=gpt2_config.vocab_size,
                                       hparams=gpt2_config.embed)

    helper = tx.modules.TopKSampleEmbeddingHelper(
        embedding=embedder,
        start_tokens=start_tokens,
        end_token=end_token,
        top_k=FLAGS.top_k,
        softmax_temperature=FLAGS.temperature)

    decoder = TransformerDecoder(embedding=embedder.embedding,
                                 hparams=gpt2_config.decoder)

    with tf.Session() as sess:

        if FLAGS.is_interactive:
            # Generate continuations of context
            lm_output, _ = decoder(context=context,
                                   context_sequence_length=context_length,
                                   max_decoding_length=max_decoding_length,
                                   helper=helper,
                                   mode=tf.estimator.ModeKeys.PREDICT)

            # Load model checkpoint
            model_utils.init_gpt2_checkpoint(sess, ckpt_path)
            print("\nFinished loading\n")

            # Enter interactive mode
            while True:

                raw_text = input("Model input >>> ")

                while not raw_text:
                    print('Input should not be empty!')
                    raw_text = input("Model input >>> ")

                context_tokens = proc.encode(raw_text)

                feed_dict = {
                    context: [context_tokens for _ in range(batch_size)],
                    context_length:
                    [len(context_tokens) for _ in range(batch_size)],
                    tx.context.global_mode():
                    tf.estimator.ModeKeys.PREDICT
                }
                generated = 0
                for _ in range(nsamples // batch_size):

                    output = sess.run(lm_output, feed_dict=feed_dict)

                    sample_id = output.sample_id
                    for i in range(batch_size):

                        generated += 1
                        print("=" * 40 + " SAMPLE " + str(generated) + " " +
                              "=" * 40)
                        si = sample_id[i][len(context_tokens):]
                        print(proc.decode(si))
                print("=" * 80)
        else:
            # Generate samples from scratch
            lm_output, _ = decoder(max_decoding_length=max_decoding_length,
                                   helper=helper,
                                   mode=tf.estimator.ModeKeys.PREDICT)

            # Load model checkpoint
            model_utils.init_gpt2_checkpoint(sess, ckpt_path)
            print("\nFinished loading\n")

            feed_dict = {
                tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT
            }
            generated = 0
            while nsamples == 0 or generated < nsamples:

                output = sess.run(lm_output, feed_dict=feed_dict)

                sample_id = output.sample_id
                for i in range(batch_size):

                    generated += 1
                    text = proc.decode(sample_id[i])
                    print("=" * 40 + " SAMPLE " + str(generated) + " " +
                          "=" * 40)
                    print(text)