Ejemplo n.º 1
0
    def _create_model(self, mode, input_ids, input_mask, segment_ids, labels,
                      labels_mask):
        """Builds the LaserTagger graph and returns loss/prediction tensors."""
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        bert_encoder = modeling.BertModel(
            config=self._config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=self._use_one_hot_embeddings)

        sequence_output = bert_encoder.get_sequence_output()

        if self._config.use_t2t_decoder:
            # The decoder vocabulary is the tag set plus the two reserved
            # begin/end tokens used by the Transformer decoder.
            decoder_vocab_size = self._num_tags + 2
            decoder_params = _get_decoder_params(self._config, self._use_tpu,
                                                 self._max_seq_length,
                                                 decoder_vocab_size)
            decoder = transformer_decoder.TransformerDecoder(
                decoder_params, is_training)
            # `labels` carries the ids of the target edit operations.
            logits = decoder(input_mask, sequence_output, labels)
        else:
            if is_training:
                # Apply 0.1 dropout to the encoder output before projecting.
                sequence_output = tf.nn.dropout(sequence_output, keep_prob=0.9)

            logits = tf.layers.dense(
                sequence_output,
                self._num_tags,
                kernel_initializer=tf.truncated_normal_initializer(
                    stddev=0.02),
                name="output_projection")

        with tf.variable_scope("loss"):
            loss = None
            per_example_loss = None
            if mode == tf.estimator.ModeKeys.PREDICT:
                if self._config.use_t2t_decoder:
                    # The Transformer decoder reserves the first two IDs for
                    # the begin and end tokens, so shift predictions back into
                    # tag-id space.
                    pred = logits["outputs"] - 2
                else:
                    pred = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
            else:
                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels, logits=logits)
                # Average the per-token loss over the unmasked token count.
                per_example_loss = tf.truediv(
                    tf.reduce_sum(loss, axis=1),
                    tf.dtypes.cast(tf.reduce_sum(labels_mask, axis=1),
                                   tf.float32))
                loss = tf.reduce_mean(per_example_loss)
                pred = tf.cast(tf.argmax(logits, axis=-1), tf.int32)

            return (loss, per_example_loss, pred)
Ejemplo n.º 2
0
  def test_transformer_decoder_without_target(self):
    """Decoding without targets yields an 'outputs'/'scores' result dict."""
    batch = 256
    seq_len = 128
    hidden = 768

    tf.reset_default_graph()
    with tf.Session():
      inputs = tf.constant(np.zeros([batch, seq_len]), dtype="float32")
      encoder_outputs = tf.constant(
          np.zeros([batch, seq_len, hidden]), dtype="float32")

      decoder = transformer_decoder.TransformerDecoder(
          params=model_params.BASE_PARAMS, train=True)
      result = decoder(inputs, encoder_outputs)

      self.assertIn('outputs', result)
      self.assertIn('scores', result)
Ejemplo n.º 3
0
  def test_transformer_decoder_with_target(self):
    """Teacher-forced decoding returns per-position vocabulary logits."""
    batch = 256
    seq_len = 128
    hidden = 768
    tgt_len = 128

    tf.reset_default_graph()
    with tf.Session():
      inputs = tf.constant(np.zeros([batch, seq_len]), dtype="float32")
      encoder_outputs = tf.constant(
          np.zeros([batch, seq_len, hidden]), dtype="float32")
      targets = tf.constant(np.zeros([batch, tgt_len]), dtype="int32")
      decoder = transformer_decoder.TransformerDecoder(
          params=model_params.BASE_PARAMS, train=True)
      result = decoder(inputs, encoder_outputs, targets)

      self.assertEqual(
          result.get_shape().as_list(),
          [batch, seq_len, model_params.BASE_PARAMS['vocab_size']])
Ejemplo n.º 4
0
  def _create_model(self, mode, input_ids, input_mask, segment_ids, labels,
                    labels_mask):
    """Creates a LaserTagger model.

    Runs a BERT encoder, produces tag logits (via a Transformer decoder or a
    dense projection), and — outside PREDICT mode — computes a masked
    cross-entropy loss that is rescaled by several edit-specific weights.

    Args:
      mode: A tf.estimator.ModeKeys value.
      input_ids: Token ids fed to BERT.
      input_mask: Padding mask fed to BERT (and the T2T decoder).
      segment_ids: Ids fed to BERT as token_type_ids. NOTE(review): the verb
        deletion branch below compares these against verb tag ids, so they
        appear to double as per-token tags here — confirm with the caller.
      labels: Target tag ids, one per token.
      labels_mask: Per-token weights used to normalize the loss.

    Returns:
      A (loss, per_example_loss, pred) tuple; the loss entries are None in
      PREDICT mode.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    model = modeling.BertModel(
        config=self._config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=self._use_one_hot_embeddings)

    final_hidden = model.get_sequence_output()

    if self._config.use_t2t_decoder:
      # Size of the output vocabulary which contains the tags + begin and end
      # tokens used by the Transformer decoder.
      output_vocab_size = self._num_tags + 2
      params = _get_decoder_params(self._config, self._use_tpu,
                                   self._max_seq_length, output_vocab_size)
      decoder = transformer_decoder.TransformerDecoder(params, is_training)
      # `labels` carries the ids of the target edit operations.
      logits = decoder(input_mask, final_hidden, labels)
    else:
      if is_training:
        # I.e., 0.1 dropout
        final_hidden = tf.nn.dropout(final_hidden, keep_prob=0.9)

      logits = tf.layers.dense(
          final_hidden,
          self._num_tags,
          kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
          name="output_projection")

    with tf.variable_scope("loss"):
      loss = None
      per_example_loss = None
      if mode != tf.estimator.ModeKeys.PREDICT:
        # Per-token cross entropy; reduced to per-example values further down.
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                              logits=logits)

        # Optionally penalize logit mass assigned to deletion tags on verb
        # tokens.
        if self._verb_tags is not None and self._verb_deletion_loss_weight != 0:
          # NOTE(review): relies on fully static shapes — batch_size and
          # token_length must be known at graph-construction time for the
          # np.repeat tiling below.
          logits_tensor_shape_as_list = logits.get_shape().as_list()
          batch_size, token_length, number_of_tags = logits_tensor_shape_as_list[
              0:3]

          # verb_mask is 1.0 wherever segment_ids equals any verb tag id
          # (accumulated over self._verb_tags), else 0.0.
          verb_mask = tf.constant(0.0,
                                  dtype="float32",
                                  shape=segment_ids.get_shape())
          for verb_tag in self._verb_tags:
            verb_mask = tf.math.add(
                tf.cast(
                    tf.math.equal(tf.constant(verb_tag, dtype="int32"),
                                  segment_ids), tf.float32), verb_mask)

          # Tile the per-tag deletion indicator vector to
          # [batch_size, token_length, number_of_tags].
          delete_tags = self._delete_tags
          delete_tags = np.repeat(delete_tags[np.newaxis, :],
                                  token_length,
                                  axis=0)
          delete_tags = np.repeat(delete_tags[np.newaxis, :, :],
                                  batch_size,
                                  axis=0)
          delete_tags_tensor = tf.constant(delete_tags, dtype="float32")

          # Fraction of raw logit mass on deletion tags, per token.
          # NOTE(review): this divides unnormalized logits rather than softmax
          # probabilities — negative logits can make the ratio unstable;
          # confirm this is intended.
          delete_probability = tf.math.divide(
              tf.reduce_sum(tf.math.multiply(delete_tags_tensor, logits), 2),
              tf.reduce_sum(logits, 2))

          delete_loss = tf.math.scalar_mul(
              tf.constant(self._verb_deletion_loss_weight, dtype="float32"),
              tf.math.multiply(delete_probability, verb_mask))

          # new loss = loss * (1 + delete_loss)
          loss = tf.math.multiply(
              loss,
              tf.math.add(
                  tf.constant(1.0,
                              dtype="float32",
                              shape=delete_loss.get_shape()), delete_loss))

        # Adjust loss using weights of different edits (add, delete, keep)
        if self._add_weight != 1:
          # Positions whose label id is <= self._smallest_add_tags_ids are
          # treated as "add" edits.
          add_label_mask = tf.cast(
              tf.math.greater_equal(
                  tf.constant(self._smallest_add_tags_ids, dtype="int32"),
                  labels), tf.float32)

          # Rescale those positions: loss * (1 + (add_weight - 1) * mask).
          add_loss_weight = tf.math.scalar_mul(
              tf.constant(self._add_weight - 1, dtype="float32"),
              add_label_mask)
          loss = tf.math.multiply(
              loss,
              tf.math.add(
                  tf.constant(1.0,
                              dtype="float32",
                              shape=add_loss_weight.get_shape()),
                  add_loss_weight))

        # Apply the same style of rescaling for keep and delete edits.
        loss = _update_loss_with_weight(loss, self._keep_weight,
                                        self._keep_tags_ids, labels)
        loss = _update_loss_with_weight(loss, self._delete_weight,
                                        self._delete_tags_ids, labels)

        # Normalize each example's summed loss by its unmasked token count.
        per_example_loss = tf.truediv(
            tf.reduce_sum(loss, axis=1),
            tf.dtypes.cast(tf.reduce_sum(labels_mask, axis=1), tf.float32))
        loss = tf.reduce_mean(per_example_loss)
        pred = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
      else:
        if self._config.use_t2t_decoder:
          pred = logits["outputs"]
          # Transformer decoder reserves the first two IDs to the begin and the
          # end token so we shift the IDs back.
          pred -= 2
        else:
          pred = tf.cast(tf.argmax(logits, axis=-1), tf.int32)

      return (loss, per_example_loss, pred)
Ejemplo n.º 5
0
    def __init__(self, vocab_size, hidden_size, emb_dim, dropout, tok2id):
        """Builds the Seq2Seq model.

        Wires up embeddings, an LSTM (or BERT) encoder, and a Transformer or
        attention-LSTM decoder, depending on the module-level ARGS flags.
        Statement order matters: init_weights() runs before any pretrained
        BERT embeddings are swapped in, so those are not overwritten.

        Args:
            vocab_size: Size of the token vocabulary.
            hidden_size: Hidden dimension for encoder/decoder and enricher.
            emb_dim: Embedding dimension.
            dropout: Dropout probability used by encoder and decoder.
            tok2id: Token-to-id mapping; stored on the instance.
        """
        global ARGS
        global CUDA

        super(Seq2Seq, self).__init__()

        self.vocab_size = vocab_size
        self.hidden_dim = hidden_size
        self.emb_dim = emb_dim
        self.dropout = dropout
        # Id 0 is used as the embedding padding_idx.
        self.pad_id = 0
        self.tok2id = tok2id

        self.embeddings = nn.Embedding(self.vocab_size, self.emb_dim,
                                       self.pad_id)
        self.encoder = LSTMEncoder(self.emb_dim,
                                   self.hidden_dim,
                                   layers=1,
                                   bidirectional=True,
                                   dropout=self.dropout)

        # Projections applied to the encoder's final hidden/cell states.
        self.h_t_projection = nn.Linear(ARGS.hidden_size, ARGS.hidden_size)
        self.c_t_projection = nn.Linear(ARGS.hidden_size, ARGS.hidden_size)

        # Maps encoder output (768-dim when using BERT) to the decoder dim.
        self.bridge = nn.Linear(768 if ARGS.bert_encoder else self.hidden_dim,
                                self.hidden_dim)

        if ARGS.transformer_decoder:
            self.decoder = transformer.TransformerDecoder(
                num_layers=ARGS.transformer_layers,
                d_model=self.hidden_dim,
                heads=8,
                d_ff=self.hidden_dim,
                copy_attn=False,
                self_attn_type='scaled-dot',
                dropout=self.dropout,
                embeddings=self.embeddings,
                max_relative_positions=0)
        else:
            self.decoder = StackedAttentionLSTM(self.emb_dim,
                                                self.hidden_dim,
                                                layers=1,
                                                dropout=self.dropout)

        self.output_projection = nn.Linear(self.hidden_dim, self.vocab_size)

        # for decoding. TODO -- throw this out?
        self.softmax = nn.Softmax(dim=-1)
        # for training
        self.log_softmax = nn.LogSoftmax(dim=-1)

        self.init_weights()

        # pretrained embs from bert (after init to avoid overwrite)
        if ARGS.bert_word_embeddings or ARGS.bert_full_embeddings or ARGS.bert_encoder:
            model = BertModel.from_pretrained('bert-base-uncased',
                                              cache_dir=ARGS.working_dir +
                                              '/cache')

            if ARGS.bert_word_embeddings:
                self.embeddings = model.embeddings.word_embeddings

            if ARGS.bert_encoder:
                self.encoder = model
                # share bert word embeddings with decoder
                self.embeddings = model.embeddings.word_embeddings

            if ARGS.bert_full_embeddings:
                self.embeddings = model.embeddings

        if ARGS.freeze_embeddings:
            for param in self.embeddings.parameters():
                param.requires_grad = False

        # make this even if ARGS.no_tok_enrich so that you can load from
        #   a no-enrichment model (and visa versa)
        self.enrich_input = torch.ones(hidden_size)
        if CUDA:
            self.enrich_input = self.enrich_input.cuda()
        self.enricher = nn.Linear(hidden_size, hidden_size)