def body(self, features):
        assert self._hparams.block_size > 0
        assert not common_layers.is_xla_compiled()
        assert "targets_segmentation" not in features

        decoder_output = super(TransformerBlockParallel, self).body(features)
        assert not isinstance(decoder_output, tuple)
        assert len(decoder_output.shape) == 4

        relu_dropout_broadcast_dims = (
            common_layers.comma_separated_string_to_integer_list(
                getattr(self._hparams, "relu_dropout_broadcast_dims", "")))

        with tf.variable_scope("block_size_%d" % self._hparams.block_size):
            block_output = common_layers.dense_relu_dense(
                decoder_output,
                self._hparams.block_size * self._hparams.filter_size,
                self._hparams.block_size * self._hparams.hidden_size,
                dropout=self._hparams.relu_dropout,
                dropout_broadcast_dims=relu_dropout_broadcast_dims)

        batch_size, length = common_layers.shape_list(decoder_output)[:2]
        block_output = tf.reshape(block_output, [
            batch_size, length, self._hparams.block_size,
            self._hparams.hidden_size
        ])

        block_output = common_layers.layer_postprocess(decoder_output,
                                                       block_output,
                                                       self._hparams)

        return block_output
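
The interesting part of this example is the shape bookkeeping: dense_relu_dense widens the last dimension by a factor of block_size, and the reshape unfolds that width into a separate block axis that layer_postprocess then broadcasts the residual over. A minimal NumPy sketch of the shape flow, with made-up sizes (not taken from any real hparams set):

import numpy as np

# Illustrative sizes only (hypothetical hparams).
batch_size, length, hidden_size, block_size = 2, 5, 8, 4

# decoder_output from the parent body(): [batch, length, 1, hidden]
decoder_output = np.zeros((batch_size, length, 1, hidden_size))

# dense_relu_dense with an output size of block_size * hidden_size keeps the
# leading dims and widens only the last one.
block_output = np.zeros(decoder_output.shape[:-1] + (block_size * hidden_size,))

# The reshape in body() unfolds the widened last dim into a separate block axis.
block_output = block_output.reshape(batch_size, length, block_size, hidden_size)
print(block_output.shape)  # (2, 5, 4, 8)
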
    def body(self, features):
        assert self._hparams.block_size > 0
        assert not common_layers.is_xla_compiled()

        hparams = copy.copy(self._hparams)
        targets = features["targets"]
        inputs = features["inputs"]
        if not (tf.get_variable_scope().reuse
                or hparams.mode == tf.estimator.ModeKeys.PREDICT):
            tf.summary.image("inputs", inputs, max_outputs=1)
            tf.summary.image("targets", targets, max_outputs=1)

        encoder_input = cia.prepare_encoder(inputs, hparams)
        encoder_output = cia.transformer_encoder_layers(
            encoder_input,
            hparams.num_encoder_layers,
            hparams,
            attention_type=hparams.enc_attention_type,
            name="encoder")
        decoder_input, rows, cols = cia.prepare_decoder(targets, hparams)
        decoder_output = cia.transformer_decoder_layers(
            decoder_input,
            encoder_output,
            hparams.num_decoder_layers,
            hparams,
            attention_type=hparams.dec_attention_type,
            name="decoder")

        assert not isinstance(decoder_output, tuple)
        assert len(decoder_output.shape) == 4

        relu_dropout_broadcast_dims = (
            common_layers.comma_separated_string_to_integer_list(
                getattr(self._hparams, "relu_dropout_broadcast_dims", "")))

        with tf.variable_scope("block_size_%d" % self._hparams.block_size):
            tf.logging.info("Using block_size %d", self._hparams.block_size)
            block_output = common_layers.dense_relu_dense(
                decoder_output,
                self._hparams.block_size * self._hparams.filter_size,
                self._hparams.block_size * self._hparams.hidden_size,
                dropout=self._hparams.relu_dropout,
                dropout_broadcast_dims=relu_dropout_broadcast_dims)

        batch_size, rows, cols = common_layers.shape_list(decoder_output)[:3]
        decoder_output = tf.reshape(
            decoder_output,
            [batch_size, rows, cols, 1, self._hparams.hidden_size])
        block_output = tf.reshape(block_output, [
            batch_size, rows, cols, self._hparams.block_size,
            self._hparams.hidden_size
        ])

        block_output = common_layers.layer_postprocess(decoder_output,
                                                       block_output,
                                                       self._hparams)

        return block_output
  def decode(self,
             decoder_input,
             encoder_output,
             encoder_decoder_attention_biases,
             decoder_self_attention_biases,
             hparams,
             cache=None,
             decode_loop_step=None,
             nonpadding=None,
             losses=None):
    """Decode Transformer outputs from encoder representation.

    Args:
      decoder_input: inputs to bottom of the model.
          [batch_size, decoder_length, hidden_dim]
      encoder_output: Encoder representation.
          [batch_size, input_length, hidden_dim]
      encoder_decoder_attention_biases: Bias and mask weights for
          encoder-decoder attention. [batch_size, input_length]
      decoder_self_attention_biases: Bias and mask weights for decoder
          self-attention. [batch_size, decoder_length]
      hparams: hyperparameters for model.
      cache: dict, containing tensors which are the results of previous
          attentions, used for fast decoding.
      decode_loop_step: An integer, step number of the decoding loop.
          Only used for inference on TPU.
      nonpadding: optional Tensor with shape [batch_size, decoder_length]
      losses: optional list onto which to append extra training losses

    Returns:
      Final decoder representation. [batch_size, decoder_length, hidden_dim]
    """
    decoder_input = tf.nn.dropout(decoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)

    decoder_output = transformer_decoder(
        decoder_input,
        encoder_output,
        decoder_self_attention_biases,
        encoder_decoder_attention_biases,
        hparams,
        cache=cache,
        decode_loop_step=decode_loop_step,
        nonpadding=nonpadding,
        save_weights_to=self.attention_weights,
        losses=losses)

    if (common_layers.is_xla_compiled() and
        hparams.mode == tf.estimator.ModeKeys.TRAIN):
      # TPU does not react kindly to extra dimensions.
      # TODO(noam): remove this once TPU is more forgiving of extra dims.
      return decoder_output
    else:
      # Expand since t2t expects 4d tensors.
      return tf.expand_dims(decoder_output, axis=2)
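
The XLA/TPU guard at the end recurs in several examples on this page: during TPU training the 3-D decoder output is returned as-is, otherwise a singleton axis is inserted because T2T's downstream code expects 4-D tensors. A small stand-alone sketch of just that branch (NumPy, hypothetical shapes):

import numpy as np

def maybe_expand(decoder_output, on_tpu_train):
    # Mirror the guard above: keep the 3-D output under XLA training,
    # otherwise add the singleton axis that T2T's 4-D convention expects.
    if on_tpu_train:
        return decoder_output                      # [batch, length, hidden]
    return np.expand_dims(decoder_output, axis=2)  # [batch, length, 1, hidden]

out = np.zeros((2, 7, 16))
print(maybe_expand(out, on_tpu_train=True).shape)   # (2, 7, 16)
print(maybe_expand(out, on_tpu_train=False).shape)  # (2, 7, 1, 16)
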
Example 4
  def decode(self,
             decoder_input,
             encoder_output,
             encoder_decoder_attention_bias,
             decoder_self_attention_bias,
             hparams,
             cache=None,
             nonpadding=None,
             losses=None):
    """Decode inputs using _decoder().

    This works the same way as transformer.Transformer.decode, but with the
    decoder portion replaced by _decoder().

    Args:
      decoder_input: Inputs to bottom of the model. [batch_size, decoder_length,
        hidden_dim]
      encoder_output: Encoder representation. [batch_size, input_length,
        hidden_dim]
      encoder_decoder_attention_bias: Bias and mask weights for encoder-decoder
        attention. [batch_size, input_length]
      decoder_self_attention_bias: Bias and mask weights for decoder
        self-attention. [batch_size, decoder_length]
      hparams: Hyperparameters for model.
      cache: Dict, containing tensors which are the results of previous
        attentions, used for fast decoding.
      nonpadding: Optional Tensor with shape [batch_size, decoder_length]
      losses: Unused losses.

    Returns:
      Final decoder representation. [batch_size, decoder_length, hidden_dim]
    """
    decoder_input = tf.nn.dropout(decoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)

    decoder_output = self._decoder(
        decoder_input,
        encoder_output,
        decoder_self_attention_bias,
        encoder_decoder_attention_bias,
        hparams,
        cache=cache,
        nonpadding=nonpadding,
        save_weights_to=self.attention_weights)

    if (common_layers.is_xla_compiled() and
        hparams.mode == tf.estimator.ModeKeys.TRAIN):
      # TPU does not react kindly to extra dimensions.
      return decoder_output

    # Expand since t2t expects 4d tensors.
    return tf.expand_dims(decoder_output, axis=2)
Example 5
def transformer_encoder(features,
                        hparams,
                        embed_scope=None,
                        embed_token_fn=common_embed.embed_tokens,
                        attention_weights=None):
    """Encodes a screen using Transformer.

  Args:
    features: the feature dict.
    hparams: the hyperparameters.
    embed_scope: the scope for token embedding.
    embed_token_fn: the embed function.
    attention_weights: the attention_weights dict.
  Returns:
    encoder_outputs: a Tensor of shape
        [batch_size, num_steps, max_object_count, hidden_size]
    object_mask: a Tensor of shape
        [batch_size, num_steps, max_object_count]
    encoder_attn_bias: a Tensor of shape
        [batch_size, num_steps, max_object_count]
  """
    tf.logging.info("Using Transformer screen encoder")
    # Remove the default positional encoding in Transformer
    object_embed, object_mask, encoder_attn_bias = prepare_encoder_input(
        features=features,
        hparams=hparams,
        embed_scope=embed_scope,
        embed_token_fn=embed_token_fn)
    with tf.variable_scope("encode_screen", reuse=tf.AUTO_REUSE):
        shape = tf.shape(object_embed)
        with tf.control_dependencies(
            [tf.assert_equal(shape[3], hparams.hidden_size)]):
            object_embed = tf.reshape(
                object_embed,
                [shape[0] * shape[1], shape[2], hparams.hidden_size])
        encoder_input = tf.nn.dropout(object_embed,
                                      keep_prob=1.0 -
                                      hparams.layer_prepostprocess_dropout)
        self_attention_bias = tf.expand_dims(tf.expand_dims(tf.reshape(
            encoder_attn_bias, [shape[0] * shape[1], shape[2]]),
                                                            axis=1),
                                             axis=1)
        encoder_output = transformer.transformer_encoder(
            encoder_input=encoder_input,
            encoder_self_attention_bias=self_attention_bias,
            hparams=hparams,
            save_weights_to=attention_weights,
            make_image_summary=not common_layers.is_xla_compiled())
        encoder_output = tf.reshape(encoder_output,
                                    [shape[0], shape[1], shape[2], shape[3]])
        return encoder_output, object_mask, encoder_attn_bias
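
Before calling the sequence encoder, this screen encoder folds the step axis into the batch axis so each screen is encoded independently, then unfolds the result back to [batch_size, num_steps, max_object_count, hidden_size]. A NumPy sketch of the fold/unfold with made-up sizes (the encoder itself is replaced by an identity stand-in):

import numpy as np

# Made-up sizes for illustration.
batch, num_steps, max_objects, hidden = 2, 3, 10, 16
object_embed = np.zeros((batch, num_steps, max_objects, hidden))

# Fold the step axis into the batch axis before the sequence encoder ...
folded = object_embed.reshape(batch * num_steps, max_objects, hidden)
# ... run the Transformer over [batch * num_steps, max_objects, hidden]
# (identity stand-in here for transformer.transformer_encoder) ...
encoded = folded
# ... then unfold back to the original screen layout.
encoder_output = encoded.reshape(batch, num_steps, max_objects, hidden)
print(encoder_output.shape)  # (2, 3, 10, 16)
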
Example 6
    def estimator_spec_eval(self, features, logits, labels, loss, losses_dict):
        """Construct EstimatorSpec for EVAL mode."""
        del losses_dict
        hparams = self.hparams

        problem = hparams.problem
        if common_layers.is_xla_compiled():
            raise NotImplementedError("TPU usage is not supported")

        outputs = tf.contrib.framework.nest.map_structure(
            lambda x: tf.squeeze(tf.argmax(x, axis=-1), axis=[2, 3]), logits)

        if hasattr(problem, "compute_predictions"):
            predictions = problem.compute_predictions(outputs,
                                                      features,
                                                      hparams,
                                                      decode=False)
        else:
            predictions = outputs

        problem_metrics = problem.eval_metrics()
        if isinstance(problem_metrics, list):
            eval_metrics = metrics.create_evaluation_metrics([problem],
                                                             hparams)

            for metric_name, metric_fn in eval_metrics.items():
                eval_metrics[metric_name] = metric_fn(logits, features,
                                                      features["targets"])
        else:
            eval_metrics = {}

            for metric_key, metric_fn in problem_metrics.items():
                metric_name = "metrics-%s/%s" % (problem.name, metric_key)
                first, second = metric_fn(predictions, labels, features)

                if isinstance(second, tf.Tensor):
                    scores, weights = first, second
                    eval_metrics[metric_name] = tf.metrics.mean(
                        scores, weights)
                else:
                    value, update_op = first, second
                    eval_metrics[metric_name] = (value, update_op)

        return tf.estimator.EstimatorSpec(tf.estimator.ModeKeys.EVAL,
                                          eval_metric_ops=eval_metrics,
                                          loss=loss)
Example 7
  def estimator_spec_eval(self, features, logits, labels, loss, losses_dict):
    """Constructs `tf.estimator.EstimatorSpec` for EVAL (evaluation) mode."""
    estimator_spec = super(TransformerAE, self).estimator_spec_eval(
        features, logits, labels, loss, losses_dict)
    if common_layers.is_xla_compiled():
      # For TPUs (and XLA more broadly?), do not add summary hooks that depend
      # on losses; they are not supported.
      return estimator_spec

    summary_op = tf.get_collection(tf.GraphKeys.SUMMARIES, scope="losses")
    summary_op.extend(tf.get_collection(tf.GraphKeys.SUMMARIES, scope="loss"))
    summary_op.append(tf.summary.scalar("loss", loss))
    summary_saver_hook = tf.train.SummarySaverHook(
        save_steps=100,
        summary_op=summary_op,
        output_dir=os.path.join(self.hparams.model_dir, "eval"))

    hooks = list(estimator_spec.evaluation_hooks)
    hooks.append(summary_saver_hook)
    return estimator_spec._replace(evaluation_hooks=hooks)
Example 9
    def top(self, body_output, _):
        """Generate logits.

    Args:
      body_output: A Tensor with shape [batch, p0, p1, body_input_depth]
    Returns:
      logits: A Tensor with shape  [batch, p0, p1, ?, vocab_size].
    """
        if self._model_hparams.symbol_modality_skip_top:
            return tf.expand_dims(body_output, 3)

        if self._model_hparams.shared_embedding_and_softmax_weights:
            scope_name = "shared"
            reuse = True
        else:
            scope_name = "softmax"
            reuse = False

        with tf.variable_scope(scope_name, reuse=reuse):
            body_output_shape = common_layers.shape_list(body_output)
            var = self._get_weights(body_output_shape[-1])
            if (self._model_hparams.factored_logits and
                    self._model_hparams.mode == tf.estimator.ModeKeys.TRAIN):
                # insert channels dimension
                body_output = tf.expand_dims(body_output, 3)
                return common_layers.FactoredTensor(body_output, var)
            else:
                body_output = tf.reshape(body_output,
                                         [-1, body_output_shape[-1]])
                logits = tf.matmul(body_output, var, transpose_b=True)
                if (common_layers.is_xla_compiled()
                        and self._model_hparams.mode
                        == tf.estimator.ModeKeys.TRAIN):
                    # TPU does not react kindly to extra dimensions.
                    # TODO(noam): remove this once TPU is more forgiving of extra dims.
                    return logits
                else:
                    return tf.reshape(
                        logits, body_output_shape[:-1] + [1, self._vocab_size])
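
In the non-factored, non-XLA branch the logits are a flat matmul against the shared softmax weights followed by a reshape that restores the spatial dimensions plus a singleton channel axis. A NumPy sketch with toy sizes; the [vocab_size, hidden] layout of the weight variable is an assumption about _get_weights that matches the transpose_b=True matmul above:

import numpy as np

batch, p0, p1, hidden, vocab = 2, 3, 4, 8, 11
body_output = np.zeros((batch, p0, p1, hidden))
softmax_var = np.zeros((vocab, hidden))  # assumed layout of the softmax weights

flat = body_output.reshape(-1, hidden)            # [batch * p0 * p1, hidden]
logits = flat @ softmax_var.T                     # matmul(..., transpose_b=True)
logits = logits.reshape(batch, p0, p1, 1, vocab)  # body_output_shape[:-1] + [1, vocab]
print(logits.shape)  # (2, 3, 4, 1, 11)
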
Example 10
    def compute_gradients(self, loss, var_list=None, **kwargs):  # pylint: disable=arguments-differ
        gradients = self._opt.compute_gradients(loss, var_list, **kwargs)

        def cast_grad_tpu(g, v):
            """Should match upstream t2t
      https://github.com/tensorflow/tensor2tensor/blob/1547c25571633f828ddd74accba76d07d8d043af/tensor2tensor/utils/optimize.py#L232
      """
            if v is not None and g is not None:
                g = common_layers.cast_like(g, v)
            if self._zero_grads and g is None:
                g = tf.zeros_like(v)
            return (g, v)

        def cast_grad_gpu(g, v):
            """
      August 7 2018: We still need the code block below instead
          Refer to https://github.com/tensorflow/tensor2tensor/issues/979.
          We need `use_resource=False` in model_fn in utils/t2t_model.py
          and the old version of cast_grad here.
          Without both of these changes, we are very slow with
          large word embeddings on the CPU.

      Sept 30 2019: We tried removing this since we are off word embeddings
          but slowdown seems to still be around
      """
            if v is None or g is None:
                return (g, v)
            if v.dtype.base_dtype == g.dtype.base_dtype:
                return (g, v)
            return (tf.cast(g, v.dtype), v)

        # separate out tpu vs gpu cast grad so that changes in
        # https://github.com/medicode/tensor2tensor/pull/130/files#diff-2b8e7a5e8b58c8e97ae722ba253dff43
        # preserve speed on gpus
        cast_grad = (cast_grad_tpu
                     if common_layers.is_xla_compiled() else cast_grad_gpu)
        gradients = [cast_grad(g, v) for g, v in gradients]
        return gradients
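
The two cast helpers differ mainly in whether a missing gradient is zero-filled (the TPU path, when _zero_grads is set) or passed through untouched (the GPU path). A minimal NumPy sketch of the TPU behavior, using plain arrays rather than TF tensors:

import numpy as np

def cast_grad_tpu_sketch(g, v, zero_grads=True):
    # Cast the gradient to the variable's dtype; optionally replace a missing
    # gradient with zeros, as the TPU branch above does.
    if v is not None and g is not None and g.dtype != v.dtype:
        g = g.astype(v.dtype)
    if zero_grads and g is None and v is not None:
        g = np.zeros_like(v)
    return g, v

v = np.ones(3, dtype=np.float32)
g = np.ones(3, dtype=np.float16)
print(cast_grad_tpu_sketch(g, v)[0].dtype)  # float32
print(cast_grad_tpu_sketch(None, v)[0])     # [0. 0. 0.]
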
Example 11
def encode_decode_task(features, hparams, train, attention_weights=None):
    """Model core graph for the one-shot action.

  Args:
    features: a dictionary containing "inputs", a tensor of shape
        [batch_size, num_tokens], "verb_id_seq" of shape
        [batch_size, num_actions], and "object_spans" and "param_span" tensors
        of shape [batch_size, num_actions, 2]. 0 is used for padding or
        non-existent values.
    hparams: the general hyperparameters for the model.
    train: the train mode.
    attention_weights: the dict to keep attention weights for analysis.
  Returns:
    loss_dict: the losses for training.
    prediction_dict: the predictions for action tuples.
    areas: the area encodings of the task.
    scope: the embedding scope.
  """
    del train
    input_embeddings, scope = common_embed.embed_tokens(
        features["task"], hparams.task_vocab_size, hparams.hidden_size,
        hparams)
    with tf.variable_scope("encode_decode", reuse=tf.AUTO_REUSE):
        encoder_nonpadding = tf.minimum(tf.to_float(features["task"]), 1.0)
        input_embeddings = tf.multiply(tf.expand_dims(encoder_nonpadding, 2),
                                       input_embeddings)
        encoder_input, self_attention_bias, encoder_decoder_attention_bias = (
            transformer.transformer_prepare_encoder(input_embeddings,
                                                    None,
                                                    hparams,
                                                    features=None))
        encoder_input = tf.nn.dropout(encoder_input,
                                      keep_prob=1.0 -
                                      hparams.layer_prepostprocess_dropout)
        if hparams.instruction_encoder == "transformer":
            encoder_output = transformer.transformer_encoder(
                encoder_input,
                self_attention_bias,
                hparams,
                save_weights_to=attention_weights,
                make_image_summary=not common_layers.is_xla_compiled())
        else:
            raise ValueError("Unsupported instruction encoder %s" %
                             (hparams.instruction_encoder))
        span_rep = hparams.get("span_rep", "area")
        area_encodings, area_starts, area_ends = area_utils.compute_sum_image(
            encoder_output, max_area_width=hparams.max_span)
        current_shape = tf.shape(area_encodings)
        if span_rep == "area":
            area_encodings, _, _ = area_utils.compute_sum_image(
                encoder_output, max_area_width=hparams.max_span)
        elif span_rep == "basic":
            area_encodings = area_utils.compute_alternative_span_rep(
                encoder_output,
                input_embeddings,
                max_area_width=hparams.max_span,
                hidden_size=hparams.hidden_size,
                advanced=False)
        elif span_rep == "coref":
            area_encodings = area_utils.compute_alternative_span_rep(
                encoder_output,
                input_embeddings,
                max_area_width=hparams.max_span,
                hidden_size=hparams.hidden_size,
                advanced=True)
        else:
            raise ValueError("xyz")
        areas = {}
        areas["encodings"] = area_encodings
        areas["starts"] = area_starts
        areas["ends"] = area_ends
        with tf.control_dependencies([
                tf.print("encoder_output", tf.shape(encoder_output)),
                tf.assert_equal(current_shape,
                                tf.shape(area_encodings),
                                summarize=100)
        ]):
            paddings = tf.cast(tf.less(self_attention_bias, -1), tf.int32)
        padding_sum, _, _ = area_utils.compute_sum_image(
            tf.expand_dims(tf.squeeze(paddings, [1, 2]), 2),
            max_area_width=hparams.max_span)
        num_areas = common_layers.shape_list(area_encodings)[1]
        area_paddings = tf.reshape(tf.minimum(tf.to_float(padding_sum), 1.0),
                                   [-1, num_areas])
        areas["bias"] = area_paddings
        decoder_nonpadding = tf.to_float(
            tf.greater(features["verb_refs"][:, :, 1],
                       features["verb_refs"][:, :, 0]))
        if hparams.instruction_encoder == "lstm":
            hparams_decoder = copy.copy(hparams)
            hparams_decoder.set_hparam("pos", "none")
        else:
            hparams_decoder = hparams
        decoder_input, decoder_self_attention_bias = _prepare_decoder_input(
            area_encodings,
            decoder_nonpadding,
            features,
            hparams_decoder,
            embed_scope=scope)
        decoder_input = tf.nn.dropout(decoder_input,
                                      keep_prob=1.0 -
                                      hparams.layer_prepostprocess_dropout)
        if hparams.instruction_decoder == "transformer":
            decoder_output = transformer.transformer_decoder(
                decoder_input=decoder_input,
                encoder_output=encoder_output,
                decoder_self_attention_bias=decoder_self_attention_bias,
                encoder_decoder_attention_bias=encoder_decoder_attention_bias,
                hparams=hparams_decoder)
        else:
            raise ValueError("Unsupported instruction encoder %s" %
                             (hparams.instruction_encoder))
        return decoder_output, decoder_nonpadding, areas, scope
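
area_utils.compute_sum_image is what turns token encodings into span ("area") encodings here: it sums the encoder output over every contiguous window up to max_span wide. A rough NumPy sketch of that idea; it is a simplification of the real utility, whose exact output ordering and extra return values are not reproduced:

import numpy as np

def span_sums(encodings, max_width):
    # Sum encoder outputs over every contiguous span of width 1 .. max_width.
    batch, length, hidden = encodings.shape
    areas, starts, ends = [], [], []
    for width in range(1, max_width + 1):
        for start in range(length - width + 1):
            areas.append(encodings[:, start:start + width, :].sum(axis=1))
            starts.append(start)
            ends.append(start + width - 1)
    return np.stack(areas, axis=1), np.array(starts), np.array(ends)

enc = np.random.rand(2, 5, 8)
areas, starts, ends = span_sums(enc, max_width=3)
print(areas.shape)  # (2, 12, 8): 5 + 4 + 3 spans of widths 1, 2, 3
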
def perf_transformer_encode(encoder_function,
                            inputs,
                            target_space,
                            hparams,
                            baseline,
                            attention_weights=None,
                            features=None,
                            losses=None,
                            prepare_encoder_fn=None,
                            **kwargs):
    """Encoding for performance autoencoder, which mean-aggregates across time.

  Args:
    encoder_function: the encoder function
    inputs: Transformer inputs [batch_size, input_length, 1, hidden_dim] which
      will be flattened along the two spatial dimensions.
    target_space: scalar, target space ID.
    hparams: hyperparameters for model.
    baseline: if True, does not mean-aggregate the encoder output.
    attention_weights: weight to store attention to.
    features: optionally pass the entire features dictionary as well. This is
      needed now for "packed" datasets.
    losses: optional list onto which to append extra training losses
    prepare_encoder_fn: optional, alternative to transformer_prepare_encoder.
    **kwargs: additional arguments to pass to encoder_function

  Returns:
    Tuple of:
        encoder_output: Encoder representation.
            [batch_size, input_length, hidden_dim]
        encoder_decoder_attention_bias: Bias and mask weights for
            encoder-decoder attention. [batch_size, input_length]
  """
    inputs = common_layers.flatten4d3d(inputs)

    if not prepare_encoder_fn:
        prepare_encoder_fn = transformer_prepare_encoder
    encoder_input, self_attention_bias, encoder_decoder_attention_bias = (
        prepare_encoder_fn(inputs,
                           target_space,
                           hparams,
                           features=features,
                           reuse_target_embedding=tf.AUTO_REUSE))

    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT,
        value=hparams.layer_prepostprocess_dropout,
        hparams=hparams)

    encoder_input = tf.nn.dropout(encoder_input,
                                  1.0 - hparams.layer_prepostprocess_dropout)

    attn_bias_for_padding = None
    # Otherwise the encoder will just use encoder_self_attention_bias.
    if hparams.unidirectional_encoder:
        attn_bias_for_padding = encoder_decoder_attention_bias

    encoder_output = encoder_function(
        encoder_input,
        self_attention_bias,
        hparams,
        name="encoder",
        nonpadding=features_to_nonpadding(features, "inputs"),
        save_weights_to=attention_weights,
        make_image_summary=not common_layers.is_xla_compiled(),
        losses=losses,
        attn_bias_for_padding=attn_bias_for_padding,
        **kwargs)

    if not baseline:
        encoder_output = tf.math.reduce_mean(encoder_output,
                                             axis=1,
                                             keep_dims=True)
        encoder_decoder_attention_bias = tf.math.reduce_mean(
            encoder_decoder_attention_bias, axis=-1, keep_dims=True)

    return encoder_output, encoder_decoder_attention_bias
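
With baseline disabled, the whole performance is collapsed into a single summary step by a mean over the length axis, and the encoder-decoder attention bias is averaged the same way so the decoder attends to that one position. A NumPy sketch of the collapse; the [batch, 1, 1, length] bias shape follows the usual T2T convention and is assumed here:

import numpy as np

encoder_output = np.random.rand(2, 50, 16)  # [batch, input_length, hidden]
attention_bias = np.zeros((2, 1, 1, 50))    # assumed [batch, 1, 1, input_length]

pooled_output = encoder_output.mean(axis=1, keepdims=True)  # [2, 1, 16]
pooled_bias = attention_bias.mean(axis=-1, keepdims=True)   # [2, 1, 1, 1]
print(pooled_output.shape, pooled_bias.shape)
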
def mel_perf_transformer_encode(encoder_function,
                                perf_inputs,
                                mel_inputs,
                                target_space,
                                hparams,
                                attention_weights=None,
                                features=None,
                                losses=None,
                                prepare_encoder_fn=None,
                                **kwargs):
    """Encode transformer inputs. Used for melody & performance autoencoder.

  Performance is mean-aggregated across time and combined with melody in a
  variety of different ways.

  Args:
    encoder_function: the encoder function
    perf_inputs: Transformer inputs [batch_size, input_length, 1, hidden_dim]
      which will be flattened along the two spatial dimensions.
    mel_inputs: Transformer inputs [batch_size, input_length, 1, hidden_dim]
      which will be flattened along the two spatial dimensions.
    target_space: scalar, target space ID.
    hparams: hyperparameters for model.
    attention_weights: weight to store attention to.
    features: optionally pass the entire features dictionary as well. This is
      needed now for "packed" datasets.
    losses: optional list onto which to append extra training losses
    prepare_encoder_fn: optional, alternative to transformer_prepare_encoder.
    **kwargs: additional arguments to pass to encoder_function

  Returns:
    Tuple of:
        encoder_output: Encoder representation.
            [batch_size, input_length, hidden_dim]
        encoder_decoder_attention_bias: Bias and mask weights for
            encoder-decoder attention. [batch_size, input_length]
  """
    perf_inputs = common_layers.flatten4d3d(perf_inputs)
    mel_inputs = common_layers.flatten4d3d(mel_inputs)

    if not prepare_encoder_fn:
        prepare_encoder_fn = transformer_prepare_encoder
    perf_encoder_input, perf_self_attention_bias, perf_encdec_attention_bias = (
        prepare_encoder_fn(perf_inputs,
                           target_space,
                           hparams,
                           features=features,
                           reuse_target_embedding=tf.AUTO_REUSE))

    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT,
        value=hparams.layer_prepostprocess_dropout,
        hparams=hparams)

    perf_encoder_input = tf.nn.dropout(
        perf_encoder_input, 1.0 - hparams.layer_prepostprocess_dropout)

    perf_attn_bias_for_padding = None
    # Otherwise the encoder will just use encoder_self_attention_bias.
    if hparams.unidirectional_encoder:
        perf_attn_bias_for_padding = perf_encdec_attention_bias

    # do the same thing for melody
    mel_encoder_input, mel_self_attention_bias, mel_encdec_attention_bias = (
        prepare_encoder_fn(mel_inputs,
                           target_space,
                           hparams,
                           features=features,
                           reuse_target_embedding=tf.AUTO_REUSE))

    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT,
        value=hparams.layer_prepostprocess_dropout,
        hparams=hparams)

    mel_encoder_input = tf.nn.dropout(
        mel_encoder_input, 1.0 - hparams.layer_prepostprocess_dropout)

    mel_attn_bias_for_padding = None
    # Otherwise the encoder will just use encoder_self_attention_bias.
    if hparams.unidirectional_encoder:
        mel_attn_bias_for_padding = mel_encdec_attention_bias

    # use the proper encoder function for perf/melody
    perf_encoder_output = encoder_function(
        perf_encoder_input,
        perf_self_attention_bias,
        hparams,
        name="perf_encoder",
        nonpadding=features_to_nonpadding(features, "inputs"),
        save_weights_to=attention_weights,
        make_image_summary=not common_layers.is_xla_compiled(),
        losses=losses,
        attn_bias_for_padding=perf_attn_bias_for_padding,
        **kwargs)
    # same thing for melody
    mel_encoder_output = encoder_function(
        mel_encoder_input,
        mel_self_attention_bias,
        hparams,
        name="mel_encoder",
        nonpadding=features_to_nonpadding(features, "inputs"),
        save_weights_to=attention_weights,
        make_image_summary=not common_layers.is_xla_compiled(),
        losses=losses,
        attn_bias_for_padding=mel_attn_bias_for_padding,
        **kwargs)

    # concatenate the global mean vector/bias term with the full melody encoding
    perf_mean_vector = tf.math.reduce_mean(perf_encoder_output,
                                           axis=1,
                                           keep_dims=True)

    # different methods of aggregating over the performance + melody vectors!
    if hparams.aggregation == "sum":
        # add both mean performance and melody vectors together
        perf_mean_bias = tf.math.reduce_mean(perf_encdec_attention_bias,
                                             axis=-1,
                                             keep_dims=True)
        encoder_output = mel_encoder_output + perf_mean_vector
        encoder_decoder_attention_bias = mel_encdec_attention_bias + perf_mean_bias
    elif hparams.aggregation == "concat":
        # concatenate melody with mean-aggregated performance embedding
        stop_token = tf.zeros((1, 1, 384))
        encoder_output = tf.concat(
            [mel_encoder_output, stop_token, perf_mean_vector], axis=1)
        perf_mean_bias = tf.math.reduce_mean(perf_encdec_attention_bias,
                                             axis=-1,
                                             keep_dims=True)
        stop_bias = tf.zeros((1, 1, 1, 1))
        encoder_decoder_attention_bias = tf.concat(
            [mel_encdec_attention_bias, stop_bias, perf_mean_bias], axis=-1)
    elif hparams.aggregation == "tile":
        # tile performance embedding across each dimension of melody embedding!
        dynamic_val = tf.shape(mel_encoder_output)[1]
        shp = tf.convert_to_tensor([1, dynamic_val, 1], dtype=tf.int32)
        tiled_mean = tf.tile(perf_mean_vector, shp)

        encoder_output = tf.concat([mel_encoder_output, tiled_mean], axis=-1)
        encoder_decoder_attention_bias = mel_encdec_attention_bias
    else:
        raise NotImplementedError(
            "aggregation method must be in [sum, concat, tile].")

    return encoder_output, encoder_decoder_attention_bias
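
Of the three aggregation modes, "tile" is the only one that changes the channel dimension: the pooled performance vector is repeated at every melody position and concatenated on the last axis, doubling the hidden size. A NumPy sketch of that branch with toy sizes:

import numpy as np

mel_encoder_output = np.random.rand(2, 30, 16)  # [batch, mel_length, hidden]
perf_mean_vector = np.random.rand(2, 1, 16)     # [batch, 1, hidden]

mel_length = mel_encoder_output.shape[1]
tiled_mean = np.tile(perf_mean_vector, (1, mel_length, 1))           # [2, 30, 16]
encoder_output = np.concatenate([mel_encoder_output, tiled_mean],
                                axis=-1)                             # [2, 30, 32]
print(encoder_output.shape)  # hidden dimension doubles under "tile"
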
def hierarchical_attention_network_encoder(
        encoder_input,
        encoder_self_attention_bias,
        contexts,
        context_self_attention_biases,
        features,
        hparams,
        name="hierarchical_attention_network_encoder",
        save_weights_to=None,
        make_image_summary=True,
        losses=None):
    input_x = encoder_input
    context_xs = {}
    for context_name in contexts:
        context_xs[context_name] = contexts[context_name]
    context_paddings = {}
    context_nonpaddings = {}
    context_pad_removers = {}

    attention_dropout_broadcast_dims = (
        common_layers.comma_separated_string_to_integer_list(
            getattr(hparams, "attention_dropout_broadcast_dims", "")))

    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        input_padding = common_attention.attention_bias_to_padding(
            encoder_self_attention_bias)
        input_nonpadding = 1.0 - input_padding
        for context_name in context_self_attention_biases:
            context_paddings[
                context_name] = common_attention.attention_bias_to_padding(
                    context_self_attention_biases[context_name])
            context_nonpaddings[
                context_name] = 1.0 - context_paddings[context_name]

        input_pad_remover = None
        for context_name in context_paddings:
            context_pad_removers[context_name] = None
        if hparams.use_pad_remover and not common_layers.is_xla_compiled():
            input_pad_remover = expert_utils.PadRemover(input_padding)
            for context_name in context_paddings:
                context_pad_removers[context_name] = expert_utils.PadRemover(
                    context_paddings[context_name])

        temp_hparam = tf.contrib.training.HParams(
        )  # copy hparams except num_hidden_layers -> num_hidden_layers - 1
        for key, val in hparams.values().items():
            temp_hparam.add_hparam(key, val)
        temp_hparam.set_hparam("num_hidden_layers",
                               hparams.num_hidden_layers - 1)
        encoder_output = transformer_with_contexts_layers.transformer_encoder(
            input_x,
            encoder_self_attention_bias,
            temp_hparam,
            nonpadding=features_to_nonpadding(features, "inputs"),
            save_weights_to=save_weights_to,
            make_image_summary=make_image_summary)

        context_encoded_outputs = {}
        for context_name in context_xs:
            context_encoded_outputs[
                context_name] = transformer_with_contexts_layers.transformer_encoder(
                    context_xs[context_name],
                    context_self_attention_biases[context_name],
                    hparams,
                    nonpadding=features_to_nonpadding(features, context_name),
                    save_weights_to=save_weights_to,
                    make_image_summary=make_image_summary)

        with tf.variable_scope('word_abstraction', reuse=tf.AUTO_REUSE):
            encoder_word_level_query = common_layers.dense(
                encoder_output, hparams.hidden_size)  # q_w = f_w(h_t)
            encoder_word_level_abstraction = {}
            for context_name in context_encoded_outputs:
                encoder_word_level_abstraction[
                    context_name] = transformer_with_contexts_layers.multihead_attention(
                        common_layers.layer_preprocess(
                            encoder_word_level_query, hparams),
                        context_encoded_outputs[context_name],
                        context_self_attention_biases[context_name],
                        hparams.attention_key_channels or hparams.hidden_size,
                        hparams.attention_value_channels
                        or hparams.hidden_size,
                        hparams.hidden_size,
                        hparams.num_heads,
                        hparams.attention_dropout,
                        attention_type=hparams.self_attention_type,
                        save_weights_to=save_weights_to,
                        make_image_summary=make_image_summary,
                        max_relative_position=hparams.max_relative_position,
                        dropout_broadcast_dims=attention_dropout_broadcast_dims,
                        max_length=hparams.get("max_length"),
                        vars_3d=hparams.get("attention_variables_3d"))  # s^j,

            sentence_information = tf.concat([
                encoder_word_level_abstraction[context_name]
                for context_name in encoder_word_level_abstraction
            ],
                                             axis=1)

        with tf.variable_scope('sentence_abstraction', reuse=tf.AUTO_REUSE):
            encoder_sentence_level_query = common_layers.dense(
                encoder_output, hparams.hidden_size)  # q_s = f_s(h_t)
            context_padding = common_attention.embedding_to_padding(
                sentence_information)
            ignore_padding = common_attention.attention_bias_ignore_padding(
                context_padding)
            contextual_information = transformer_with_contexts_layers.multihead_attention(
                common_layers.layer_preprocess(encoder_sentence_level_query,
                                               hparams),
                sentence_information,
                ignore_padding,
                hparams.attention_key_channels or hparams.hidden_size,
                hparams.attention_value_channels or hparams.hidden_size,
                hparams.hidden_size,
                hparams.num_heads,
                hparams.attention_dropout,
                attention_type=hparams.self_attention_type,
                save_weights_to=save_weights_to,
                make_image_summary=make_image_summary,
                max_relative_position=hparams.max_relative_position,
                dropout_broadcast_dims=attention_dropout_broadcast_dims,
                max_length=hparams.get("max_length"),
                vars_3d=hparams.get("attention_variables_3d")
            )  # MultiHead(q_s, s^j), [batch, encoder_length, hidden_dim]

            contextual_information = common_layers.dense_relu_dense(
                contextual_information, hparams.filter_size,
                hparams.hidden_size)

        with tf.variable_scope('context_gating', reuse=tf.AUTO_REUSE):
            gate_lambda = tf.nn.sigmoid(
                common_layers.dense(contextual_information,
                                    hparams.hidden_size) +
                common_layers.dense(encoder_output, hparams.hidden_size))
            encoder_output = gate_lambda * encoder_output + (
                1 - gate_lambda) * contextual_information

    return common_layers.layer_preprocess(encoder_output, hparams)
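
The context gate at the end is a per-position sigmoid mixture: gate_lambda = sigmoid(dense(c) + dense(h)), output = gate_lambda * h + (1 - gate_lambda) * c, where h is the source encoding and c the contextual summary. A NumPy sketch of the gate, with random matrices standing in for the two dense layers:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

batch, length, hidden = 2, 7, 16
encoder_output = np.random.rand(batch, length, hidden)
contextual_information = np.random.rand(batch, length, hidden)

# Random projections stand in for the two common_layers.dense calls.
w_h = np.random.rand(hidden, hidden)
w_c = np.random.rand(hidden, hidden)

gate_lambda = sigmoid(contextual_information @ w_c + encoder_output @ w_h)
gated = gate_lambda * encoder_output + (1.0 - gate_lambda) * contextual_information
print(gated.shape)  # (2, 7, 16)
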
def hierarchical_context_encoder(encoder_input,
                                 encoder_self_attention_bias,
                                 contexts,
                                 context_self_attention_biases,
                                 features,
                                 hparams,
                                 name="discourse_aware_encoder",
                                 save_weights_to=None,
                                 make_image_summary=True,
                                 losses=None):
    input_x = encoder_input
    context_xs = {}
    for context_name in contexts:
        context_xs[context_name] = contexts[context_name]
    context_paddings = {}
    context_nonpaddings = {}
    context_pad_removers = {}

    attention_dropout_broadcast_dims = (
        common_layers.comma_separated_string_to_integer_list(
            getattr(hparams, "attention_dropout_broadcast_dims", "")))

    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        input_padding = common_attention.attention_bias_to_padding(
            encoder_self_attention_bias)
        input_nonpadding = 1.0 - input_padding
        for context_name in context_self_attention_biases:
            context_paddings[
                context_name] = common_attention.attention_bias_to_padding(
                    context_self_attention_biases[context_name])
            context_nonpaddings[
                context_name] = 1.0 - context_paddings[context_name]

        input_pad_remover = None
        for context_name in context_paddings:
            context_pad_removers[context_name] = None
        if hparams.use_pad_remover and not common_layers.is_xla_compiled():
            input_pad_remover = expert_utils.PadRemover(input_padding)
            for context_name in context_paddings:
                context_pad_removers[context_name] = expert_utils.PadRemover(
                    context_paddings[context_name])

        temp_hparam = tf.contrib.training.HParams(
        )  # copy hparams except num_hidden_layers -> num_hidden_layers - 1
        for key, val in hparams.values().items():
            temp_hparam.add_hparam(key, val)
        temp_hparam.set_hparam("num_hidden_layers",
                               hparams.num_hidden_layers - 1)
        encoder_output = transformer_with_contexts_layers.transformer_encoder(
            input_x,
            encoder_self_attention_bias,
            temp_hparam,
            nonpadding=features_to_nonpadding(features, "inputs"),
            save_weights_to=save_weights_to,
            make_image_summary=make_image_summary)

        context_encoded_outputs = {}
        for context_name in context_xs:
            context_encoded_outputs[
                context_name] = transformer_with_contexts_layers.transformer_encoder(
                    context_xs[context_name],
                    context_self_attention_biases[context_name],
                    temp_hparam,
                    nonpadding=features_to_nonpadding(features, context_name),
                    save_weights_to=save_weights_to,
                    make_image_summary=make_image_summary)

        with tf.variable_scope("hierarchical_context_encoder",
                               reuse=tf.AUTO_REUSE):
            for context_name in context_encoded_outputs:
                # self attention feed-forward
                _y = ffn_self_attention_layer(
                    context_encoded_outputs[context_name],
                    hparams.hidden_size,
                    hparams.hidden_size,
                    hparams.num_heads,
                    hparams.attention_dropout,
                    save_weights_to=save_weights_to,
                    name="attentive_sum")
                # mean over sequence length
                context_encoded_outputs[context_name] = tf.reduce_mean(
                    _y, axis=1, keep_dims=True)

            encoded_contexts = [
                context_encoded_outputs[context_name]
                for context_name in context_encoded_outputs
            ]
            encoded_contexts = tf.concat(encoded_contexts, axis=1)

            temp_hparam = tf.contrib.training.HParams(
            )  # copy hparams except num_hidden_layers -> 1
            for key, val in hparams.values().items():
                temp_hparam.add_hparam(key, val)
            temp_hparam.set_hparam("num_hidden_layers", 1)
            context_padding = common_attention.embedding_to_padding(
                encoded_contexts)
            ignore_padding = common_attention.attention_bias_ignore_padding(
                context_padding)

            encoded_contexts = transformer_encoder(encoded_contexts,
                                                   ignore_padding, temp_hparam)

        with tf.variable_scope("encoder/layer_%d" % hparams.num_hidden_layers,
                               reuse=tf.AUTO_REUSE):
            with tf.variable_scope("context_input_attention"):
                context_padding = common_attention.embedding_to_padding(
                    encoded_contexts)
                ignore_padding = common_attention.attention_bias_ignore_padding(
                    context_padding)
                _y = common_attention.multihead_attention(
                    common_layers.layer_preprocess(encoder_output, hparams),
                    encoded_contexts,
                    ignore_padding,
                    hparams.attention_key_channels or hparams.hidden_size,
                    hparams.attention_value_channels or hparams.hidden_size,
                    hparams.hidden_size,
                    hparams.num_heads,
                    hparams.attention_dropout,
                    attention_type=hparams.self_attention_type,
                    save_weights_to=save_weights_to,
                    make_image_summary=make_image_summary,
                    max_relative_position=hparams.max_relative_position,
                    dropout_broadcast_dims=attention_dropout_broadcast_dims,
                    max_length=hparams.get("max_length"),
                    vars_3d=hparams.get("attention_variables_3d"))
                encoded_contexts = common_layers.layer_postprocess(
                    encoder_output, _y, hparams)

            with tf.variable_scope("input_self_attention"):
                _y = common_attention.multihead_attention(
                    common_layers.layer_preprocess(encoder_output, hparams),
                    None,
                    encoder_self_attention_bias,
                    hparams.attention_key_channels or hparams.hidden_size,
                    hparams.attention_value_channels or hparams.hidden_size,
                    hparams.hidden_size,
                    hparams.num_heads,
                    hparams.attention_dropout,
                    attention_type=hparams.self_attention_type,
                    save_weights_to=save_weights_to,
                    max_relative_position=hparams.max_relative_position,
                    make_image_summary=make_image_summary,
                    dropout_broadcast_dims=attention_dropout_broadcast_dims,
                    max_length=hparams.get("max_length"),
                    vars_3d=hparams.get("attention_variables_3d"))
                encoder_output = common_layers.layer_postprocess(
                    encoder_output, _y, hparams)

            with tf.variable_scope("gated_sum"):
                _depth = common_layers.shape_list(encoder_output)[-1]
                gate = tf.layers.dense(tf.concat(
                    [encoded_contexts, encoder_output], axis=-1),
                                       _depth,
                                       activation=tf.nn.sigmoid)
                if save_weights_to:
                    save_weights_to["gated_sum"] = gate
                encoder_output = gate * encoder_output + (
                    1. - gate) * encoded_contexts

            with tf.variable_scope("ffn"):
                _y = transformer_ffn_layer(common_layers.layer_preprocess(
                    encoder_output, hparams),
                                           hparams,
                                           input_pad_remover,
                                           conv_padding="SAME",
                                           nonpadding_mask=input_nonpadding,
                                           losses=losses)
                encoder_output = common_layers.layer_postprocess(
                    encoder_output, _y, hparams)

    return common_layers.layer_preprocess(encoder_output, hparams)
def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        nonpadding=None,
                        save_weights_to=None,
                        make_image_summary=True,
                        losses=None,
                        attn_bias_for_padding=None):
    """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
       (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias.  The knowledge about padding is used
      for pad_remover(efficiency) and to mask out padding in convolutional
      layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: optional list onto which to append extra training losses
    attn_bias_for_padding: Padded attention bias in case a unidirectional
      encoder is being used where future attention is masked.

  Returns:
    y: a Tensor
  """
    x = encoder_input
    attention_dropout_broadcast_dims = (
        common_layers.comma_separated_string_to_integer_list(
            getattr(hparams, "attention_dropout_broadcast_dims", "")))
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
                                 value=hparams.num_encoder_layers
                                 or hparams.num_hidden_layers)
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
                                 value=hparams.attention_dropout)
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DENSE,
                                 value={
                                     "use_bias": "false",
                                     "num_heads": hparams.num_heads,
                                     "hidden_size": hparams.hidden_size
                                 })

    with tf.variable_scope(name):
        if nonpadding is not None:
            padding = 1.0 - nonpadding
        else:
            attention_bias = encoder_self_attention_bias
            if attn_bias_for_padding is not None:
                attention_bias = attn_bias_for_padding
            padding = common_attention.attention_bias_to_padding(
                attention_bias)
            nonpadding = 1.0 - padding
        pad_remover = None
        if hparams.use_pad_remover and not common_layers.is_xla_compiled():
            pad_remover = expert_utils.PadRemover(padding)
        for layer in range(hparams.num_encoder_layers
                           or hparams.num_hidden_layers):
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    if layer < hparams.get("num_area_layers", 0):
                        max_area_width = hparams.get("max_area_width", 1)
                        max_area_height = hparams.get("max_area_height", 1)
                        memory_height = hparams.get("memory_height", 1)
                    else:
                        max_area_width = 1
                        max_area_height = 1
                        memory_height = 1
                    y = common_attention.multihead_attention(
                        common_layers.layer_preprocess(x, hparams),
                        None,
                        encoder_self_attention_bias,
                        hparams.attention_key_channels or hparams.hidden_size,
                        hparams.attention_value_channels
                        or hparams.hidden_size,
                        hparams.hidden_size,
                        hparams.num_heads,
                        hparams.attention_dropout,
                        attention_type=hparams.self_attention_type,
                        max_relative_position=hparams.max_relative_position,
                        heads_share_relative_embedding=(
                            hparams.heads_share_relative_embedding),
                        add_relative_to_values=hparams.add_relative_to_values,
                        save_weights_to=save_weights_to,
                        make_image_summary=make_image_summary,
                        dropout_broadcast_dims=attention_dropout_broadcast_dims,
                        max_length=hparams.get("max_length"),
                        vars_3d=hparams.get("attention_variables_3d"),
                        activation_dtype=hparams.get("activation_dtype",
                                                     "float32"),
                        weight_dtype=hparams.get("weight_dtype", "float32"),
                        hard_attention_k=hparams.get("hard_attention_k", 0),
                        gumbel_noise_weight=hparams.get(
                            "gumbel_noise_weight", 0.0),
                        max_area_width=max_area_width,
                        max_area_height=max_area_height,
                        memory_height=memory_height,
                        area_key_mode=hparams.get("area_key_mode", "none"),
                        area_value_mode=hparams.get("area_value_mode", "none"),
                        training=(hparams.get("mode",
                                              tf.estimator.ModeKeys.TRAIN) ==
                                  tf.estimator.ModeKeys.TRAIN))
                    x = common_layers.layer_postprocess(x, y, hparams)
                with tf.variable_scope("ffn"):
                    y = transformer_ffn_layer(common_layers.layer_preprocess(
                        x, hparams),
                                              hparams,
                                              pad_remover,
                                              conv_padding="SAME",
                                              nonpadding_mask=nonpadding,
                                              losses=losses)
                    x = common_layers.layer_postprocess(x, y, hparams)
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        mlperf_log.transformer_print(
            key=mlperf_log.MODEL_HP_NORM,
            value={"hidden_size": hparams.hidden_size})
        return common_layers.layer_preprocess(x, hparams)
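
When nonpadding is not passed in, the encoder recovers it from the additive attention bias: padded positions carry a large negative bias, so padding is simply "bias is very negative" and nonpadding is its complement. A NumPy sketch in the spirit of common_attention.attention_bias_to_padding (the real function operates on the full 4-D bias tensor):

import numpy as np

NEG_INF = -1e9  # additive bias assigned to padded positions
bias = np.array([[0.0, 0.0, NEG_INF, NEG_INF],
                 [0.0, NEG_INF, NEG_INF, NEG_INF]])

padding = (bias < -1).astype(np.float32)  # 1.0 where the position is padding
nonpadding = 1.0 - padding                # 1.0 where the position is a real token
print(nonpadding)
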
Example 17
def universal_transformer_encoder(encoder_input,
                                  encoder_self_attention_bias,
                                  hparams,
                                  name="encoder",
                                  nonpadding=None,
                                  save_weights_to=None,
                                  make_image_summary=True):
    """Universal Transformer encoder function.

  Prepares all the arguments and the inputs and passes them to
  universal_transformer_layer to encode the encoder_input.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
       (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias.  The knowledge about padding is used
      for pad_remover(efficiency) and to mask out padding in convolutional
      layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.

  Returns:
    y: a Tensor as the output of the encoder
    extra_output: which can be used to pass extra information to the body
  """

    x = encoder_input
    attention_dropout_broadcast_dims = (
        common_layers.comma_separated_string_to_integer_list(
            getattr(hparams, "attention_dropout_broadcast_dims", "")))
    with tf.variable_scope(name):
        if nonpadding is not None:
            padding = 1.0 - nonpadding
        else:
            padding = common_attention.attention_bias_to_padding(
                encoder_self_attention_bias)
            nonpadding = 1.0 - padding
        pad_remover = None
        if hparams.use_pad_remover and not common_layers.is_xla_compiled():
            pad_remover = expert_utils.PadRemover(padding)

        ffn_unit = functools.partial(
            universal_transformer_util.transformer_encoder_ffn_unit,
            hparams=hparams,
            nonpadding_mask=nonpadding,
            pad_remover=pad_remover)

        attention_unit = functools.partial(
            universal_transformer_util.transformer_encoder_attention_unit,
            hparams=hparams,
            encoder_self_attention_bias=encoder_self_attention_bias,
            attention_dropout_broadcast_dims=attention_dropout_broadcast_dims,
            save_weights_to=save_weights_to,
            make_image_summary=make_image_summary)

        x, extra_output = universal_transformer_layer(x,
                                                      hparams,
                                                      ffn_unit,
                                                      attention_unit,
                                                      pad_remover=pad_remover)

        if hparams.get("use_memory_as_last_state", False):
            x = extra_output  # which is memory
        return common_layers.layer_preprocess(x, hparams), extra_output
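
As the docstring notes, nonpadding can be inferred from encoder_self_attention_bias. A hedged sketch of how that bias is typically constructed before calling an encoder like the one above; embedded_inputs is an illustrative name for an already-embedded [batch, length, hidden] tensor, and the common_attention helpers are the standard tensor2tensor ones:

import tensorflow as tf
from tensor2tensor.layers import common_attention

# Illustrative, already-embedded inputs of shape [batch, length, hidden].
embedded_inputs = tf.random_normal([8, 20, 512])
# 1.0 at positions whose embedding is all zeros (i.e. padding).
padding = common_attention.embedding_to_padding(embedded_inputs)
self_attention_bias = common_attention.attention_bias_ignore_padding(padding)
# self_attention_bias has shape [8, 1, 1, 20]; the encoder can infer
# nonpadding from it, or nonpadding = 1.0 - padding can be passed explicitly.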
def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        nonpadding=None,
                        save_weights_to=None,
                        make_image_summary=True):
    """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
       (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias.  The knowledge about padding is used
      for pad_remover (efficiency) and to mask out padding in convolutional
      layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.

  Returns:
    y: a Tensor
  """
    x = encoder_input
    attention_dropout_broadcast_dims = (
        common_layers.comma_separated_string_to_integer_list(
            getattr(hparams, "attention_dropout_broadcast_dims", "")))
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
                                 value=hparams.num_encoder_layers
                                 or hparams.num_hidden_layers)
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
                                 value=hparams.attention_dropout)
    mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_ATTENTION_DENSE,
                                 value={
                                     "use_bias": "false",
                                     "num_heads": hparams.num_heads,
                                     "hidden_size": hparams.hidden_size
                                 })

    with tf.variable_scope(name):
        if nonpadding is not None:
            padding = 1.0 - nonpadding
        else:
            padding = common_attention.attention_bias_to_padding(
                encoder_self_attention_bias)
            nonpadding = 1.0 - padding
        pad_remover = None
        if hparams.use_pad_remover and not common_layers.is_xla_compiled():
            pad_remover = expert_utils.PadRemover(padding)
        for layer in range(hparams.num_encoder_layers
                           or hparams.num_hidden_layers):

            # initial_sparsity is only passed to the attention layer when
            # pruning masks are loaded from a checkpoint (hparams.load_masks_from).
            initial_sparsity = None
            if hparams.get("load_masks_from"):
                initial_sparsity = hparams.get("initial_sparsity")

            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    y = sparse_attention.multihead_attention(
                        common_layers.layer_preprocess(x, hparams),
                        None,
                        encoder_self_attention_bias,
                        hparams.attention_key_channels or hparams.hidden_size,
                        hparams.attention_value_channels
                        or hparams.hidden_size,
                        hparams.hidden_size,
                        hparams.num_heads,
                        hparams.attention_dropout,
                        attention_type=hparams.self_attention_type,
                        max_relative_position=hparams.max_relative_position,
                        heads_share_relative_embedding=(
                            hparams.heads_share_relative_embedding),
                        add_relative_to_values=hparams.add_relative_to_values,
                        save_weights_to=save_weights_to,
                        make_image_summary=make_image_summary,
                        dropout_broadcast_dims=attention_dropout_broadcast_dims,
                        max_length=hparams.get("max_length"),
                        vars_3d=hparams.get("attention_variables_3d"),
                        sparsity_technique=hparams.get("sparsity_technique"),
                        threshold=hparams.get("log_alpha_threshold"),
                        training=hparams.get(
                            "mode") == tf_estimator.ModeKeys.TRAIN,
                        clip_alpha=hparams.get("clip_log_alpha"),
                        initial_sparsity=initial_sparsity,
                        split_heads=hparams.get("split_heads"))
                    x = common_layers.layer_postprocess(x, y, hparams)
                with tf.variable_scope("ffn"):
                    y = transformer_ffn_layer(
                        common_layers.layer_preprocess(x, hparams), hparams,
                        pad_remover)
                    x = common_layers.layer_postprocess(x, y, hparams)
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        mlperf_log.transformer_print(
            key=mlperf_log.MODEL_HP_NORM,
            value={"hidden_size": hparams.hidden_size})
        return common_layers.layer_preprocess(x, hparams)
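
This variant routes self-attention through sparse_attention.multihead_attention, which adds weight-sparsification hparams (sparsity_technique, log_alpha_threshold, initial_sparsity, split_heads). As rough intuition only, and not the sparse_attention library's implementation, magnitude-based pruning zeroes the smallest-magnitude weights:

import numpy as np

def magnitude_prune(weights, sparsity):
    # Zero out roughly the smallest-magnitude `sparsity` fraction of entries.
    k = int(round(sparsity * weights.size))
    if k == 0:
        return weights
    threshold = np.sort(np.abs(weights), axis=None)[k - 1]
    return np.where(np.abs(weights) <= threshold, 0.0, weights)

w = np.random.randn(4, 4)
w_pruned = magnitude_prune(w, sparsity=0.5)   # about half the entries become zero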
Example 19
def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        nonpadding=None,
                        save_weights_to=None,
                        make_image_summary=True,
                        losses=None):
  """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
       (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias.  The knowledge about padding is used
      for pad_remover (efficiency) and to mask out padding in convolutional
      layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: optional list onto which to append extra training losses

  Returns:
    y: a Tensor
  """
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
      value=hparams.num_encoder_layers or hparams.num_hidden_layers)
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
      value=hparams.attention_dropout)
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_ATTENTION_DENSE,
      value={
          "use_bias": "false",
          "num_heads": hparams.num_heads,
          "hidden_size": hparams.hidden_size
      })

  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      padding = common_attention.attention_bias_to_padding(
          encoder_self_attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    if hparams.use_pad_remover and not common_layers.is_xla_compiled():
      pad_remover = expert_utils.PadRemover(padding)
    for layer in range(hparams.num_encoder_layers or hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = common_attention.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              encoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position,
              heads_share_relative_embedding=(
                  hparams.heads_share_relative_embedding),
              add_relative_to_values=hparams.add_relative_to_values,
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_length"),
              vars_3d=hparams.get("attention_variables_3d"))
          x = common_layers.layer_postprocess(x, y, hparams)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams),
              hparams,
              pad_remover,
              conv_padding="SAME",
              nonpadding_mask=nonpadding,
              losses=losses)
          x = common_layers.layer_postprocess(x, y, hparams)
    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NORM,
        value={"hidden_size": hparams.hidden_size})
    return common_layers.layer_preprocess(x, hparams)
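
A hedged end-to-end usage sketch for the transformer_encoder above, assuming tensor2tensor is installed and that transformer_base() supplies the attention and ffn hparams this function reads; the tensor names are illustrative rather than taken from the original code:

import tensorflow as tf
from tensor2tensor.layers import common_attention
from tensor2tensor.models import transformer

hparams = transformer.transformer_base()
# Illustrative, already-embedded inputs of shape [batch, length, hidden_size].
embedded_inputs = tf.random_normal([8, 20, hparams.hidden_size])
encoder_self_attention_bias = common_attention.attention_bias_ignore_padding(
    common_attention.embedding_to_padding(embedded_inputs))
encoder_output = transformer_encoder(
    embedded_inputs, encoder_self_attention_bias, hparams, name="encoder")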