# Imports assumed from the tensor2tensor codebase this snippet comes from (TF1-style API).
import tensorflow as tf

from tensor2tensor.layers import common_attention
from tensor2tensor.layers import common_layers


def ffn_layer(x, hparams):
  """Feed-forward (FFN) sublayer of the Transformer; the variant is selected by hparams.ffn_layer."""
  with tf.variable_scope("ffn"):
    if hparams.ffn_layer == "none":
      return x
    if hparams.ffn_layer == "conv_hidden_relu":
      y = common_layers.dense_relu_dense(
          x,
          hparams.filter_size,
          hparams.hidden_size,
          dropout=hparams.relu_dropout)
    elif hparams.ffn_layer == "normed_conv_hidden_relu":
      y = common_layers.normed_conv_hidden_relu(
          x,
          hparams.norm_type,
          hparams.layer_norm_epsilon,
          hparams.filter_size,
          hparams.hidden_size,
          dropout=hparams.relu_dropout,
          norm_name="convnorm")
    elif hparams.ffn_layer == "self_attention_ffn":
      x_shape = tf.shape(x)
      x = tf.reshape(x, [x_shape[0], -1, hparams.hidden_size])
      y = common_attention.ffn_self_attention_layer(
          x, hparams.filter_size, hparams.hidden_size, hparams.num_parts,
          hparams.attention_dropout, hparams.share_kv)
      y = tf.reshape(y, x_shape)
    else:
      assert hparams.ffn_layer == "glu_ffn"
      y = common_layers.gated_linear_unit_layer(x)
    return y
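

# Minimal usage sketch for the function above (not from the original source): it assumes a
# TF1-style HParams object carrying only the fields the "conv_hidden_relu" branch reads;
# shapes and values are illustrative.
usage_hparams = tf.contrib.training.HParams(
    ffn_layer="conv_hidden_relu",
    filter_size=2048,
    hidden_size=512,
    relu_dropout=0.1)

inputs = tf.zeros([8, 32, 512])             # [batch, length, hidden_size]
outputs = ffn_layer(inputs, usage_hparams)  # same shape as inputs
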
# Variant with an optional `losses` list and an extra "local_moe_tpu" branch. Besides the
# imports above, it relies on tensor2tensor's expert_utils and on a maybe_reshape_4d_to_3d
# helper defined alongside this function (assumed to flatten a 4-D tensor to 3-D and return
# the original shape plus a flag).
from tensor2tensor.utils import expert_utils


def ffn_layer(x, hparams, losses=None):
  """Feed-forward (FFN) sublayer of the Transformer, with optional auxiliary-loss collection."""
  with tf.variable_scope("ffn"):
    if hparams.ffn_layer == "none":
      return x
    if hparams.ffn_layer == "conv_hidden_relu":
      y = common_layers.dense_relu_dense(
          x,
          hparams.filter_size,
          hparams.hidden_size,
          dropout=hparams.relu_dropout)
    elif hparams.ffn_layer == "normed_conv_hidden_relu":
      y = common_layers.normed_conv_hidden_relu(
          x,
          hparams.norm_type,
          hparams.layer_norm_epsilon,
          hparams.filter_size,
          hparams.hidden_size,
          dropout=hparams.relu_dropout,
          norm_name="convnorm")
    elif hparams.ffn_layer == "self_attention_ffn":
      x_shape = tf.shape(x)
      x = tf.reshape(x, [x_shape[0], -1, hparams.hidden_size])
      y = common_attention.ffn_self_attention_layer(
          x, hparams.filter_size, hparams.hidden_size, hparams.num_parts,
          hparams.attention_dropout, hparams.share_kv)
      y = tf.reshape(y, x_shape)
    elif hparams.ffn_layer == "local_moe_tpu":
      overhead = (hparams.moe_overhead_train
                  if hparams.mode == tf.estimator.ModeKeys.TRAIN
                  else hparams.moe_overhead_eval)
      x, x_shape, is_4d = maybe_reshape_4d_to_3d(x)
      y, loss = expert_utils.local_moe_tpu(
          x, hparams.filter_size // 2,
          hparams.hidden_size,
          hparams.moe_num_experts, overhead=overhead,
          loss_coef=hparams.moe_loss_coef)
      if is_4d:
        y = tf.reshape(y, x_shape)
      if losses is None:
        raise ValueError(
            "transformer_ffn_layer with type local_moe_tpu must pass in "
            "a losses list")
      losses.append(loss)
    else:
      assert hparams.ffn_layer == "glu_ffn"
      y = common_layers.gated_linear_unit_layer(x)
    return y
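

# Sketch of how the `losses` list is threaded through the "local_moe_tpu" branch (again
# illustrative, not from the original source); it assumes expert_utils and the
# maybe_reshape_4d_to_3d helper are available as noted above, and uses made-up hparams values.
moe_hparams = tf.contrib.training.HParams(
    ffn_layer="local_moe_tpu",
    filter_size=2048,
    hidden_size=512,
    moe_num_experts=16,
    moe_overhead_train=1.0,
    moe_overhead_eval=2.0,
    moe_loss_coef=1e-2,
    mode=tf.estimator.ModeKeys.TRAIN)

extra_losses = []
moe_inputs = tf.zeros([8, 32, 512])  # already 3-D, so no 4-D reshape is involved
moe_outputs = ffn_layer(moe_inputs, moe_hparams, losses=extra_losses)
# extra_losses now holds the expert load-balancing loss; the caller is expected to add it
# to the training objective.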