Example #1
def cycle_gan_internal(inputs, targets, _, hparams):
  """Cycle GAN, main step used for training."""
  with tf.variable_scope("cycle_gan"):
    # Embed inputs and targets.
    inputs_orig, targets_orig = tf.to_int32(inputs), tf.to_int32(targets)
    inputs = common_layers.embedding(
        inputs_orig, hparams.vocab_size, hparams.hidden_size, "embed")
    targets = common_layers.embedding(
        targets_orig, hparams.vocab_size, hparams.hidden_size,
        "embed", reuse=True)

    # Split the batch into input-input and target-target parts.
    inputs1, _ = split_on_batch(inputs)
    _, targets2 = split_on_batch(targets)

    # Define F and G, called inp2tgt and tgt2inp here.
    def inp2tgt(x, reuse=False):
      return transformer_vae.residual_conv(x, 1, hparams, "inp2tgt", reuse)
    def tgt2inp(x, reuse=False):
      return transformer_vae.residual_conv(x, 1, hparams, "tgt2inp", reuse)

    # Input-input part.
    inp1_tgt = inp2tgt(inputs1)
    inp1_back = tgt2inp(inp1_tgt)

    # Target-target part.
    tgt2_inp = tgt2inp(targets2, reuse=True)
    tgt2_back = inp2tgt(tgt2_inp, reuse=True)

    # Reconstruction losses.
    inp1_orig, _ = split_on_batch(inputs_orig)
    _, tgt2_orig = split_on_batch(targets_orig)
    inp1_loss = reconstruct_loss(
        inp1_back, tf.squeeze(inp1_orig, axis=3), hparams)
    tgt2_loss = reconstruct_loss(
        tgt2_back, tf.squeeze(tgt2_orig, axis=3), hparams, reuse=True)

    # Discriminator losses.
    dloss1 = discriminate_loss(inputs1, tgt2_inp, True, hparams, "inp_disc")
    dloss2 = discriminate_loss(targets2, inp1_tgt, True, hparams, "tgt_disc")

    # Reconstruct targets from inputs.
    tgt = inp2tgt(inputs, reuse=True)
    tgt = tf.layers.dense(tgt, hparams.vocab_size, name="softmax", reuse=True)

    # We use the reconstruction only for tracking progress, no gradients here!
    tgt = tf.stop_gradient(tf.expand_dims(tgt, axis=2))

    losses = {"input_input": hparams.cycle_loss_multiplier * inp1_loss,
              "target_target": hparams.cycle_loss_multiplier * tgt2_loss,
              "input_disc": dloss1,
              "target_disc": dloss2}
    return tgt, losses
Example #2
 def bottom(self, x):
   with tf.variable_scope(self.name):
     return common_layers.embedding(
         x,
         self._vocab_size,
         self._body_input_depth,
         multiplier=self._body_input_depth**0.5 if
         self._model_hparams.multiply_embedding_mode == "sqrt_depth" else 1.0)
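The "sqrt_depth" multiplier above rescales each looked-up vector by sqrt(depth). A minimal NumPy sketch of the effect, using toy sizes rather than anything from tensor2tensor: embedding rows are initialized with stddev depth**-0.5, so the rescaling lifts a row's norm from about 1 to about sqrt(depth), the same order of magnitude as the sinusoidal timing signal added later.

import numpy as np

depth = 64
# Embedding rows initialized with stddev = depth ** -0.5, as in
# common_layers.embedding, have L2 norm close to 1.
row = np.random.normal(0.0, depth ** -0.5, size=depth)

# multiply_embedding_mode == "sqrt_depth" multiplies by sqrt(depth), lifting
# the norm to roughly sqrt(depth), comparable to the O(1)-per-entry timing signal.
scaled = row * depth ** 0.5
print(np.linalg.norm(row), np.linalg.norm(scaled))  # roughly 1.0 vs roughly 8.0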
Example #3
def cycle_gan_internal(inputs, targets, _, hparams):
  """Cycle GAN, main step used for training."""
  with tf.variable_scope("cycle_gan"):
    # Embed inputs and targets.
    inputs_orig, targets_orig = tf.to_int32(inputs), tf.to_int32(targets)
    inputs = common_layers.embedding(
        inputs_orig, hparams.vocab_size, hparams.hidden_size, "embed")
    targets = common_layers.embedding(
        targets_orig, hparams.vocab_size, hparams.hidden_size,
        "embed", reuse=True)

    x, _ = split_on_batch(inputs)
    _, y = split_on_batch(targets)

    # Y --> X
    y_fake = generator(y, hparams, "Fy", reuse=False)
    y_to_x_loss = lossfn(y, y_fake, True, hparams, True, "YtoX")

    # X --> Y
    x_fake = generator(x, hparams, "Gx", reuse=False)
    x_to_y_loss = lossfn(y, x_fake, True, hparams, True, "XtoY")

    # Cycle-Consistency
    y_fake_ = generator(y_fake, hparams, "Gx", reuse=True)
    x_fake_ = generator(x_fake, hparams, "Fy", reuse=True)
    x_to_x_loss = hparams.cycle_loss_multiplier1 * tf.reduce_mean(
        tf.abs(x_fake_ - x))
    y_to_y_loss = hparams.cycle_loss_multiplier2 * tf.reduce_mean(
        tf.abs(y_fake_ - y))
    cycloss = x_to_x_loss + y_to_y_loss

    sample_generated = generator(inputs, hparams, "Gx", reuse=True)
    sample_generated = tf.layers.dense(
        sample_generated, hparams.vocab_size, name="softmax", reuse=None)
    sample_generated = tf.stop_gradient(
        tf.expand_dims(sample_generated, axis=2))

    losses = {"cycloss": cycloss,
              "y_to_x_loss": y_to_x_loss,
              "x_to_y_loss": x_to_y_loss}

    return sample_generated, losses
Example #4
def transformer_prepare_encoder(inputs, target_space, hparams, features=None):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
  ishape_static = inputs.shape.as_list()
  encoder_input = inputs
  if features and "inputs_segmentation" in features:
    # Packed dataset.  Keep the examples from seeing each other.
    inputs_segmentation = features["inputs_segmentation"]
    inputs_position = features["inputs_position"]
    targets_segmentation = features["targets_segmentation"]
    encoder_self_attention_bias = common_attention.attention_bias_same_segment(
        inputs_segmentation, inputs_segmentation)
    encoder_decoder_attention_bias = (
        common_attention.attention_bias_same_segment(
            targets_segmentation, inputs_segmentation))
  else:
    # Usual case - not a packed dataset.
    encoder_padding = common_attention.embedding_to_padding(encoder_input)
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    encoder_self_attention_bias = ignore_padding
    encoder_decoder_attention_bias = ignore_padding
    inputs_position = None
  if hparams.proximity_bias:
    encoder_self_attention_bias += common_attention.attention_bias_proximal(
        common_layers.shape_list(inputs)[1])
  # Append target_space_id embedding to inputs.
  emb_target_space = common_layers.embedding(
      target_space, 32, ishape_static[-1], name="target_space_embedding")
  emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
  encoder_input += emb_target_space
  if hparams.pos == "timing":
    if inputs_position is not None:
      encoder_input = common_attention.add_timing_signal_1d_given_position(
          encoder_input, inputs_position)
    else:
      encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
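In the packed-dataset branch above, attention_bias_same_segment builds a bias that keeps packed examples from attending to each other. An illustrative NumPy re-implementation of the idea, with toy segment ids; this is not the tensor2tensor function itself.

import numpy as np

def same_segment_bias(query_seg, memory_seg, neg_inf=-1e9):
  # 0 where query and memory positions share a segment id, a large negative
  # number elsewhere; shaped [batch, 1, query_length, memory_length] so it
  # broadcasts over attention heads when added to the logits.
  same = query_seg[:, :, None] == memory_seg[:, None, :]
  return np.where(same, 0.0, neg_inf)[:, None, :, :]

seg = np.array([[1, 1, 2, 2]])  # two examples packed into one row
print(same_segment_bias(seg, seg)[0, 0])
# [[ 0.e+00  0.e+00 -1.e+09 -1.e+09]
#  [ 0.e+00  0.e+00 -1.e+09 -1.e+09]
#  [-1.e+09 -1.e+09  0.e+00  0.e+00]
#  [-1.e+09 -1.e+09  0.e+00  0.e+00]]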
Example #5
  def targets_bottom(self, inputs):
    with tf.variable_scope(self.name):
      # Reshape inputs to 2-d tensor and embed the RGB pixel values.
      ret = common_layers.embedding(
          tf.to_int32(common_layers.flatten4d3d(inputs)),
          self.top_dimensionality,
          self._body_input_depth,
          name="input_rgb_embedding")
      if self._model_hparams.multiply_embedding_mode == "sqrt_depth":
        ret *= self._body_input_depth**0.5

      reshape_shape = common_layers.shape_list(inputs)[:3]
      reshape_shape.append(self._body_input_depth * 3)
      ret = tf.reshape(ret, reshape_shape)
      return tf.layers.dense(ret, self._body_input_depth)
Example #6
    def targets_bottom(self, inputs):
        with tf.variable_scope(self.name):
            # Reshape inputs to 2-d tensor and embed the RGB pixel values.
            ret = common_layers.embedding(tf.to_int32(
                common_layers.flatten4d3d(inputs)),
                                          self.top_dimensionality,
                                          self._body_input_depth,
                                          name="input_rgb_embedding")
            if self._model_hparams.multiply_embedding_mode == "sqrt_depth":
                ret *= self._body_input_depth**0.5

            reshape_shape = [
                common_layers.shape_dim(inputs, i) for i in range(3)
            ]
            reshape_shape.append(self._body_input_depth * 3)
            ret = tf.reshape(ret, reshape_shape)
            return tf.layers.dense(ret, self._body_input_depth)
Example #7
def set_embedding(x, vocab_size, dense_size, **kwargs):
    """Each (ID, position) tuple gets a unique embedding.

  Args:
    x: An int Tensor with shape [batch_size, length] whose elements are in
      [0, vocab_size).
    vocab_size: Int. The range of valid ID values elements in x can take.
    dense_size: Int. The dimensionality of an embedding vector.

  Returns:
    A float Tensor with shape [batch_size, length, dense_size].
  """
    x = keep_first_dims(x, 2)
    seq_length = common_layers.shape_list(x)[1]
    x += tf.range(seq_length, dtype=x.dtype) * vocab_size
    new_vocab_size = vocab_size * seq_length
    return common_layers.embedding(x, new_vocab_size, dense_size, **kwargs)
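The core of set_embedding is the per-position offset, which maps every (ID, position) pair onto its own row of an enlarged vocab_size * seq_length table. A tiny NumPy sketch of just that index arithmetic, with made-up toy values:

import numpy as np

vocab_size, seq_length = 5, 3
x = np.array([[4, 0, 2]])  # [batch, length] token ids

offset_ids = x + np.arange(seq_length) * vocab_size
print(offset_ids)  # [[ 4  5 12]]
# Position 0 keeps ids 0..4, position 1 maps into 5..9, position 2 into 10..14,
# so no two (id, position) pairs collide in the enlarged embedding table.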
Example #8
    def body(self, features):
        observations = features["inputs_raw"]
        # Axis 0    - Batch.
        # Axis 1    - Input Frames, 4 frames.
        # Axis 2, 3 - Height & Width.
        # Axis 4    - Channels RGB, 3 colours.
        x = tf.transpose(observations, [0, 2, 3, 1, 4])
        x_shape = common_layers.shape_list(x)
        x = tf.reshape(x, x_shape[:-2] + [-1])
        dropout = getattr(self.hparams, "dropout_ppo", 0.0)
        with tf.variable_scope("feed_forward_cnn_small"):
            x = tf.cast(x, tf.float32) / 255.0
            x = tf.layers.conv2d(x,
                                 32, (5, 5),
                                 strides=(2, 2),
                                 activation=tf.nn.relu,
                                 padding="same")
            x = tf.layers.conv2d(x,
                                 32, (5, 5),
                                 strides=(2, 2),
                                 activation=tf.nn.relu,
                                 padding="same")

            flat_x = tf.layers.flatten(x)
            if self.use_epochs:
                epoch = features["epoch"] + tf.zeros([x_shape[0]],
                                                     dtype=tf.int32)
                # Randomly set epoch to 0 in some cases as that's the inference value.
                rand = tf.random.uniform([x_shape[0]])
                epoch = tf.where(rand < 0.1, tf.zeros_like(epoch), epoch)
                # Embed the epoch number.
                emb_epoch = common_layers.embedding(epoch, 32,
                                                    32)  # [batch, 32]
                flat_x = tf.concat([flat_x, emb_epoch], axis=1)
            flat_x = tf.layers.dropout(flat_x, rate=dropout)
            x = tf.layers.dense(flat_x, 128, activation=tf.nn.relu)

            logits = tf.layers.dense(x,
                                     self.hparams.problem.num_actions,
                                     name="dense2")
            logits = clip_logits(logits, self.hparams)
            logits = tf.expand_dims(logits, axis=1)
            value = tf.layers.dense(x, self.distributional_value_size)
        return {"target_policy": logits, "target_value": value}
Example #9
def transformer_error_tag_prediction_layer(x,
                                           hparams,
                                           features,
                                           loss_mask,
                                           layer_collection=None):
    """Layer that predicts the error tag."""
    with tf.variable_scope('error_tag_prediction'):
        x = maybe_flatten4d3d(x)
        vocab_size = hparams.problem.feature_info[
            'targets_error_tag'].vocab_size
        labels = features['targets_error_tag_raw']
        with tf.variable_scope('projection'):
            bottleneck = common_layers.dense(
                x,
                hparams.error_tag_embed_size,
                layer_collection=layer_collection,
                name='bottleneck',
            )
            logits = common_layers.dense(
                bottleneck,
                vocab_size,
                use_bias=False,
                layer_collection=layer_collection,
                name='logits',
            )
            xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=labels)
            loss = tf.reduce_sum(xent * loss_mask)
        with tf.variable_scope('embedding'):
            # embed_mat = get_error_tag_embedding_matrix()
            y = common_layers.layer_preprocess(
                common_layers.embedding(
                    labels,
                    vocab_size,
                    hparams.hidden_size,
                    embedding_var=None,
                ),
                hparams,
                layer_collection=layer_collection,
            )
            x = common_layers.layer_postprocess(x, y, hparams)
        return x, logits, loss
Example #10
def transformer_prepare_encoder(inputs, target_space, hparams):
    """Prepare one shard of the model for the encoder.
  
    Args:
      inputs: Tensor with shape [batch, memory_length, depth]
      target_space: a Tensor.
      hparams: run hyperparameters
  
    Returns:
      encoder_input: a Tensor, bottom of encoder stack
      encoder_self_attention_bias: a bias tensor for use in encoder self-attention
      encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
        attention
    """

    ignore_padding = get_ignore_padding(inputs)
    encoder_self_attention_bias = ignore_padding

    # Bias for self-attention to encourage attention to close positions.
    if hparams.proximity_bias:
        encoder_self_attention_bias += comm_attn.attention_bias_proximal(
            length=tf.shape(inputs)[1])

    # Append target_space_id embedding to inputs.
    emb_target_space = common_layers.embedding(
        x=target_space,
        vocab_size=32,
        dense_size=inputs.shape.as_list()[-1],
        name='target_space_embedding')
    emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])

    # Broadcast the target-space embedding over the batch and length axes.
    encoder_input = inputs + emb_target_space
    if hparams.pos == 'timing':
        encoder_input = comm_attn.add_timing_signal_1d(encoder_input)
    # Putting this here since always called immediately after...
    encoder_input = with_dropout(encoder_input, hparams)

    return EncoderState(input=encoder_input,
                        self_attn_bias=encoder_self_attention_bias,
                        decoder_attn_bias=ignore_padding,
                        output=None)
Example #11
    def body(self, features):
        filters = self.hparams.hidden_size
        cur_frame = features["inputs_0"]
        prev_frame = features["inputs_1"]
        if self.hparams.per_image_standardization:
            cur_frame = tf.map_fn(
                lambda frame: tf.image.per_image_standardization(frame),
                cur_frame)
            prev_frame = tf.map_fn(
                lambda frame: tf.image.per_image_standardization(frame),
                prev_frame)

        action = common_layers.embedding(tf.to_int64(features["action"]), 10,
                                         filters)
        action = tf.reshape(action, [-1, 1, 1, filters])

        frames = tf.concat([cur_frame, prev_frame], axis=3)
        h1 = tf.layers.conv2d(frames,
                              filters,
                              kernel_size=(3, 3),
                              padding="SAME")
        h2 = tf.layers.conv2d(tf.nn.relu(h1 + action),
                              filters,
                              kernel_size=(5, 5),
                              padding="SAME")
        res = tf.layers.conv2d(tf.nn.relu(h2 + action),
                               3 * 256,
                               kernel_size=(3, 3),
                               padding="SAME")
        reward_pred_h1 = tf.reduce_mean(res, axis=[1, 2])
        reward_pred = tf.layers.dense(reward_pred_h1, 2, name="reward")
        # reward_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        #   labels=tf.to_int32(features["reward"]), logits=reward_pred)
        # reward_loss = tf.reduce_mean(reward_loss)
        x = tf.layers.flatten(h2)
        # l = tf.shape(res)[1]
        # w = tf.shape(res)[2]
        l = 210
        w = 160
        res = tf.reshape(res, [-1, l, w, 768])
        return {"targets": res, "reward": x}
Example #12
 def body(self, features):
     filters = self.hparams.hidden_size
     cur_frame = tf.to_float(features["inputs"])
     prev_frame = tf.to_float(features["inputs_prev"])
     action_embedding_size = 32
     action_space_size = 10
     kernel = (3, 3)
     # Gather all inputs.
     action = common_layers.embedding(tf.to_int64(features["action"]),
                                      action_space_size,
                                      action_embedding_size)
     action = tf.reshape(action, [-1, 1, 1, action_embedding_size])
     frames = tf.concat([cur_frame, prev_frame, action], axis=3)
     x = tf.layers.conv2d(frames,
                          filters,
                          kernel,
                          activation=tf.nn.relu,
                          strides=(2, 2),
                          padding="SAME")
     # Run a stack of convolutions.
     for _ in range(self.num_hidden_layers):
         y = tf.layers.conv2d(x,
                              filters,
                              kernel,
                              activation=tf.nn.relu,
                              strides=(1, 1),
                              padding="SAME")
         x = common_layers.layer_norm(x + y)
     # Up-convolve.
     x = tf.layers.conv2d_transpose(x,
                                    filters,
                                    kernel,
                                    activation=tf.nn.relu,
                                    strides=(2, 2),
                                    padding="SAME")
     # Output size is 3 * 256 for 3-channel color space.
     res = tf.layers.conv2d(x, 3 * 256, kernel, padding="SAME")
     height = tf.shape(res)[1]
     width = tf.shape(res)[2]
     res = tf.reshape(res, [-1, height, width, 3, 256])
     return res
Example #13
def transformer_prepare_encoder(inputs, target_space, hparams):
    """Copied from tensor2tensor.models.transformer."""
    ishape_static = inputs.shape.as_list()
    encoder_input = inputs
    encoder_padding = common_attention.embedding_to_padding(encoder_input)
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    encoder_self_attention_bias = ignore_padding
    encoder_decoder_attention_bias = ignore_padding
    if hparams.proximity_bias:
        encoder_self_attention_bias += common_attention.attention_bias_proximal(
            tf.shape(inputs)[1])
    # Append target_space_id embedding to inputs.
    emb_target_space = common_layers.embedding(target_space,
                                               32,
                                               ishape_static[-1],
                                               name="target_space_embedding")
    emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
    encoder_input += emb_target_space
    if hparams.pos == "timing":
        encoder_input = common_attention.add_timing_signal_1d(encoder_input)
    return (encoder_input, encoder_self_attention_bias,
            encoder_decoder_attention_bias)
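The non-packed branch above derives its attention biases from the embeddings themselves: positions whose embedding is all zeros count as padding and are masked out. An illustrative NumPy version of that pipeline, approximating embedding_to_padding and attention_bias_ignore_padding rather than reproducing the t2t code:

import numpy as np

# [batch, length, depth]; the middle position is an all-zero (padded) embedding.
emb = np.array([[[0.3, -0.1], [0.0, 0.0], [0.2, 0.5]]])

padding = (np.abs(emb).sum(axis=-1) == 0.0).astype(np.float32)  # [batch, length]
bias = (padding * -1e9)[:, None, None, :]                       # [batch, 1, 1, length]
print(bias[0, 0, 0])  # [ 0.e+00 -1.e+09  0.e+00]; the padded position gets -1e9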
Example #14
def transformer_prepare_encoder(inputs, target_space, hparams):
    """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
    ishape_static = inputs.shape.as_list()
    encoder_input = inputs
    encoder_padding = common_attention.embedding_to_padding(encoder_input)
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    encoder_self_attention_bias = ignore_padding
    encoder_decoder_attention_bias = ignore_padding
    if hparams.proximity_bias:
        encoder_self_attention_bias += common_attention.attention_bias_proximal(
            common_layers.shape_list(inputs)[1])
    # Append target_space_id embedding to inputs.
    emb_target_space = common_layers.embedding(
        target_space,
        32,
        ishape_static[-1],
        name="target_space_embedding",
        use_eager_mode=hparams.use_eager_mode)
    emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
    encoder_input += emb_target_space
    if hparams.pos == "timing":
        encoder_input = common_attention.add_timing_signal_1d(encoder_input)
    return (encoder_input, encoder_self_attention_bias,
            encoder_decoder_attention_bias)
Example #15
def transformer_prepare_encoder(inputs, target_space, hparams):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
  ishape_static = inputs.shape.as_list()
  encoder_input = inputs
  encoder_padding = common_attention.embedding_to_padding(encoder_input)
  ignore_padding = common_attention.attention_bias_ignore_padding(
      encoder_padding)
  encoder_self_attention_bias = ignore_padding
  encoder_decoder_attention_bias = ignore_padding
  if hparams.proximity_bias:
    encoder_self_attention_bias += common_attention.attention_bias_proximal(
        tf.shape(inputs)[1])
  # Append target_space_id embedding to inputs.
  emb_target_space = common_layers.embedding(
      target_space, 32, ishape_static[-1], name="target_space_embedding")
  emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
  encoder_input += emb_target_space

  # random_uniform_mask = tf.expand_dims(tf.to_float(tf.to_int32(tf.random_uniform([tf.shape(encoder_input)[0], tf.shape(encoder_input)[1]]) < hparams.mask_noise_prob)), axis=2)
  # encoder_input = encoder_input * (1 - random_uniform_mask)

  if hparams.pos == "timing":
    encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
Example #16
def embed_target_space(target_space_id, hidden_size):
  target_space_emb = common_layers.embedding(
      target_space_id, 32, hidden_size, name="target_space_embedding")
  return tf.reshape(target_space_emb, [1, 1, 1, -1])
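The [1, 1, 1, -1] reshape above exists only for broadcasting: the same target-space vector gets added at every (batch, length) position of a 4-D encoder input. A short NumPy sketch under assumed toy shapes:

import numpy as np

batch, length, hidden = 2, 4, 8
encoder_input = np.zeros((batch, length, 1, hidden))

target_space_emb = np.random.randn(hidden).reshape(1, 1, 1, -1)
out = encoder_input + target_space_emb  # broadcasts over batch, length, channel
print(out.shape)  # (2, 4, 1, 8)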
Example #17
 def testEmbedding(self):
     x = np.random.randint(1, high=9, size=(3, 5))
     y = common_layers.embedding(x, 10, 16)
     self.evaluate(tf.global_variables_initializer())
     res = self.evaluate(y)
     self.assertEqual(res.shape, (3, 5, 16))
Example #18
 def testFlatten4D3D(self):
     x = np.random.randint(1, high=9, size=(3, 5, 2))
     y = common_layers.flatten4d3d(common_layers.embedding(x, 10, 7))
     self.evaluate(tf.global_variables_initializer())
     res = self.evaluate(y)
     self.assertEqual(res.shape, (3, 5 * 2, 7))
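The flatten4d3d call in this test merges the two middle axes of the 4-D embedding output, so the shape assertion follows from a plain reshape. A NumPy sketch of the same bookkeeping (not the t2t helper):

import numpy as np

x = np.zeros((3, 5, 2, 7))  # [batch, h, w, depth], as produced by the embedding
flat = x.reshape(x.shape[0], -1, x.shape[-1])  # [batch, h * w, depth]
print(flat.shape)  # (3, 10, 7), matching the (3, 5 * 2, 7) assertion above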
Example #19
def cycle_gan_internal(inputs, targets, _, hparams):
    """Cycle GAN, main step used for training."""
    with tf.variable_scope("cycle_gan"):
        # Embed inputs and targets.
        inputs_orig, targets_orig = tf.to_int32(inputs), tf.to_int32(targets)
        inputs = common_layers.embedding(inputs_orig, hparams.vocab_size,
                                         hparams.hidden_size, "embed")
        targets = common_layers.embedding(targets_orig,
                                          hparams.vocab_size,
                                          hparams.hidden_size,
                                          "embed",
                                          reuse=True)

        # Split the batch into input-input and target-target parts.
        inputs1, _ = split_on_batch(inputs)
        _, targets2 = split_on_batch(targets)

        # Define F and G, called inp2tgt and tgt2inp here.
        def inp2tgt(x, reuse=False):
            return transformer_vae.residual_conv(x, 1, hparams, "inp2tgt",
                                                 reuse)

        def tgt2inp(x, reuse=False):
            return transformer_vae.residual_conv(x, 1, hparams, "tgt2inp",
                                                 reuse)

        # Input-input part.
        inp1_tgt = inp2tgt(inputs1)
        inp1_back = tgt2inp(inp1_tgt)

        # Target-target part.
        tgt2_inp = tgt2inp(targets2, reuse=True)
        tgt2_back = inp2tgt(tgt2_inp, reuse=True)

        # Reconstruction losses.
        inp1_orig, _ = split_on_batch(inputs_orig)
        _, tgt2_orig = split_on_batch(targets_orig)
        inp1_loss = reconstruct_loss(inp1_back, tf.squeeze(inp1_orig, axis=3),
                                     hparams)
        tgt2_loss = reconstruct_loss(tgt2_back,
                                     tf.squeeze(tgt2_orig, axis=3),
                                     hparams,
                                     reuse=True)

        # Discriminator losses.
        dloss1 = discriminate_loss(inputs1, tgt2_inp, True, hparams,
                                   "inp_disc")
        dloss2 = discriminate_loss(targets2, inp1_tgt, True, hparams,
                                   "tgt_disc")

        # Reconstruct targets from inputs.
        tgt = inp2tgt(inputs, reuse=True)
        tgt = tf.layers.dense(tgt,
                              hparams.vocab_size,
                              name="softmax",
                              reuse=True)

        # We use the reconstruction only for tracking progress, no gradients here!
        tgt = tf.stop_gradient(tf.expand_dims(tgt, axis=2))

        losses = {
            "input_input": hparams.cycle_loss_multiplier * inp1_loss,
            "target_target": hparams.cycle_loss_multiplier * tgt2_loss,
            "input_disc": dloss1,
            "target_disc": dloss2
        }
        return tgt, losses
Example #20
def cycle_vae_gan_internal(inputs, targets, _, hparams):
    """Cycle GAN, main step used for training."""
    with tf.variable_scope("cycle_vae_gan"):
        # Embed inputs and targets.
        inputs_orig, targets_orig = tf.to_int32(inputs), tf.to_int32(targets)
        k = 2**hparams.num_compress_steps
        inputs_orig, targets_orig = common_layers.pad_to_same_length(
            inputs_orig, targets_orig, final_length_divisible_by=k)
        inputs = common_layers.embedding(inputs_orig, hparams.vocab_size,
                                         hparams.hidden_size, "embed")
        targets = common_layers.embedding(targets_orig,
                                          hparams.vocab_size,
                                          hparams.hidden_size,
                                          "embed",
                                          reuse=True)

        # Split the batch into input-input and target-target parts.
        inputs1, _ = split_on_batch(inputs)
        _, targets2 = split_on_batch(targets)

        # Input-input part.
        inp1_back, kl_loss1, inp1_mu, inp1_log_sigma = transformer_vae.vae_compress(
            inputs1, None, hparams, "inp2hyp", "hyp2inp")
        inp1_hyp = tf.concat([inp1_mu, inp1_log_sigma], axis=3)

        # Target-target part.
        tgt2_back, kl_loss2, tgt2_mu, tgt2_log_sigma = transformer_vae.vae_compress(
            targets2, None, hparams, "tgt2hyp", "hyp2tgt")
        tgt2_hyp = tf.concat([tgt2_mu, tgt2_log_sigma], axis=3)

        # Reconstruction losses.
        inp1_orig, _ = split_on_batch(inputs_orig)
        _, tgt2_orig = split_on_batch(targets_orig)
        inp1_loss = reconstruct_loss(inp1_back, tf.squeeze(inp1_orig, axis=3),
                                     hparams)
        tgt2_loss = reconstruct_loss(tgt2_back,
                                     tf.squeeze(tgt2_orig, axis=3),
                                     hparams,
                                     reuse=True)

        # Discriminator loss.
        dloss = discriminate_loss(inp1_hyp, tgt2_hyp, False, hparams, "dloss")

        # Reconstruct targets from inputs.
        tgt, _, _, _ = transformer_vae.vae_compress(inputs,
                                                    None,
                                                    hparams,
                                                    "inp2hyp",
                                                    "hyp2tgt",
                                                    reuse=True)
        tgt = tf.layers.dense(tgt,
                              hparams.vocab_size,
                              name="softmax",
                              reuse=True)
        # We use the reconstruction only for tracking progress, no gradients here!
        tgt = tf.stop_gradient(tf.expand_dims(tgt, axis=2))

        kl_rev_decay = common_layers.inverse_exp_decay(hparams.kl_warmup_steps)
        losses = {
            "input_input": hparams.cycle_loss_multiplier * inp1_loss,
            "target_target": hparams.cycle_loss_multiplier * tgt2_loss,
            "input_kl": kl_loss1 * kl_rev_decay * 15.0,
            "target_kl": kl_loss2 * kl_rev_decay * 15.0,
            "discriminator": dloss
        }
        return tgt, losses
Example #21
def cycle_gan_internal(inputs, targets, _, hparams):
  """Cycle GAN, main step used for training."""
  with tf.variable_scope("cycle_gan"):
    # Embed inputs and targets.
    inputs_orig, targets_orig = tf.to_int32(inputs), tf.to_int32(targets)  # [? ? 1 1]
    inputs = common_layers.embedding(
        inputs_orig, hparams.vocab_size, hparams.hidden_size, "embed")  # [? ? 1 384]
    targets = common_layers.embedding(
        targets_orig, hparams.vocab_size, hparams.hidden_size,
        "embed", reuse=True)


    x, _ = split_on_batch(inputs)
    _, y = split_on_batch(targets)

    whether_compress = True

    # Y --> X
    y_fake = generator(y, hparams, "Fy", reuse=False)  # [? ? 1 384]
    # Compare the generated X against real x samples; the stock t2t code passed
    # y here, which the author of this variant flagged as a bug.
    y_to_x_loss = lossfn(x, y_fake, whether_compress, hparams, True, "YtoX")

    # X --> Y
    x_fake = generator(x, hparams, "Gx", reuse=False)
    x_to_y_loss = lossfn(y, x_fake, whether_compress, hparams, True, "XtoY")

    # Cycle-Consistency
    y_fake_ = generator(y_fake, hparams, "Gx", reuse=True)
    x_fake_ = generator(x_fake, hparams, "Fy", reuse=True)
    x_to_x_loss = hparams.cycle_loss_multiplier1 * tf.reduce_mean(
        tf.abs(x_fake_ - x))
    y_to_y_loss = hparams.cycle_loss_multiplier2 * tf.reduce_mean(
        tf.abs(y_fake_ - y))
    cycloss = x_to_x_loss + y_to_y_loss

    sample_generated = generator(inputs, hparams, "Gx", reuse=True)  # [? ? 1 384]
    sample_generated = tf.layers.dense(
        sample_generated, hparams.vocab_size, name="softmax", reuse=None)  # [? ? 1 6381]
    sample_generated = tf.stop_gradient(
        tf.expand_dims(sample_generated, axis=2))

    # losses = {"cycloss": cycloss,
    #           "y_to_x_loss": y_to_x_loss,
    #           "x_to_y_loss": x_to_y_loss,
    #           'yr1':x_to_x_loss,'yr2':y_to_y_loss}
              #'x':[x,inputs],'yf':y_fake}#fail

    #cycloss | y_to_x_loss sometimes nan ,sometimes otherwise
    # losses = {"cycloss": 1.0,
    #           "y_to_x_loss": 1.0,
    #           "x_to_y_loss": x_to_y_loss,
    #           "training":1.0}# no need to calc loss(generated_sample,target)

    # losses = {"cycloss": 1.0,
    #            "y_to_x_loss": 1.0,
    #            "x_to_y_loss": 1.0}

    losses = {"cycloss": cycloss,
              "y_to_x_loss": y_to_x_loss,
              "x_to_y_loss": x_to_y_loss}#real



    return sample_generated, losses# [? ? 1 1 1471] loss
  def body(self, features):
    hparams = self._hparams
    ps_devices = self._ps_devices
    single_device = (len(ps_devices) == 1)
    assert hparams.num_model_shards % len(ps_devices) == 0
    shards_per_device = hparams.num_model_shards // len(ps_devices)
    model_devices = [ps_devices[i // shards_per_device]
                     for i in range(hparams.num_model_shards)]
    print("model_devices = %s" % model_devices)
    mp = expert_utils.Parallelism(model_devices, reuse=False)
    targets_vocab_size = self._problem_hparams.vocabulary["targets"].vocab_size
    # squeeze out channels, heights
    targets = tf.squeeze(features["targets_raw"], [2, 3])
    targets_embedding_var = mp(
        tf.get_variable, "embedding",
        [[targets_vocab_size, hparams.hidden_size]] * mp.n,
        initializer=tf.random_normal_initializer(
            0.0, hparams.hidden_size**-0.5))
    shifted_targets = common_layers.shift_right_2d(targets)
    # Bypass the symbol modality and use a different embedding on each shard.
    if single_device:
      targets_embedding_var_combined = tf.concat(targets_embedding_var, 1)
      decoder_input_combined = common_layers.embedding(
          shifted_targets, targets_vocab_size,
          hparams.hidden_size * mp.n,
          multiplier=hparams.hidden_size**0.5,
          embedding_var=targets_embedding_var_combined,
      )
      decoder_input = tf.split(decoder_input_combined, mp.n, axis=2)
    else:
      targets_embedding_var_combined = None
      decoder_input = mp(
          common_layers.embedding, shifted_targets, targets_vocab_size,
          hparams.hidden_size,
          multiplier=hparams.hidden_size**0.5,
          embedding_var=targets_embedding_var,
      )
    decoder_self_attention_bias = mp(
        common_attention.attention_bias_lower_triangle,
        tf.shape(targets)[1])
    if "targets_segmentation" in features:
      # "Packed" dataset - keep the examples from seeing each other.
      targets_segmentation = features["targets_segmentation"]
      targets_position = features["targets_position"]
      decoder_self_attention_bias = mp(
          tf.add, decoder_self_attention_bias,
          mp(common_attention.attention_bias_same_segment,
             targets_segmentation, targets_segmentation))
      decoder_input = mp(
          common_attention.add_timing_signal_1d_given_position,
          decoder_input, targets_position)
    else:
      targets_position = None
      decoder_self_attention_bias = mp(
          common_attention.attention_bias_lower_triangle,
          tf.shape(targets)[1])
      decoder_input = mp(common_attention.add_timing_signal_1d, decoder_input)

    if self.has_input:
      inputs = tf.squeeze(features["inputs_raw"], [2, 3])
      inputs_vocab_size = self._problem_hparams.vocabulary["inputs"].vocab_size
      # share everything for now
      share_inputs_and_targets_embedding = True
      if share_inputs_and_targets_embedding:
        assert inputs_vocab_size == targets_vocab_size
        inputs_embedding_var = targets_embedding_var
        inputs_embedding_var_combined = targets_embedding_var_combined
      if single_device:
        encoder_input_combined = common_layers.embedding(
            inputs, inputs_vocab_size,
            hparams.hidden_size * mp.n,
            multiplier=hparams.hidden_size**0.5,
            embedding_var=inputs_embedding_var_combined,
        )
        encoder_input = tf.split(encoder_input_combined, mp.n, axis=2)
      else:
        encoder_input = mp(
            common_layers.embedding, inputs, inputs_vocab_size,
            hparams.hidden_size,
            multiplier=hparams.hidden_size**0.5,
            embedding_var=inputs_embedding_var,
        )
      if "inputs_segmentation" in features:
        # "Packed" dataset - keep the examples from seeing each other.
        inputs_segmentation = features["inputs_segmentation"]
        inputs_position = features["inputs_position"]
        encoder_self_attention_bias = mp(
            common_attention.attention_bias_same_segment,
            inputs_segmentation, inputs_segmentation)
        encoder_decoder_attention_bias = mp(
            common_attention.attention_bias_same_segment,
            targets_segmentation, inputs_segmentation)
        encoder_input = mp(
            common_attention.add_timing_signal_1d_given_position,
            encoder_input, inputs_position)
      else:
        encoder_padding = tf.to_float(tf.equal(inputs, 0))
        ignore_padding = common_attention.attention_bias_ignore_padding(
            encoder_padding)
        encoder_self_attention_bias = ignore_padding
        encoder_decoder_attention_bias = ignore_padding
        inputs_position = None
        encoder_input = mp(common_attention.add_timing_signal_1d, encoder_input)

      # encoder stack here
      with tf.variable_scope("encoder"):
        encoder_input = mp(
            tf.nn.dropout, encoder_input,
            1.0 - hparams.layer_prepostprocess_dropout)
        encoder_output = _layer_stack(
            mp,
            encoder_input,
            encoder_self_attention_bias,
            hparams.encoder_layers,
            hparams)
    else:
      encoder_decoder_attention_bias = None
      encoder_output = None

    with tf.variable_scope("decoder"):
      decoder_input = mp(
          tf.nn.dropout, decoder_input,
          1.0 - hparams.layer_prepostprocess_dropout)
      decoder_output = _layer_stack(
          mp,
          decoder_input,
          decoder_self_attention_bias,
          layers=hparams.decoder_layers,
          hparams=hparams,
          encoder_output=encoder_output,
          encoder_decoder_attention_bias=encoder_decoder_attention_bias)

    # Bypass the symbol modality and compute logits directly.
    # We compute a different set of logits on each shard, and sum them.
    # Share the weights with the target embedding.
    output_var = targets_embedding_var
    output_var_combined = targets_embedding_var_combined
    if single_device:
      decoder_output = tf.concat(decoder_output, 2)
      logits = tf.tensordot(decoder_output, output_var_combined, [[2], [1]])
      num, denom = common_layers.padded_cross_entropy(
          logits, targets, hparams.label_smoothing)
      training_loss = num / denom
    else:
      logits = mp(
          tf.tensordot, decoder_output, output_var, [[[2], [1]]] * mp.n)
      logits = expert_utils.all_reduce_ring(logits, mp)
      # On each device, we compute the loss for a part of the batch.
      # This is faster than computing the whole loss on one shard.
      mp, logits = expert_utils.reduce_by_device(mp, logits, lambda l: l[0])
      def _loss_for_shard(logits, targets, shard):
        logits = common_layers.approximate_split(logits, mp.n, 0)[shard]
        targets = common_layers.approximate_split(targets, mp.n, 0)[shard]
        return common_layers.padded_cross_entropy(
            logits, targets, hparams.label_smoothing)
      num, denom = mp(_loss_for_shard, logits, targets, range(mp.n))
      training_loss = tf.add_n(num) / tf.add_n(denom)
      logits = logits[0]
    logits = tf.expand_dims(tf.expand_dims(logits, 2), 3)
    # override training loss so that it is not computed externally.
    losses = {"training": training_loss}
    return logits, losses
Example #23
 def testFlatten4D3D(self):
   x = np.random.randint(1, high=9, size=(3, 5, 2))
   y = common_layers.flatten4d3d(common_layers.embedding(x, 10, 7))
   self.evaluate(tf.global_variables_initializer())
   res = self.evaluate(y)
   self.assertEqual(res.shape, (3, 5 * 2, 7))
Example #24
def transformer_prepare_encoder(inputs, target_space, hparams, features=None):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
  ishape_static = inputs.shape.as_list()
  encoder_input = inputs
  if features and "inputs_segmentation" in features:
    # Packed dataset.  Keep the examples from seeing each other.
    inputs_segmentation = features["inputs_segmentation"]
    inputs_position = features["inputs_position"]
    targets_segmentation = features["targets_segmentation"]
    if (hasattr(hparams, "unidirectional_encoder") and
        hparams.unidirectional_encoder):
      tf.logging.info("Using unidirectional encoder")
      encoder_self_attention_bias = (
          common_attention.attention_bias_lower_triangle(
              common_layers.shape_list(inputs)[1]))
    else:
      encoder_self_attention_bias = (
          common_attention.attention_bias_same_segment(
              inputs_segmentation, inputs_segmentation))
    encoder_decoder_attention_bias = (
        common_attention.attention_bias_same_segment(targets_segmentation,
                                                     inputs_segmentation))
  else:
    encoder_padding = common_attention.embedding_to_padding(encoder_input)
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    if (hasattr(hparams, "unidirectional_encoder") and
        hparams.unidirectional_encoder):
      tf.logging.info("Using unidirectional encoder")
      encoder_self_attention_bias = (
          common_attention.attention_bias_lower_triangle(
              common_layers.shape_list(inputs)[1]))
    else:
      # Usual case - not a packed dataset.
      encoder_self_attention_bias = ignore_padding
    encoder_decoder_attention_bias = ignore_padding
    inputs_position = None
  if hparams.proximity_bias:
    encoder_self_attention_bias += common_attention.attention_bias_proximal(
        common_layers.shape_list(inputs)[1])
  if hparams.get("use_target_space_embedding", True):
    # Append target_space_id embedding to inputs.
    emb_target_space = common_layers.embedding(
        target_space,
        32,
        ishape_static[-1],
        name="target_space_embedding",
        dtype=tf.bfloat16
        if hparams.activation_dtype == "bfloat16" else tf.float32)
    emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
    encoder_input += emb_target_space
  if hparams.pos == "timing":
    if inputs_position is not None:
      encoder_input = common_attention.add_timing_signal_1d_given_position(
          encoder_input, inputs_position)
    else:
      encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  elif hparams.pos == "emb":
    encoder_input = common_attention.add_positional_embedding(
        encoder_input, hparams.max_length, "inputs_positional_embedding",
        inputs_position)
  if hparams.activation_dtype == "bfloat16":
    encoder_self_attention_bias = tf.cast(encoder_self_attention_bias,
                                          tf.bfloat16)
    encoder_decoder_attention_bias = tf.cast(encoder_decoder_attention_bias,
                                             tf.bfloat16)
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
Example #25
def transformer_prepare_encoder(inputs, target_space, hparams, features=None):
    """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
      sg: inputs here have been flattened to 3d
        [batch, height, width, embed_size] ->
        [batch, height*width, embed_size]
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
    ishape_static = inputs.shape.as_list()
    encoder_input = inputs
    if features and "inputs_segmentation" in features:
        # Packed dataset.  Keep the examples from seeing each other.
        inputs_segmentation = features["inputs_segmentation"]
        inputs_position = features["inputs_position"]
        targets_segmentation = features["targets_segmentation"]
        encoder_self_attention_bias = common_attention.attention_bias_same_segment(
            inputs_segmentation, inputs_segmentation)
        encoder_decoder_attention_bias = (
            common_attention.attention_bias_same_segment(
                targets_segmentation, inputs_segmentation))
    else:
        # Usual case - not a packed dataset.
        encoder_padding = common_attention.embedding_to_padding(encoder_input)
        # sg: [batch_size, sentence_len]
        ignore_padding = common_attention.attention_bias_ignore_padding(
            encoder_padding)
        # sg: [batch_size, 1, 1, sentence_len]
        # an bias tensor to be added to attention logits
        # for padded words, the biases equal -1e9
        # non padded words equal 0
        encoder_self_attention_bias = ignore_padding
        encoder_decoder_attention_bias = ignore_padding
        inputs_position = None
    if hparams.proximity_bias:
        encoder_self_attention_bias += common_attention.attention_bias_proximal(
            common_layers.shape_list(inputs)[1])
    # Append target_space_id embedding to inputs.
    emb_target_space = common_layers.embedding(
        target_space,
        32,
        # sg: 32 vocab_size (comments in fun, may be not exactly)
        # this is because at current time t2t only have
        # SpaceID in problem.py from 1 to 32
        ishape_static[-1],
        # sg: embedding dimension
        name="target_space_embedding",
        dtype=tf.bfloat16
        if hparams.activation_dtype == "bfloat16" else tf.float32)
    # sg: [1,128] a dense vector to represent SpaceID
    emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
    # sg: [1,1,128]
    encoder_input += emb_target_space
    if hparams.pos == "timing":
        if inputs_position is not None:
            encoder_input = common_attention.add_timing_signal_1d_given_position(
                encoder_input, inputs_position)
        else:
            encoder_input = common_attention.add_timing_signal_1d(
                encoder_input)
    if hparams.activation_dtype == "bfloat16":
        encoder_self_attention_bias = tf.cast(encoder_self_attention_bias,
                                              tf.bfloat16)
        encoder_decoder_attention_bias = tf.cast(
            encoder_decoder_attention_bias, tf.bfloat16)
    return (encoder_input, encoder_self_attention_bias,
            encoder_decoder_attention_bias)
Example #26
def transformer_prepare_encoder(inputs,
                                target_space,
                                hparams,
                                features=None,
                                type_ids=None,
                                num_types=None,
                                reuse_target_embedding=tf.AUTO_REUSE):
    """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.
    type_ids: optional, an int64 Tensor of shape [batch, length] that allows
      for adding type embeddings, similar to positional embeddings.
    num_types: optional, an int that decides the number of types in type_ids.
    reuse_target_embedding: option to reuse variable name in the case that
      symbol modalities are reused between inputs/targets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
    ishape_static = inputs.shape.as_list()
    encoder_input = inputs
    if features and "inputs_segmentation" in features:
        # Packed dataset.  Keep the examples from seeing each other.
        inputs_segmentation = features["inputs_segmentation"]
        inputs_position = features["inputs_position"]
        targets_segmentation = features["targets_segmentation"]
        if (hasattr(hparams, "unidirectional_encoder")
                and hparams.unidirectional_encoder):
            tf.logging.info("Using unidirectional encoder")
            encoder_self_attention_bias = (
                common_attention.attention_bias_lower_triangle(
                    common_layers.shape_list(inputs)[1]))
        else:
            encoder_self_attention_bias = (
                common_attention.attention_bias_same_segment(
                    inputs_segmentation, inputs_segmentation))
        encoder_decoder_attention_bias = (
            common_attention.attention_bias_same_segment(
                targets_segmentation, inputs_segmentation))
    else:
        encoder_padding = common_attention.embedding_to_padding(encoder_input)
        ignore_padding = common_attention.attention_bias_ignore_padding(
            encoder_padding)
        if (hasattr(hparams, "unidirectional_encoder")
                and hparams.unidirectional_encoder):
            tf.logging.info("Using unidirectional encoder")
            encoder_self_attention_bias = (
                common_attention.attention_bias_lower_triangle(
                    common_layers.shape_list(inputs)[1]))
        else:
            # Usual case - not a packed dataset.
            encoder_self_attention_bias = ignore_padding
        encoder_decoder_attention_bias = ignore_padding
        inputs_position = None
    if hparams.proximity_bias:
        encoder_self_attention_bias += common_attention.attention_bias_proximal(
            common_layers.shape_list(inputs)[1])
    if target_space is not None and hparams.get("use_target_space_embedding",
                                                True):
        # Append target_space_id embedding to inputs.
        emb_target_space = common_layers.embedding(
            target_space,
            32,
            ishape_static[-1],
            name="target_space_embedding",
            dtype=hparams.get("activation_dtype", "float32"),
            reuse=reuse_target_embedding)
        emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
        encoder_input += emb_target_space
    if hparams.pos == "timing":
        if inputs_position is not None:
            encoder_input = common_attention.add_timing_signal_1d_given_position(
                encoder_input, inputs_position)
        else:
            encoder_input = common_attention.add_timing_signal_1d(
                encoder_input)
    elif hparams.pos == "timing_from_features":
        encoder_input = common_attention.add_timing_signals_from_features(
            encoder_input, features, hparams.position_features)
    elif hparams.pos == "emb":
        encoder_input = common_attention.add_positional_embedding(
            encoder_input, hparams.max_length, "inputs_positional_embedding",
            inputs_position)

    # Add type embeddings
    if type_ids is not None:
        if not num_types:
            raise ValueError("Need to set num_types as well.")
        encoder_input = common_attention.add_positional_embedding(
            encoder_input, num_types, "inputs_type_embedding", type_ids)

    encoder_self_attention_bias = common_layers.cast_like(
        encoder_self_attention_bias, encoder_input)
    encoder_decoder_attention_bias = common_layers.cast_like(
        encoder_decoder_attention_bias, encoder_input)
    return (encoder_input, encoder_self_attention_bias,
            encoder_decoder_attention_bias)
Example #27
 def _embedding(self, x, reuse=None):
   with tf.variable_scope(self.name):
     return common_layers.embedding(x, self.vocab_size, self.dense_size,
                                   reuse=reuse, multiplier=self.multiplier)
Example #28
def transformer_prepare_encoder(inputs, target_space, hparams, features=None):
    """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
    ishape_static = inputs.shape.as_list()
    encoder_input = inputs
    if features and "inputs_segmentation" in features:
        # Packed dataset.  Keep the examples from seeing each other.
        inputs_segmentation = features["inputs_segmentation"]
        inputs_position = features["inputs_position"]
        targets_segmentation = features["targets_segmentation"]
        encoder_self_attention_bias = common_attention.attention_bias_same_segment(
            inputs_segmentation, inputs_segmentation)
        encoder_decoder_attention_bias = (
            common_attention.attention_bias_same_segment(
                targets_segmentation, inputs_segmentation))
    else:
        # Usual case - not a packed dataset.
        encoder_padding = common_attention.embedding_to_padding(encoder_input)
        ignore_padding = common_attention.attention_bias_ignore_padding(
            encoder_padding)
        encoder_self_attention_bias = ignore_padding
        encoder_decoder_attention_bias = ignore_padding
        inputs_position = None
    if hparams.proximity_bias:
        encoder_self_attention_bias += common_attention.attention_bias_proximal(
            common_layers.shape_list(inputs)[1])
    # Append target_space_id embedding to inputs.
    emb_target_space = common_layers.embedding(target_space,
                                               32,
                                               ishape_static[-1],
                                               name="target_space_embedding")
    emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
    encoder_input += emb_target_space
    #if hparams.pos == "timing":
    #  if inputs_position is not None:
    #    encoder_input = common_attention.add_timing_signal_1d_given_position(
    #        encoder_input, inputs_position)
    #  else:
    #    encoder_input = common_attention.add_timing_signal_1d(encoder_input)
    raw_encoder_input = tf.squeeze(features['inputs_raw'], axis=[-2, -1])
    pos_signals = generate_positional_signals(raw_encoder_input, hparams)
    pos_embeddings = generate_positional_embeddings(pos_signals,
                                                    hparams.encoder_pos,
                                                    hparams)
    if "sum" in hparams.encoder_pos_integration:
        encoder_input = encoder_input + pos_embeddings
    elif "ffn" in hparams.encoder_pos_integration:
        with tf.variable_scope("encoder_pos_ffn"):
            encoder_input = tf.concat([encoder_input, pos_embeddings], axis=2)
            encoder_input = transformer_ffn_layer(encoder_input,
                                                  hparams,
                                                  conv_padding="SAME")
    return (encoder_input, encoder_self_attention_bias,
            encoder_decoder_attention_bias)
Example #29
    def encode_lex(self, encoder_input, target_space, hparams):
        '''
        encoder_input: [batch_size, input_len, hidden_dim]
        return: 
            encoder_output: [batch_size, input_len, hidden_dim]
            encoder_decoder_attention_bias: [batch_size, input_len]
        '''
        encoder_output_slices = []
        for i in range(encoder_input.get_shape()[2].value):
            encoder_input_slice = encoder_input[:, :, i, :]

            # bias
            encoder_padding = common_attention.embedding_to_padding(
                encoder_input_slice)
            print(encoder_padding.shape.as_list()
                  )  # ==> [None, None] (None, None, 4)
            ignore_padding = common_attention.attention_bias_ignore_padding(
                encoder_padding)
            encoder_self_attention_bias = ignore_padding
            encoder_decoder_attention_bias = ignore_padding
            print(ignore_padding.shape.as_list()
                  )  # ==> [None, 1, 1, None] (None, 1, 1, None, 4)

            # add target space to encoder input?
            ishape_static = encoder_input_slice.shape.as_list()
            print(ishape_static)  # ==> [None, None, 300] (None, None, 4, 300)
            emb_target_space = common_layers.embedding(
                target_space,
                32,
                ishape_static[-1],
                name="target_space_embedding")
            print(emb_target_space.shape.as_list())  # ==> [300]
            emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
            print(emb_target_space.shape.as_list())  # ==> [1, 1, 300]
            encoder_input_slice += emb_target_space
            print(encoder_input_slice.shape.as_list())  # ==> [None, None, 300] (None, None, 4, 300)

            # add timing signals to encoder input
            if hparams.pos == "timing":
                encoder_input_slice = common_attention.add_timing_signal_1d(
                    encoder_input_slice)

            # dropout
            encoder_input_slice = tf.nn.dropout(
                encoder_input_slice,
                1.0 - hparams.layer_prepostprocess_dropout)

            # encoder
            '''
            multihead_attention(
            query_antecedent: [batch, length_q, channels], -- x, x
            memory_antecedent: [batch, length_m, channels], -- None, encoder_output
            bias: bias tensor, -- encoder_self_attention_bias
            total_key_depth: int, -- hparams.attention_key_channels or hparams.hidden_size
            total_value_depth: int, -- hparams.attention_value_channels or hparams.hidden_size
            output_depth: integer, -- hparams.hidden_size
            num_heads: integer dividing total_key_depth and total_value_depth, -- hparams.num_heads (8)
            dropout_rate: float, -- hparams.attention_dropout
            ...
            cache=None: dict, containing tensors which are the results of previous
                attentions, used for fast decoding:
                {'k': [batch_size, 0, key_channels], 'v': [batch_size, 0, value_channels]}
                (used in decoder self-attention)
            '''
            x = encoder_input_slice
            with tf.variable_scope("encoder" + str(i)):
                # remove pad
                pad_remover = None
                if hparams.use_pad_remover:
                    pad_remover = expert_utils.PadRemover(
                        common_attention.attention_bias_to_padding(
                            encoder_self_attention_bias))

                # self-attention along the sentence dimension
                for layer in xrange(hparams.num_encoder_layers
                                    or hparams.num_hidden_layers):
                    with tf.variable_scope("layer_%d" % layer):
                        with tf.variable_scope("self_attention"):
                            query_antecedent = common_layers.layer_preprocess(
                                x, hparams)
                            y = common_attention.multihead_attention(
                                query_antecedent=query_antecedent,
                                memory_antecedent=None,
                                bias=encoder_self_attention_bias,
                                total_key_depth=hparams.attention_key_channels
                                or hparams.hidden_size,
                                total_value_depth=hparams.attention_value_channels
                                or hparams.hidden_size,
                                output_depth=hparams.hidden_size,
                                num_heads=hparams.num_heads,
                                dropout_rate=hparams.attention_dropout,
                                attention_type=hparams.self_attention_type,
                                max_relative_position=hparams.max_relative_position)
                            x = common_layers.layer_postprocess(x, y, hparams)
                        with tf.variable_scope("ffn"):
                            y = transformer.transformer_ffn_layer(
                                common_layers.layer_preprocess(x, hparams),
                                hparams, pad_remover)
                            x = common_layers.layer_postprocess(x, y, hparams)
                encoder_output_slice = common_layers.layer_preprocess(
                    x, hparams)
                print(encoder_output_slice.shape.as_list())  # ==> [None, None, 300] (None, None, 4, 300)

            encoder_output_slices.append(encoder_output_slice)
        encoder_output = tf.stack(encoder_output_slices, 2)
        print(encoder_output.shape.as_list())  # ==> [None, None, 4, 300]

        # --------

        encoder_output_slices = []
        #hparams2 = copy.deepcopy(hparams)
        #hparams2.hidden_size = hparams.lex_cap
        num_heads = int(hparams.lex_cap / 2)
        hparams2 = tf.contrib.training.HParams(
            layer_preprocess_sequence=hparams.layer_preprocess_sequence,
            layer_postprocess_sequence=hparams.layer_postprocess_sequence,
            layer_prepostprocess_dropout=hparams.layer_prepostprocess_dropout,
            norm_type=hparams.norm_type,
            hidden_size=hparams.lex_cap,
            norm_epsilon=hparams.norm_epsilon,
            ffn_layer=hparams.ffn_layer,
            filter_size=hparams.filter_size,
            relu_dropout=hparams.relu_dropout,
            num_heads=num_heads,
            attention_dropout=hparams.attention_dropout,
            parameter_attention_key_channels=hparams.parameter_attention_key_channels,
            parameter_attention_value_channels=hparams.parameter_attention_value_channels)

        for i in range(encoder_output.get_shape()[3].value):
            encoder_input_slice = encoder_output[:, :, :, i]
            #print(encoder_input_slice.shape.as_list()) # ==> [None, None, 4]

            encoder_padding = common_attention.embedding_to_padding(
                encoder_input_slice)
            ignore_padding = common_attention.attention_bias_ignore_padding(
                encoder_padding)
            encoder_self_attention_bias = ignore_padding
            #print(encoder_self_attention_bias.shape.as_list()) # ==> [None, 1, 1, None]

            # encoder
            '''
            multihead_attention(
            query_antecedent: [batch, length_q, channels], -- x, x
            memory_antecedent: [batch, length_m, channels], -- None, encoder_output
            bias: bias tensor, -- encoder_self_attention_bias
            total_key_depth: int, -- hparams.attention_key_channels or hparams.hidden_size
            total_value_depth: int, -- hparams.attention_value_channels or hparams.hidden_size
            output_depth: integer, -- hparams.hidden_size
            num_heads: integer dividing total_key_depth and total_value_depth, -- hparams.num_heads (8)
            dropout_rate: float, -- hparams.attention_dropout
            ...
            cache=None: dict, containing tensors which are the results of previous
                attentions, used for fast decoding:
                {'k': [batch_size, 0, key_channels], 'v': [batch_size, 0, value_channels]}
                (used in decoder self-attention)
            '''
            x = encoder_input_slice
            with tf.variable_scope("encoder_extra" + str(i)):
                # remove pad
                pad_remover = None
                if hparams.use_pad_remover:
                    pad_remover = expert_utils.PadRemover(
                        common_attention.attention_bias_to_padding(
                            encoder_self_attention_bias))

                # self-attention along the lexicon dimension
                with tf.variable_scope("layer_extra"):
                    with tf.variable_scope("self_attention"):
                        #query_antecedent = layer_preprocess2(x, hparams, hparams.lex_cap)
                        query_antecedent = common_layers.layer_preprocess(
                            x, hparams2)

                        y = common_attention.multihead_attention(
                            query_antecedent=query_antecedent,
                            memory_antecedent=None,
                            bias=encoder_self_attention_bias,
                            total_key_depth=hparams.attention_key_channels
                            or hparams.lex_cap,
                            total_value_depth=hparams.attention_value_channels
                            or hparams.lex_cap,
                            output_depth=hparams.lex_cap,
                            num_heads=num_heads,
                            dropout_rate=hparams.attention_dropout,
                            attention_type=hparams.self_attention_type,
                            max_relative_position=hparams.max_relative_position
                        )
                        #x = layer_postprocess2(x, y, hparams, hparams.lex_cap)
                        x = common_layers.layer_postprocess(x, y, hparams2)
                    with tf.variable_scope("ffn"):
                        y = transformer.transformer_ffn_layer(
                            common_layers.layer_preprocess(x, hparams2),
                            hparams2, pad_remover)
                        #x = layer_postprocess2(x, y, hparams, hparams.lex_cap)
                        x = common_layers.layer_postprocess(x, y, hparams2)
                #encoder_output_slice = layer_preprocess2(x, hparams, hparams.lex_cap)
                encoder_output_slice = common_layers.layer_preprocess(
                    x, hparams2)
                #print(encoder_output_slice.shape.as_list()) # ==> [None, None, 4] (None, None, 4, 300)

            encoder_output_slices.append(encoder_output_slice)
        encoder_output = tf.stack(encoder_output_slices, 3)
        print(encoder_output.shape.as_list())  # ==> [None, None, 4, 300]

        # --------

        lex_cap = encoder_output.get_shape()[2].value
        embed_len = encoder_output.get_shape()[3].value
        assert (lex_cap == hparams.lex_cap)
        aggregate_layer = tf.get_variable(
            name="Aggregate",
            shape=[embed_len, embed_len, lex_cap],
            initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1))
        encoder_output = tf.tensordot(encoder_output,
                                      aggregate_layer,
                                      axes=[[2, 3], [2, 1]])
        print(encoder_output.shape.as_list())  # ==> [None, None, 300]

        return encoder_output, encoder_decoder_attention_bias
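The final aggregation collapses the stacked [batch, length, lex_cap, embed_len] encoder output back to [batch, length, embed_len] with a single tensordot against the learned aggregate_layer. A small NumPy sketch of that contraction follows; the batch, length, lex_cap and embed_len values are assumptions chosen to mirror the printed shapes.
import numpy as np

# Assumed sizes: batch=2, length=7, lex_cap=4, embed_len=300.
encoder_output = np.zeros((2, 7, 4, 300), dtype=np.float32)   # [B, L, lex_cap, D]
aggregate_layer = np.zeros((300, 300, 4), dtype=np.float32)   # [D, D, lex_cap]
# Contract lex_cap against the aggregate's last axis and embed_len against its
# middle axis; the remaining D axis of aggregate_layer becomes the output channels.
out = np.tensordot(encoder_output, aggregate_layer, axes=[[2, 3], [2, 1]])
print(out.shape)  # ==> (2, 7, 300), i.e. [batch, length, embed_len]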
  def testEmbedding(self):
    x = np.random.random_integers(1, high=8, size=(3, 5))
    y = common_layers.embedding(x, 10, 16)
    self.evaluate(tf.global_variables_initializer())
    res = self.evaluate(y)
    self.assertEqual(res.shape, (3, 5, 16))
def embed_target_space(target_space_id, model_d):
  target_space_emb = common_layers.embedding(
      target_space_id, 32, model_d, name="target_space_embedding")
  return tf.reshape(target_space_emb, [1, 1, 1, -1])
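embed_target_space reshapes the 32-entry target-space embedding to rank 4 so that it broadcasts over the batch, length and channel axes when added to a [batch, length, 1, model_d] encoder input. A minimal sketch of that broadcast follows; the shapes and the scalar id are illustrative assumptions.
import tensorflow as tf
from tensor2tensor.layers import common_layers

model_d = 64
encoder_input = tf.zeros([2, 7, 1, model_d])   # [batch, length, 1, model_d]
target_space_id = tf.constant(5)               # scalar id, must be < 32
emb = common_layers.embedding(
    target_space_id, 32, model_d, name="target_space_embedding")
emb = tf.reshape(emb, [1, 1, 1, -1])           # [1, 1, 1, model_d]
encoder_input += emb                           # broadcasts over batch and length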
Example #32
0
def embed_target_space(target_space_id, hidden_size):
    target_space_emb = common_layers.embedding(target_space_id,
                                               32,
                                               hidden_size,
                                               name="target_space_embedding")
    return tf.reshape(target_space_emb, [1, 1, 1, -1])
  def body(self, features):
    hparams = self._hparams
    ps_devices = self._ps_devices
    single_device = (len(ps_devices) == 1)
    assert hparams.num_model_shards % len(ps_devices) == 0
    shards_per_device = hparams.num_model_shards // len(ps_devices)
    model_devices = [ps_devices[i // shards_per_device]
                     for i in range(hparams.num_model_shards)]
    print("model_devices = %s" % model_devices)
    mp = expert_utils.Parallelism(model_devices, reuse=False)
    targets_vocab_size = self._problem_hparams.vocabulary["targets"].vocab_size
    # squeeze out channels, heights
    targets = tf.squeeze(features["targets_raw"], [2, 3])
    targets_embedding_var = mp(
        tf.get_variable, "embedding",
        [[targets_vocab_size, hparams.hidden_size]] * mp.n,
        initializer=tf.random_normal_initializer(
            0.0, hparams.hidden_size**-0.5))
    shifted_targets = common_layers.shift_right_2d(targets)
    # Bypass the symbol modality and use a different embedding on each shard.
    if single_device:
      targets_embedding_var_combined = tf.concat(targets_embedding_var, 1)
      decoder_input_combined = common_layers.embedding(
          shifted_targets, targets_vocab_size,
          hparams.hidden_size * mp.n,
          multiplier=hparams.hidden_size**0.5,
          embedding_var=targets_embedding_var_combined,
      )
      decoder_input = tf.split(decoder_input_combined, mp.n, axis=2)
    else:
      targets_embedding_var_combined = None
      decoder_input = mp(
          common_layers.embedding, shifted_targets, targets_vocab_size,
          hparams.hidden_size,
          multiplier=hparams.hidden_size**0.5,
          embedding_var=targets_embedding_var,
      )
    decoder_self_attention_bias = mp(
        common_attention.attention_bias_lower_triangle,
        tf.shape(targets)[1])
    if "targets_segmentation" in features:
      # "Packed" dataset - keep the examples from seeing each other.
      targets_segmentation = features["targets_segmentation"]
      targets_position = features["targets_position"]
      decoder_self_attention_bias = mp(
          tf.add, decoder_self_attention_bias,
          mp(common_attention.attention_bias_same_segment,
             targets_segmentation, targets_segmentation))
      decoder_input = mp(
          common_attention.add_timing_signal_1d_given_position,
          decoder_input, targets_position)
    else:
      targets_position = None
      decoder_self_attention_bias = mp(
          common_attention.attention_bias_lower_triangle,
          tf.shape(targets)[1])
      decoder_input = mp(common_attention.add_timing_signal_1d, decoder_input)

    if self.has_input:
      inputs = tf.squeeze(features["inputs_raw"], [2, 3])
      inputs_vocab_size = self._problem_hparams.vocabulary["inputs"].vocab_size
      # share everything for now
      share_inputs_and_targets_embedding = True
      if share_inputs_and_targets_embedding:
        assert inputs_vocab_size == targets_vocab_size
        inputs_embedding_var = targets_embedding_var
        inputs_embedding_var_combined = targets_embedding_var_combined
      if single_device:
        encoder_input_combined = common_layers.embedding(
            inputs, inputs_vocab_size,
            hparams.hidden_size * mp.n,
            multiplier=hparams.hidden_size**0.5,
            embedding_var=inputs_embedding_var_combined,
        )
        encoder_input = tf.split(encoder_input_combined, mp.n, axis=2)
      else:
        encoder_input = mp(
            common_layers.embedding, inputs, inputs_vocab_size,
            hparams.hidden_size,
            multiplier=hparams.hidden_size**0.5,
            embedding_var=inputs_embedding_var,
        )
      if "inputs_segmentation" in features:
        # "Packed" dataset - keep the examples from seeing each other.
        inputs_segmentation = features["inputs_segmentation"]
        inputs_position = features["inputs_position"]
        encoder_self_attention_bias = mp(
            common_attention.attention_bias_same_segment,
            inputs_segmentation, inputs_segmentation)
        encoder_decoder_attention_bias = mp(
            common_attention.attention_bias_same_segment,
            targets_segmentation, inputs_segmentation)
        encoder_input = mp(
            common_attention.add_timing_signal_1d_given_position,
            encoder_input, inputs_position)
      else:
        encoder_padding = tf.to_float(tf.equal(inputs, 0))
        ignore_padding = common_attention.attention_bias_ignore_padding(
            encoder_padding)
        encoder_self_attention_bias = ignore_padding
        encoder_decoder_attention_bias = ignore_padding
        inputs_position = None
        encoder_input = mp(common_attention.add_timing_signal_1d, encoder_input)

      # encoder stack here
      with tf.variable_scope("encoder"):
        encoder_input = mp(
            tf.nn.dropout, encoder_input,
            1.0 - hparams.layer_prepostprocess_dropout)
        encoder_output = _layer_stack(
            mp,
            encoder_input,
            encoder_self_attention_bias,
            hparams.encoder_layers,
            hparams)
    else:
      encoder_decoder_attention_bias = None
      encoder_output = None

    with tf.variable_scope("decoder"):
      decoder_input = mp(
          tf.nn.dropout, decoder_input,
          1.0 - hparams.layer_prepostprocess_dropout)
      decoder_output = _layer_stack(
          mp,
          decoder_input,
          decoder_self_attention_bias,
          layers=hparams.decoder_layers,
          hparams=hparams,
          encoder_output=encoder_output,
          encoder_decoder_attention_bias=encoder_decoder_attention_bias)

    # Bypass the symbol modality and compute logits directly.
    # We compute a different set of logits on each shard, and sum them.
    # Share the weights with the target embedding.
    output_var = targets_embedding_var
    output_var_combined = targets_embedding_var_combined
    if single_device:
      decoder_output = tf.concat(decoder_output, 2)
      logits = tf.tensordot(decoder_output, output_var_combined, [[2], [1]])
      num, denom = common_layers.padded_cross_entropy(
          logits, targets, hparams.label_smoothing)
      training_loss = num / denom
    else:
      logits = mp(
          tf.tensordot, decoder_output, output_var, [[[2], [1]]] * mp.n)
      logits = expert_utils.all_reduce_ring(logits, mp)
      # On each device, we compute the loss for a part of the batch.
      # This is faster than computing the whole loss on one shard.
      mp, logits = expert_utils.reduce_by_device(mp, logits, lambda l: l[0])
      def _loss_for_shard(logits, targets, shard):
        logits = common_layers.approximate_split(logits, mp.n, 0)[shard]
        targets = common_layers.approximate_split(targets, mp.n, 0)[shard]
        return common_layers.padded_cross_entropy(
            logits, targets, hparams.label_smoothing)
      num, denom = mp(_loss_for_shard, logits, targets, range(mp.n))
      training_loss = tf.add_n(num) / tf.add_n(denom)
      logits = logits[0]
    logits = tf.expand_dims(tf.expand_dims(logits, 2), 3)
    # override training loss so that it is not computed externally.
    losses = {"training": training_loss}
    return logits, losses
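In the single-device branch the output projection is tied to the target embedding: logits come from a tensordot of the decoder output with the embedding table over the hidden axis, so no separate softmax matrix is created. A short NumPy sketch of that weight tying, with assumed sizes:
import numpy as np

batch, length, hidden, vocab = 2, 7, 64, 1000   # assumed sizes
decoder_output = np.zeros((batch, length, hidden), dtype=np.float32)
embedding_var = np.zeros((vocab, hidden), dtype=np.float32)   # same table used for embedding lookups
# Contract the hidden axis of the decoder output with the hidden axis (axis 1)
# of the embedding table, producing one logit per vocabulary entry.
logits = np.tensordot(decoder_output, embedding_var, axes=[[2], [1]])
print(logits.shape)  # ==> (2, 7, 1000), i.e. [batch, length, vocab]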