Example #1
def lstm_seq2seq_internal(inputs, targets, hparams, train):
  """The basic LSTM seq2seq model, main step used for training."""
  with tf.variable_scope("lstm_seq2seq"):
    if inputs is not None:
      inputs_length = common_layers.length_from_embedding(inputs)
      # Flatten inputs.
      inputs = common_layers.flatten4d3d(inputs)

      # LSTM encoder.
      inputs = tf.reverse_sequence(inputs, inputs_length, seq_axis=1)
      _, final_encoder_state = lstm(inputs, inputs_length, hparams, train,
                                    "encoder")
    else:
      final_encoder_state = None

    # LSTM decoder.
    shifted_targets = common_layers.shift_right(targets)
    # Add 1 to account for the padding added to the left from shift_right
    targets_length = common_layers.length_from_embedding(shifted_targets) + 1
    decoder_outputs, _ = lstm(
        common_layers.flatten4d3d(shifted_targets),
        targets_length,
        hparams,
        train,
        "decoder",
        initial_state=final_encoder_state)
    return tf.expand_dims(decoder_outputs, axis=2)
def lstm_seq2seq_internal(inputs, targets, hparams, train):
    """The basic LSTM seq2seq model, main step used for training."""
    with tf.variable_scope("lstm_seq2seq"):
        if inputs is not None:
            inputs_length = common_layers.length_from_embedding(inputs)
            # Flatten inputs.
            inputs = common_layers.flatten4d3d(inputs)

            # LSTM encoder.
            inputs = tf.reverse_sequence(inputs, inputs_length, seq_axis=1)
            _, final_encoder_state = lstm(inputs, inputs_length, hparams,
                                          train, "encoder")
        else:
            final_encoder_state = None

        # LSTM decoder.
        shifted_targets = common_layers.shift_right(targets)
        # Add 1 to account for the padding added to the left from shift_right
        targets_length = common_layers.length_from_embedding(
            shifted_targets) + 1
        decoder_outputs, _ = lstm(common_layers.flatten4d3d(shifted_targets),
                                  targets_length,
                                  hparams,
                                  train,
                                  "decoder",
                                  initial_state=final_encoder_state)
        return tf.expand_dims(decoder_outputs, axis=2)
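A note on the recurring pattern above: common_layers.shift_right pads one all-zero step on the left of the time axis and drops the last step, so length_from_embedding applied to the shifted targets does not count that leading pad position, which is why the decoder length gets "+ 1". Below is a minimal, self-contained sketch of that behaviour in plain TensorFlow; the helper names are illustrative stand-ins, not the tensor2tensor functions themselves.

import tensorflow as tf

def shift_right_sketch(x):
  """Pad one zero step on the left of the time axis and drop the last step.

  x: [batch, length, 1, depth]. Illustrative stand-in for the zero-padding
  behaviour the examples above rely on.
  """
  return tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])[:, :-1, :, :]

def length_from_embedding_sketch(emb):
  """Count time steps whose embedding is not all-zero padding."""
  nonzero = tf.cast(
      tf.not_equal(tf.reduce_sum(tf.abs(emb), axis=[2, 3]), 0.0), tf.int32)
  return tf.reduce_sum(nonzero, axis=1)

# Toy check: 2 real steps out of 4. After shifting, the leading zero step is
# not counted by the length helper, hence the "+ 1" used for targets_length.
targets = tf.concat([tf.ones([1, 2, 1, 3]), tf.zeros([1, 2, 1, 3])], axis=1)
shifted = shift_right_sketch(targets)
# length_from_embedding_sketch(shifted) evaluates to [2]; the decoder runs 2 + 1 steps.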
Example #3
def lstm_seq2seq_internal_attention(inputs, targets, hparams, train):
    """LSTM seq2seq model with attention, main step used for training."""
    with tf.variable_scope("lstm_seq2seq_attention"):
        # This is a temporary fix for varying-length sequences within a batch.
        # A more complete fix should pass a length tensor from outside so that
        # all the lstm variants can use it.
        inputs_length = common_layers.length_from_embedding(inputs)
        # Flatten inputs.
        inputs = common_layers.flatten4d3d(inputs)
        # LSTM encoder.
        encoder_outputs, final_encoder_state = lstm(
            inputs, hparams, train, "encoder", sequence_length=inputs_length)
        # LSTM decoder with attention
        shifted_targets = common_layers.shift_right(targets)
        # Add 1 to account for the padding added to the left from shift_right
        targets_length = common_layers.length_from_embedding(
            shifted_targets) + 1
        decoder_outputs, _ = lstm_attention_decoder(
            common_layers.flatten4d3d(shifted_targets),
            hparams,
            train,
            "decoder",
            final_encoder_state,
            encoder_outputs,
            encoder_output_length=inputs_length,
            decoder_input_length=targets_length)
        return tf.expand_dims(decoder_outputs, axis=2)
Example #4
  def body(self, features):
    """Build the main body of the model.

    Args:
      features: A dict of "inputs" and "targets" which have already been passed
        through an embedding layer. Inputs should have shape
        [batch_size, max_seq_length, 1, embedding_size]. Targets should have
        shape [batch_size, max_seq_length, 1, 1]

    Returns:
      The logits which get passed to the top of the model for inference.
      A tensor of shape [batch_size, seq_length, 1, embedding_size]
    """
    inputs = features.get("inputs")
    targets = features["targets"]

    if inputs is not None:
      inputs = common_layers.flatten4d3d(inputs)
      _, final_encoder_state = self._rnn(tf.reverse(inputs, axis=[1]),
                                         "encoder")
    else:
      final_encoder_state = None

    shifted_targets = common_layers.shift_right(targets)
    decoder_outputs, _ = self._rnn(
        common_layers.flatten4d3d(shifted_targets),
        "decoder",
        initial_state=final_encoder_state)
    return decoder_outputs
Example #5
    def _preprocess(self, features):
        """Preprocesses features for multilingual translation."""
        inputs = features["inputs"]
        targets = features["targets"]
        target_tags = features["target_tags"]

        # Expand target tags to beam width, if necessary.
        if self._hparams.mode == tf_estimator.ModeKeys.PREDICT:
            # <float32> [batch_size * beam_width, 1, 1, emb_size].
            beam_width = self._hparams.beam_width
            target_tags = tf.tile(target_tags, [beam_width, 1, 1, 1])

        # Add target tags to the input sequences.
        # <float32> [batch_size, seq_len + 1, 1, emb_size].
        inputs = tf.concat([target_tags, inputs], axis=1)

        # Compute length of the input sequences.
        inputs_length = common_layers.length_from_embedding(inputs)
        inputs = common_layers.flatten4d3d(inputs)

        # Preprocess targets.
        targets = common_layers.shift_right(targets)
        # Add 1 to account for the padding added to the left from shift_right.
        targets_length = common_layers.length_from_embedding(targets) + 1
        targets = common_layers.flatten4d3d(targets)

        return inputs, inputs_length, targets, targets_length
def lstm_seq2seq_internal_bid_encoder(inputs, targets, hparams, train):
    """The basic LSTM seq2seq model with bidirectional encoder."""
    with tf.variable_scope("lstm_seq2seq_bid_encoder"):
        if inputs is not None:
            inputs_length = common_layers.length_from_embedding(inputs)
            # Flatten inputs.
            inputs = common_layers.flatten4d3d(inputs)
            # LSTM encoder.
            _, final_encoder_state = lstm_bid_encoder(inputs, inputs_length,
                                                      hparams, train,
                                                      "encoder")
        else:
            inputs_length = None
            final_encoder_state = None
        # LSTM decoder.
        shifted_targets = common_layers.shift_right(targets)
        # Add 1 to account for the padding added to the left from shift_right
        targets_length = common_layers.length_from_embedding(
            shifted_targets) + 1
        hparams_decoder = copy.copy(hparams)
        hparams_decoder.hidden_size = 2 * hparams.hidden_size
        decoder_outputs, _ = lstm(common_layers.flatten4d3d(shifted_targets),
                                  targets_length,
                                  hparams_decoder,
                                  train,
                                  "decoder",
                                  initial_state=final_encoder_state)
        return tf.expand_dims(decoder_outputs, axis=2)
Example #7
    def body(self, features):
        if self._hparams.initializer == "orthogonal":
            raise ValueError("LSTM models fail with orthogonal initializer.")

        train = self._hparams.mode == tf.estimator.ModeKeys.TRAIN

        inputs = features["targets"]
        # Kept for the attention-based decoder variants commented out below.
        encoder_outputs = common_layers.flatten4d3d(inputs)
        shifted_targets = common_layers.shift_right(inputs)
        final_encoder_state = None

        # Embedding is assumed to be handled by the problem.

        # Flatten inputs.
        inputs = common_layers.flatten4d3d(shifted_targets)
        # LSTM decoder. Alternatives that were tried:
        # decoder_output, _ = lstm_attention_decoder(
        #     inputs, self._hparams, train, "decoder", final_encoder_state,
        #     encoder_outputs)
        # decoder_output = LSTM_custom(
        #     inputs, self._hparams, train, "decoder", final_encoder_state,
        #     encoder_outputs)[0]
        # decoder_output, _ = lstm(inputs, self._hparams, train, "decoder")
        decoder_output, _ = lstm_SA(inputs, self._hparams, train, "decoder")

        return tf.expand_dims(decoder_output, axis=2)
Example #8
    def _build_inputs_and_targets(self,
                                  from_seqs=None,
                                  from_tags=None,
                                  to_seqs=None,
                                  to_tags=None):
        """Given from and to sequences and tags, construct inputs and targets."""
        del from_tags  # Unused.
        if from_seqs is not None:
            inputs = from_seqs
            inputs_length = common_layers.length_from_embedding(inputs)
            if to_tags is not None:
                # Add to-tags to the inputs and adjust lengths.
                # <float32> [batch_size, seq_len + 1, 1, emb_size].
                inputs = tf.concat([to_tags, inputs], axis=1)
                inputs_length = inputs_length + 1
            inputs = common_layers.flatten4d3d(inputs)
        else:
            inputs = None
            inputs_length = None

        if to_seqs is not None:
            # Shift to-sequences to form targets.
            # <float32> [batch_size, seq_len, 1, emb_size].
            targets = common_layers.shift_right(to_seqs)
            # Add 1 to account for the padding added to the left from shift_right.
            targets_length = common_layers.length_from_embedding(targets) + 1
            targets = common_layers.flatten4d3d(targets)
        else:
            targets = None
            targets_length = None

        return (inputs, inputs_length), (targets, targets_length)
Example #9
def bytenet_internal(inputs, targets, hparams):
  """ByteNet, main step used for training."""
  with tf.variable_scope("bytenet"):
    # Flatten inputs and extend length by 50%.
    inputs = tf.expand_dims(common_layers.flatten4d3d(inputs), axis=2)
    extend_length = tf.to_int32(0.5 * tf.to_float(tf.shape(inputs)[1]))
    inputs_shape = inputs.shape.as_list()
    inputs = tf.pad(inputs, [[0, 0], [0, extend_length], [0, 0], [0, 0]])
    inputs_shape[1] = None
    inputs.set_shape(inputs_shape)  # Don't lose the other shapes when padding.
    # Pad inputs and targets to be the same length, divisible by 50.
    inputs, targets = common_layers.pad_to_same_length(
        inputs, targets, final_length_divisible_by=50)
    final_encoder = residual_dilated_conv(inputs, hparams.num_block_repeat,
                                          "SAME", "encoder", hparams)

    shifted_targets = common_layers.shift_right(targets)
    kernel = (hparams.kernel_height, hparams.kernel_width)
    decoder_start = common_layers.conv_block(
        tf.concat([final_encoder, shifted_targets], axis=3),
        hparams.hidden_size, [((1, 1), kernel)],
        padding="LEFT")

    return residual_dilated_conv(decoder_start, hparams.num_block_repeat,
                                 "LEFT", "decoder", hparams)
Example #10
def lstm_seq2seq_internal_bid_encoder(inputs, targets, hparams, train):
  """The basic LSTM seq2seq model with bidirectional encoder."""
  with tf.variable_scope("lstm_seq2seq_bid_encoder"):
    if inputs is not None:
      inputs_length = common_layers.length_from_embedding(inputs)
      # Flatten inputs.
      inputs = common_layers.flatten4d3d(inputs)
      # LSTM encoder.
      _, final_encoder_state = lstm_bid_encoder(
          inputs, inputs_length, hparams, train, "encoder")
    else:
      inputs_length = None
      final_encoder_state = None
    # LSTM decoder.
    shifted_targets = common_layers.shift_right(targets)
    # Add 1 to account for the padding added to the left from shift_right
    targets_length = common_layers.length_from_embedding(shifted_targets) + 1
    hparams_decoder = copy.copy(hparams)
    hparams_decoder.hidden_size = 2 * hparams.hidden_size
    decoder_outputs, _ = lstm(
        common_layers.flatten4d3d(shifted_targets),
        targets_length,
        hparams_decoder,
        train,
        "decoder",
        initial_state=final_encoder_state)
    return tf.expand_dims(decoder_outputs, axis=2)
Example #11
def bytenet_internal(inputs, targets, hparams):
    """ByteNet, main step used for training."""
    with tf.variable_scope("bytenet"):
        # Flatten inputs and extend length by 50%.
        inputs = tf.expand_dims(common_layers.flatten4d3d(inputs), axis=2)
        extend_length = tf.to_int32(0.5 * tf.to_float(tf.shape(inputs)[1]))
        inputs_shape = inputs.shape.as_list()
        inputs = tf.pad(inputs, [[0, 0], [0, extend_length], [0, 0], [0, 0]])
        inputs_shape[1] = None
        inputs.set_shape(
            inputs_shape)  # Don't lose the other shapes when padding.
        # Pad inputs and targets to be the same length, divisible by 50.
        inputs, targets = common_layers.pad_to_same_length(
            inputs, targets, final_length_divisible_by=50)
        final_encoder = residual_dilated_conv(inputs, hparams.num_block_repeat,
                                              "SAME", "encoder", hparams)

        shifted_targets = common_layers.shift_right(targets)
        kernel = (hparams.kernel_height, hparams.kernel_width)
        decoder_start = common_layers.conv_block(
            tf.concat([final_encoder, shifted_targets], axis=3),
            hparams.hidden_size, [((1, 1), kernel)],
            padding="LEFT")

        return residual_dilated_conv(decoder_start, hparams.num_block_repeat,
                                     "LEFT", "decoder", hparams)
Example #12
def lstm_seq2seq_internal_dynamic(inputs, targets, hparams, train):
  """The basic LSTM seq2seq model, main step used for training."""
  with tf.variable_scope("lstm_seq2seq"):
    if inputs is not None:
      # Flatten inputs.
      inputs = common_layers.flatten4d3d(inputs)
      # LSTM encoder.
      _, final_encoder_state = lstm(
          tf.reverse(inputs, axis=[1]), hparams, train, "encoder")
    else:
      final_encoder_state = None
    # LSTM decoder.
    shifted_targets = common_layers.shift_right(targets)
    decoder_outputs, _ = lstm(
        common_layers.flatten4d3d(shifted_targets),
        hparams,
        train,
        "decoder",
        initial_state=final_encoder_state)

    # project the outputs
    with tf.variable_scope("projection"):
      projected_outputs = tf.layers.dense(
          decoder_outputs,
          2048,
          activation=None,
          use_bias=False)

    return tf.expand_dims(projected_outputs, axis=2)
Example #13
 def testShiftLeft(self):
     x1 = np.zeros((5, 7, 1, 11))
     x1[:, 0, :] = np.ones_like(x1[:, 0, :])
     expected = np.zeros((5, 7, 1, 11))
     expected[:, 1, :] = np.ones_like(expected[:, 1, :])
     a = common_layers.shift_right(tf.constant(x1, dtype=tf.float32))
     actual = self.evaluate(a)
     self.assertAllEqual(actual, expected)
        def infer_step(result, length):
            """Inference step."""
            def print_info(result, length, new_length):
                vocab = self._hparams.problem_hparams.vocabulary["targets"]
                tf.logging.info(
                    "length=%s new_length=%s length_diff=%s new_suffix=%s",
                    length,
                    new_length,
                    new_length - length,
                    str([
                        vocab._subtoken_id_to_subtoken_string(index)  # pylint: disable=protected-access
                        for index in result[0, -block_size:, 0,
                                            0][:new_length - length]
                    ]).decode("unicode-escape"),
                )

            features["targets"] = tf.pad(result,
                                         [[0, 0], [0, 1], [0, 0], [0, 0]])
            samples, logits, losses = self.sample(features)  # pylint: disable=unused-variable

            _, top_k_indices = tf.nn.top_k(
                logits[:, :-1, :1, :, :],
                k=self._decode_hparams.guess_and_check_top_k)
            in_top_k = tf.reduce_any(tf.equal(tf.to_int64(top_k_indices),
                                              tf.expand_dims(result, 4)),
                                     axis=4)

            eos_cumsum = tf.cumsum(tf.to_int32(
                tf.equal(result, text_encoder.EOS_ID)),
                                   axis=1)
            after_eos = tf.greater(common_layers.shift_right(eos_cumsum), 0)

            correct = tf.logical_and(in_top_k, tf.logical_not(after_eos))
            correct_cumsum = tf.cumsum(tf.to_int32(correct), axis=1)
            perfect_cumsum = 1 + tf.range(tf.shape(correct)[1])
            for axis in [0, 2, 3]:
                perfect_cumsum = tf.expand_dims(perfect_cumsum, axis=axis)

            new_length = tf.reduce_sum(tf.to_int32(
                tf.equal(correct_cumsum, perfect_cumsum)),
                                       axis=1)
            new_length = tf.squeeze(new_length, axis=[0, 1, 2])
            new_length = tf.minimum(new_length, decode_length)

            new_result = tf.concat([
                result[:, :new_length, :, :],
                tf.reshape(samples[:, new_length, :block_size, :],
                           [1, block_size, 1, 1])
            ],
                                   axis=1)

            with tf.control_dependencies(
                [tf.py_func(print_info, [result, length, new_length], [])]):
                new_result = tf.identity(new_result)

            return new_result, new_length
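The after_eos computation in infer_step above marks every position strictly after the first EOS token: it accumulates a running count of EOS occurrences and shifts that count right by one step, so the EOS position itself is still treated as valid. A small eager sketch of just that step, with an illustrative EOS_ID standing in for text_encoder.EOS_ID:

import tensorflow as tf

EOS_ID = 1  # illustrative stand-in for text_encoder.EOS_ID

ids = tf.constant([[5, 7, EOS_ID, 9, 4]])  # [batch, length]
eos_cumsum = tf.cumsum(tf.cast(tf.equal(ids, EOS_ID), tf.int32), axis=1)
# Shift right along time so the EOS position itself is not marked "after EOS".
shifted_cumsum = tf.pad(eos_cumsum, [[0, 0], [1, 0]])[:, :-1]
after_eos = tf.greater(shifted_cumsum, 0)
# after_eos == [[False, False, False, True, True]]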
Example #16
    def body(self, features):
        inputs = features["inputs"]
        train = self._hparams.mode == tf.estimator.ModeKeys.TRAIN

        encoder_outputs, final_encoder_state, encoder_decoder_attention_bias, inputs_length = \
            self.encode(inputs, self._hparams)

        if "targets_actions" in features:
            targets = features["targets_actions"]
        else:
            tf.logging.warn(
                "CopySeq2Seq must be used with a SemanticParsing problem with a ShiftReduceGrammar; bad things will happen otherwise"
            )
            targets = features["targets"]

        # LSTM decoder with attention
        shifted_targets = common_layers.shift_right(targets)

        # Add 1 to account for the padding added to the left from shift_right
        targets_length = common_layers.length_from_embedding(
            shifted_targets) + 1
        shifted_targets = common_layers.flatten4d3d(shifted_targets)

        hparams_decoder = copy.copy(self._hparams)
        hparams_decoder.hidden_size = 2 * self._hparams.hidden_size

        decoder_output = lstm_attention_decoder(shifted_targets,
                                                hparams_decoder, train,
                                                "decoder", final_encoder_state,
                                                encoder_outputs, inputs_length,
                                                targets_length)
        decoder_output = tf.expand_dims(decoder_output, axis=2)

        body_output = dict()
        target_modality = self._problem_hparams.target_modality \
            if self._problem_hparams else {"targets": None}

        assert self._hparams.pointer_layer in ("attentive",
                                               "decaying_attentive")

        for key, modality in target_modality.items():
            if isinstance(modality, CopyModality):
                with tf.variable_scope("copy_layer/" + key):
                    if self._hparams.pointer_layer == "decaying_attentive":
                        output_layer = DecayingAttentivePointerLayer(
                            encoder_outputs)
                    else:
                        output_layer = AttentivePointerLayer(encoder_outputs)
                    scores = output_layer(decoder_output)
                    scores += encoder_decoder_attention_bias
                    body_output[key] = scores
            else:
                body_output[key] = decoder_output

        return body_output
Example #17
def transformer_prepare_decoder(targets, hparams, features=None):
    """Prepare one shard of the model for the decoder.

    Args:
      targets: a Tensor.
      hparams: run hyperparameters.
      features: optionally pass the entire features dictionary as well.
        This is needed now for "packed" datasets.

    Returns:
      decoder_input: a Tensor, bottom of decoder stack.
      decoder_self_attention_bias: a bias tensor for use in decoder
        self-attention.
    """
    decoder_self_attention_bias = (
        common_attention.attention_bias_lower_triangle(
            common_layers.shape_list(targets)[1]))
    if features and "targets_segmentation" in features:
        # "Packed" dataset - keep the examples from seeing each other.
        targets_segmentation = features["targets_segmentation"]
        targets_position = features["targets_position"]
        decoder_self_attention_bias += common_attention.attention_bias_same_segment(
            targets_segmentation, targets_segmentation)
    else:
        targets_position = None
    if hparams.proximity_bias:
        decoder_self_attention_bias += common_attention.attention_bias_proximal(
            common_layers.shape_list(targets)[1])
    decoder_input = common_layers.shift_right_3d(targets)
    #if hparams.pos == "timing":
    #  if targets_position is not None:
    #    decoder_input = common_attention.add_timing_signal_1d_given_position(
    #        decoder_input, targets_position)
    #  else:
    #    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
    raw_decoder_input = common_layers.shift_right(features['targets_raw'])
    terminal_decoder_bias, nonterminal_decoder_bias = _get_t_nt_bias(
        raw_decoder_input, hparams, decoder_self_attention_bias)
    pop_decoder_bias = _get_pop_bias(raw_decoder_input, hparams)
    raw_decoder_input = tf.squeeze(raw_decoder_input, axis=[-2, -1])
    pos_signals = generate_positional_signals(raw_decoder_input, hparams,
                                              terminal_decoder_bias,
                                              nonterminal_decoder_bias)
    pos_embeddings = generate_positional_embeddings(pos_signals,
                                                    hparams.decoder_pos,
                                                    hparams)
    if "sum" in hparams.decoder_pos_integration:
        decoder_input = decoder_input + pos_embeddings
    elif "ffn" in hparams.decoder_pos_integration:
        with tf.variable_scope("decoder_pos_ffn"):
            decoder_input = tf.concat([decoder_input, pos_embeddings], axis=2)
            decoder_input = transformer_ffn_layer(decoder_input,
                                                  hparams,
                                                  conv_padding="LEFT")
    return (decoder_input, decoder_self_attention_bias, terminal_decoder_bias,
            nonterminal_decoder_bias, pop_decoder_bias, pos_signals)
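transformer_prepare_decoder above relies on common_attention.attention_bias_lower_triangle for the causal mask. The sketch below shows the kind of bias such a call produces, assuming the usual convention of a large negative value wherever a position would attend to the future and a shape broadcastable against [batch, heads, length, length] attention logits; the exact constant and construction inside tensor2tensor may differ.

import tensorflow as tf

def lower_triangle_bias_sketch(length, neg_inf=-1e9):
  """[1, 1, length, length] bias: 0 on and below the diagonal, neg_inf above."""
  ones = tf.ones([length, length])
  lower = tf.linalg.band_part(ones, -1, 0)  # keep lower triangle and diagonal
  bias = (1.0 - lower) * neg_inf
  return tf.reshape(bias, [1, 1, length, length])

# lower_triangle_bias_sketch(3)[0, 0] is
# [[0, -1e9, -1e9],
#  [0,    0, -1e9],
#  [0,    0,    0]]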
Example #18
 def model_fn_body(self, features):
     if self._hparams.initializer == "orthogonal":
         raise ValueError("LSTM models fail with orthogonal initializer.")
     train = self._hparams.mode == tf.estimator.ModeKeys.TRAIN
     with tf.variable_scope("lstm_lm"):
         # Flatten and shift inputs.
         shifted_targets = common_layers.shift_right(
             features.get("targets"))
         inputs = common_layers.flatten4d3d(shifted_targets)
         outputs, _ = lstm.lstm(inputs, self._hparams, train, "lstm")
         return tf.expand_dims(outputs, axis=2)
def decode(cond_vec, cond_add, gold, c, ed, hparams):
    """Transformer decoder."""
    drop_gold = tf.nn.dropout(gold, 1.0 - hparams.layer_prepostprocess_dropout)
    decoder_input = common_layers.shift_right(drop_gold, pad_value=cond_vec)
    if cond_add is not None:
        decoder_input += cond_add
    decoder_input = tf.squeeze(decoder_input, axis=2)
    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
    bias = common_attention.attention_bias_lower_triangle(tf.shape(gold)[1])
    if c is not None and len(c.get_shape()) > 3:
        c = tf.squeeze(c, axis=2)
    return transformer.transformer_decoder(decoder_input, c, bias, ed, hparams)
Example #20
def slicenet_middle(inputs_encoded, targets, target_space_emb, mask, hparams):
    """Middle part of slicenet, connecting encoder and decoder."""
    def norm_fn(x, name):
        with tf.variable_scope(name, default_name="norm"):
            return common_layers.apply_norm(x, hparams.norm_type,
                                            hparams.hidden_size,
                                            hparams.norm_epsilon)

    # Flatten targets and embed target_space_id.
    targets_flat = tf.expand_dims(common_layers.flatten4d3d(targets), axis=2)
    target_space_emb = tf.tile(target_space_emb,
                               [tf.shape(targets_flat)[0], 1, 1, 1])

    # Calculate similarity loss (but don't run if not needed).
    if len(hparams.problems) > 1 and hparams.sim_loss_mult > 0.00001:
        targets_timed = common_layers.add_timing_signal(targets_flat)
        extra_layers = int(hparams.num_hidden_layers * 1.5)
        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
            targets_encoded = multi_conv_res(targets_timed, "SAME", "encoder",
                                             extra_layers, hparams)
        with tf.variable_scope("similarity_loss"):
            similarity_loss = similarity_cost(inputs_encoded, targets_encoded)
            similarity_loss *= hparams.sim_loss_mult
    else:
        similarity_loss = 0.0

    # Use attention from each target to look at input and retrieve.
    targets_shifted = common_layers.shift_right(targets_flat,
                                                pad_value=target_space_emb)
    if hparams.attention_type == "none":
        targets_with_attention = tf.zeros_like(targets_shifted)
    else:
        inputs_padding_bias = (1.0 -
                               mask) * -1e9  # Bias to not attend to padding.
        targets_with_attention = attention(targets_shifted,
                                           inputs_encoded,
                                           norm_fn,
                                           hparams,
                                           bias=inputs_padding_bias)

    # Positional targets: merge attention and raw.
    kernel = (hparams.kernel_height, hparams.kernel_width)
    targets_merged = common_layers.subseparable_conv_block(
        tf.concat([targets_with_attention, targets_shifted], axis=3),
        hparams.hidden_size, [((1, 1), kernel)],
        normalizer_fn=norm_fn,
        padding="LEFT",
        separability=4,
        name="targets_merge")

    return targets_merged, similarity_loss
Example #21
def slicenet_middle(inputs_encoded, targets, target_space_emb, mask, hparams):
  """Middle part of slicenet, connecting encoder and decoder."""

  def norm_fn(x, name):
    with tf.variable_scope(name, default_name="norm"):
      return common_layers.apply_norm(x, hparams.norm_type, hparams.hidden_size,
                                      hparams.norm_epsilon)

  # Flatten targets and embed target_space_id.
  targets_flat = tf.expand_dims(common_layers.flatten4d3d(targets), axis=2)
  target_space_emb = tf.tile(target_space_emb,
                             [tf.shape(targets_flat)[0], 1, 1, 1])

  # Calculate similarity loss (but don't run if not needed).
  if len(hparams.problems) > 1 and hparams.sim_loss_mult > 0.00001:
    targets_timed = common_layers.add_timing_signal(targets_flat)
    extra_layers = int(hparams.num_hidden_layers * 1.5)
    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
      targets_encoded = multi_conv_res(targets_timed, "SAME", "encoder",
                                       extra_layers, hparams)
    with tf.variable_scope("similarity_loss"):
      similarity_loss = similarity_cost(inputs_encoded, targets_encoded)
      similarity_loss *= hparams.sim_loss_mult
  else:
    similarity_loss = 0.0

  # Use attention from each target to look at input and retrieve.
  targets_shifted = common_layers.shift_right(
      targets_flat, pad_value=target_space_emb)
  if hparams.attention_type == "none":
    targets_with_attention = tf.zeros_like(targets_shifted)
  else:
    inputs_padding_bias = (1.0 - mask) * -1e9  # Bias to not attend to padding.
    targets_with_attention = attention(
        targets_shifted,
        inputs_encoded,
        norm_fn,
        hparams,
        bias=inputs_padding_bias)

  # Positional targets: merge attention and raw.
  kernel = (hparams.kernel_height, hparams.kernel_width)
  targets_merged = common_layers.subseparable_conv_block(
      tf.concat([targets_with_attention, targets_shifted], axis=3),
      hparams.hidden_size, [((1, 1), kernel)],
      normalizer_fn=norm_fn,
      padding="LEFT",
      separability=4,
      name="targets_merge")

  return targets_merged, similarity_loss
Example #22
def lstm_seq2seq_internal_attention(inputs, targets, hparams, train):
  """LSTM seq2seq model with attention, main step used for training."""
  with tf.variable_scope("lstm_seq2seq_attention"):
    # Flatten inputs.
    inputs = common_layers.flatten4d3d(inputs)
    # LSTM encoder.
    encoder_outputs, final_encoder_state = lstm(
        tf.reverse(inputs, axis=[1]), hparams, train, "encoder")
    # LSTM decoder with attention
    shifted_targets = common_layers.shift_right(targets)
    decoder_outputs, _ = lstm_attention_decoder(
        common_layers.flatten4d3d(shifted_targets), hparams, train, "decoder",
        final_encoder_state, encoder_outputs)
    return tf.expand_dims(decoder_outputs, axis=2)
Example #23
def lstm_seq2seq_internal_attention(inputs, targets, hparams, train):
    """LSTM seq2seq model with attention, main step used for training."""
    with tf.variable_scope("lstm_seq2seq_attention"):
        # Flatten inputs.
        inputs = common_layers.flatten4d3d(inputs)
        # LSTM encoder.
        encoder_outputs, final_encoder_state = lstm(
            tf.reverse(inputs, axis=[1]), hparams, train, "encoder")
        # LSTM decoder with attention
        shifted_targets = common_layers.shift_right(targets)
        decoder_outputs, _ = lstm_attention_decoder(
            common_layers.flatten4d3d(shifted_targets), hparams, train,
            "decoder", final_encoder_state, encoder_outputs)
        return tf.expand_dims(decoder_outputs, axis=2)
Example #24
  def _build_lm_inputs(self, features):
    """Builds inputs and targets for LM training."""
    targets = features["targets"]
    target_tags = features["target_tags"]

    if self._hparams.mode == tf.estimator.ModeKeys.PREDICT:
      target_tags = tf.tile(target_tags, [self._hparams.beam_width, 1, 1, 1])

    # Construct LM inputs.
    inputs = common_layers.shift_right(targets, pad_value=target_tags)
    inputs_length = common_layers.length_from_embedding(targets) + 1
    inputs = common_layers.flatten4d3d(inputs)

    return inputs, inputs_length
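_build_lm_inputs above passes the target tag as pad_value, so the tag embedding fills the first decoder step created by the right shift, and the input length is the target length plus one for that tag position. A minimal sketch of the pad_value variant, assuming the usual [batch, length, 1, emb] layout (the helper name is illustrative):

import tensorflow as tf

def shift_right_with_pad_sketch(x, pad_value):
  """Prepend pad_value ([batch, 1, 1, emb]) on the time axis, drop the last step."""
  return tf.concat([pad_value, x], axis=1)[:, :-1, :, :]

# Step 0 becomes the tag embedding and step t holds target t-1 (teacher forcing),
# while the sequence length stays the same.
targets = tf.random.normal([2, 4, 1, 8])
target_tag = tf.random.normal([2, 1, 1, 8])
lm_inputs = shift_right_with_pad_sketch(targets, target_tag)
# lm_inputs.shape == (2, 4, 1, 8); inputs_length = length_from_embedding(targets) + 1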
Example #25
def lstm_seq2seq_internal(inputs, targets, hparams, train):
    """The basic LSTM seq2seq model, main step used for training."""
    with tf.variable_scope("lstm_seq2seq"):
        # Flatten inputs.
        inputs = common_layers.flatten4d3d(inputs)
        # LSTM encoder.
        _, final_encoder_state = lstm(tf.reverse(inputs, axis=[1]), hparams,
                                      train, "encoder")
        # LSTM decoder.
        shifted_targets = common_layers.shift_right(targets)
        decoder_outputs, _ = lstm(common_layers.flatten4d3d(shifted_targets),
                                  hparams,
                                  train,
                                  "decoder",
                                  initial_state=final_encoder_state)
        return tf.expand_dims(decoder_outputs, axis=2)
Example #26
    def create_model_encode_decode(
            self, inputs, y_id):  # inp[batch step 1 hid]  yid[batch step 1 1]
        hparams = self.hparams
        train_flag = self.train_flag
        vocab_size = self.vocabsz
        embeddings_y = self.embeddings_y
        with tf.variable_scope("foo", reuse=tf.AUTO_REUSE):
            ### y embed

            y = tf.nn.embedding_lookup(embeddings_y, y_id)
            y = tf.squeeze(y, axis=3)  # [? ? 1 hid]

            if len(inputs.shape) == 2:  # [batch hid]
                inputs = tf.expand_dims(tf.expand_dims(inputs, axis=1), axis=1)
            inputs_length = common_layers.length_from_embedding(
                inputs)  # [batch step 1 hid]
            #  Flatten inputs.
            inputs = common_layers.flatten4d3d(inputs)

            # LSTM encoder.
            inputs = tf.reverse_sequence(inputs, inputs_length, seq_axis=1)
            _, final_encoder_state = lstm_yr(
                inputs, inputs_length, hparams, train_flag,
                "encoder")  # final_encoder_state must be an LSTMStateTuple

            ##
            # LSTM decoder.
            shifted_targets = common_layers.shift_right(
                y)  # [46,23,78]->[0,46,23] | [batch step 1 hid]
            # Add 1 to account for the padding added to the left from shift_right
            targets_length = common_layers.length_from_embedding(
                shifted_targets) + 1

            decoder_outputs, _ = lstm_yr(
                common_layers.flatten4d3d(shifted_targets),
                targets_length,
                hparams,
                train_flag,
                "decoder",
                initial_state=final_encoder_state)

            # decode output [batch step hid]
            decoder_outputs = tf.layers.dense(inputs=decoder_outputs,
                                              units=vocab_size)
            # ->[batch step vocabsz]
            decoder_outputs = self.tensor3dto4d(decoder_outputs)
            return decoder_outputs
Example #27
def gru_seq2seq_internal_attention_bid_encoder(inputs, targets, hparams,
                                               train):
    """GRU seq2seq model with attention, main step used for training."""
    with tf.variable_scope("gru_seq2seq_attention_bid_encoder"):
        # Flatten inputs.
        inputs = common_layers.flatten4d3d(inputs)
        # GRU encoder.
        encoder_outputs, final_encoder_state = gru_bid_encoder(
            tf.reverse(inputs, axis=[1]), hparams, train, "encoder")
        # GRU decoder with attention
        shifted_targets = common_layers.shift_right(targets)
        hparams_decoder = copy.copy(hparams)
        hparams_decoder.hidden_size = 2 * hparams.hidden_size
        decoder_outputs, _ = gru_attention_decoder(
            common_layers.flatten4d3d(shifted_targets), hparams_decoder, train,
            "decoder", final_encoder_state, encoder_outputs)
        return tf.expand_dims(decoder_outputs, axis=2)
Example #28
def lstm_seq2seq_internal_attention(inputs, targets, hparams, train):
  """LSTM seq2seq model with attention, main step used for training."""
  with tf.variable_scope("lstm_seq2seq_attention"):
    # This is a temporary fix for varying-length sequences within a batch.
    # A more complete fix should pass a length tensor from outside so that
    # all the lstm variants can use it.
    lengths = tf.reduce_sum(
        common_layers.mask_from_embedding(inputs), [1, 2, 3])
    # Flatten inputs.
    inputs = common_layers.flatten4d3d(inputs)
    # LSTM encoder.
    encoder_outputs, final_encoder_state = lstm(
        inputs, hparams, train, "encoder", lengths=lengths)
    # LSTM decoder with attention
    shifted_targets = common_layers.shift_right(targets)
    decoder_outputs, _ = lstm_attention_decoder(
        common_layers.flatten4d3d(shifted_targets), hparams, train, "decoder",
        final_encoder_state, encoder_outputs, lengths=lengths)
    return tf.expand_dims(decoder_outputs, axis=2)
def lstm_seq2seq_internal_static(inputs, targets, hparams, train):
    """The basic LSTM seq2seq model, main step used for training."""
    with tf.variable_scope("lstm_seq2seq"):
        if inputs is not None:
            # Flatten inputs.
            inputs = tf.reverse(common_layers.flatten4d3d(inputs), axis=[1])

            # Construct static rnn input list.
            # TODO: the length should be a parameter.
            input_list = [inputs[:, i, :] for i in range(21)]

            # LSTM encoder.
            _, final_encoder_state = lstm(input_list, hparams, train,
                                          "encoder")
            input_list.clear()
        else:
            final_encoder_state = None
        # LSTM decoder.
        # Get a list of tensors.
        shifted_trg = common_layers.flatten4d3d(
            common_layers.shift_right(targets))
        target_list = [shifted_trg[:, i, :] for i in range(21)]

        decoder_outputs, _ = lstm(target_list,
                                  hparams,
                                  train,
                                  "decoder",
                                  initial_state=final_encoder_state)
        target_list.clear()

        # Convert decoder outputs to tensor.
        tensors = tf.transpose(tf.convert_to_tensor(decoder_outputs),
                               perm=[1, 0, 2])
        decoder_outputs.clear()

        # project the outputs
        with tf.variable_scope("projection"):
            projected_outputs = tf.layers.dense(tensors,
                                                2048,
                                                activation=None,
                                                use_bias=False)
        return tf.expand_dims(projected_outputs, axis=2)
def lstm_seq2seq_internal_attention(inputs, targets, hparams, train,
                                    inputs_length, targets_length):
  """LSTM seq2seq model with attention, main step used for training."""
  with tf.variable_scope("lstm_seq2seq_attention"):
    # Flatten inputs.
    inputs = common_layers.flatten4d3d(inputs)

    # LSTM encoder.
    inputs = tf.reverse_sequence(inputs, inputs_length, seq_axis=1)
    encoder_outputs, final_encoder_state = lstm(
        inputs, inputs_length, hparams, train, "encoder")

    # LSTM decoder with attention.
    shifted_targets = common_layers.shift_right(targets)
    # Add 1 to account for the padding added to the left from shift_right
    targets_length = targets_length + 1
    decoder_outputs = lstm_attention_decoder(
        common_layers.flatten4d3d(shifted_targets), hparams, train, "decoder",
        final_encoder_state, encoder_outputs, inputs_length, targets_length)
    return tf.expand_dims(decoder_outputs, axis=2)
Example #31
def lstm_seq2seq_internal(inputs, targets, hparams, train):
  """The basic LSTM seq2seq model, main step used for training."""
  with tf.variable_scope("lstm_seq2seq"):
    if inputs is not None:
      # Flatten inputs.
      inputs = common_layers.flatten4d3d(inputs)
      # LSTM encoder.
      _, final_encoder_state = lstm(
          tf.reverse(inputs, axis=[1]), hparams, train, "encoder")
    else:
      final_encoder_state = None
    # LSTM decoder.
    shifted_targets = common_layers.shift_right(targets)
    decoder_outputs, _ = lstm(
        common_layers.flatten4d3d(shifted_targets),
        hparams,
        train,
        "decoder",
        initial_state=final_encoder_state)
    return tf.expand_dims(decoder_outputs, axis=2)
def lstm_seq2seq_search_based_attention(inputs, targets, hparams, train,
                                        build_storage, storage, n):
    """LSTM seq2seq search-based model with attention"""
    with tf.variable_scope("lstm_seq2seq_attention", reuse=tf.AUTO_REUSE):
        # Flatten inputs.
        inputs = common_layers.flatten4d3d(inputs)
        # LSTM encoder.
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(hparams.hidden_size)
        encoder_outputs, final_encoder_state = rnn(
            tf.reverse(inputs, axis=[1]), lstm_cell, hparams, train, "encoder")
        # LSTM decoder with attention
        shifted_targets = common_layers.shift_right(targets)
        decoder_outputs, p_copy = lstm_attention_search_based_decoder(
            common_layers.flatten4d3d(shifted_targets), hparams, train,
            "decoder", final_encoder_state, encoder_outputs, build_storage,
            storage, n)

        if build_storage:
            return tf.expand_dims(decoder_outputs, axis=2)
        else:
            return tf.expand_dims(decoder_outputs, axis=2), p_copy
Example #33
def lstm_seq2seq_internal_bid_encoder(inputs, targets, hparams, train):
    """The basic LSTM seq2seq model with bidirectional encoder."""
    with tf.variable_scope("lstm_seq2seq_bid_encoder"):
        if inputs is not None:
            # Flatten inputs.
            inputs = common_layers.flatten4d3d(inputs)
            # LSTM encoder.
            _, final_encoder_state = lstm_bid_encoder(
                tf.reverse(inputs, axis=[1]), hparams, train, "encoder")
        else:
            final_encoder_state = None
        # LSTM decoder.
        shifted_targets = common_layers.shift_right(targets)
        hparams_decoder = copy.copy(hparams)
        hparams_decoder.hidden_size = 2 * hparams.hidden_size
        decoder_outputs, _ = lstm(common_layers.flatten4d3d(shifted_targets),
                                  hparams_decoder,
                                  train,
                                  "decoder",
                                  initial_state=final_encoder_state)
        return tf.expand_dims(decoder_outputs, axis=2)
def lstm_seq2seq_internal_attention_bid_encoder(inputs, targets, hparams,
                                                train):
  """LSTM seq2seq model with attention, main step used for training."""
  with tf.variable_scope("lstm_seq2seq_attention_bid_encoder"):
    inputs_length = common_layers.length_from_embedding(inputs)
    # Flatten inputs.
    inputs = common_layers.flatten4d3d(inputs)
    # LSTM encoder.
    encoder_outputs, final_encoder_state = lstm_bid_encoder(
        inputs, inputs_length, hparams, train, "encoder")
    # LSTM decoder with attention
    shifted_targets = common_layers.shift_right(targets)
    # Add 1 to account for the padding added to the left from shift_right
    targets_length = common_layers.length_from_embedding(shifted_targets) + 1
    hparams_decoder = copy.copy(hparams)
    hparams_decoder.hidden_size = 2 * hparams.hidden_size
    decoder_outputs = lstm_attention_decoder(
        common_layers.flatten4d3d(shifted_targets), hparams_decoder, train,
        "decoder", final_encoder_state, encoder_outputs,
        inputs_length, targets_length)
    return tf.expand_dims(decoder_outputs, axis=2)
Example #35
def lstm_seq2seq_internal_bid_encoder(inputs, targets, hparams, train):
  """The basic LSTM seq2seq model with bidirectional encoder."""
  with tf.variable_scope("lstm_seq2seq_bid_encoder"):
    if inputs is not None:
      # Flatten inputs.
      inputs = common_layers.flatten4d3d(inputs)
      # LSTM encoder.
      _, final_encoder_state = lstm_bid_encoder(
          tf.reverse(inputs, axis=[1]), hparams, train, "encoder")
    else:
      final_encoder_state = None
    # LSTM decoder.
    shifted_targets = common_layers.shift_right(targets)
    hparams_decoder = copy.copy(hparams)
    hparams_decoder.hidden_size = 2 * hparams.hidden_size
    decoder_outputs, _ = lstm(
        common_layers.flatten4d3d(shifted_targets),
        hparams_decoder,
        train,
        "decoder",
        initial_state=final_encoder_state)
    return tf.expand_dims(decoder_outputs, axis=2)
def slicenet_middle(inputs_encoded, targets, target_space_emb, mask, hparams):
  """Middle part of slicenet, connecting encoder and decoder."""

  def norm_fn(x, name):
    with tf.variable_scope(name, default_name="norm"):
      return common_layers.apply_norm(x, hparams.norm_type, hparams.model_d,
                                      hparams.norm_epsilon)

  # Flatten targets and embed target_space_id.
  targets_flat = tf.expand_dims(common_layers.flatten4d3d(targets), axis=2)
  target_space_emb = tf.tile(target_space_emb,
                             [tf.shape(targets_flat)[0], 1, 1, 1])

  # Use attention from each target to look at input and retrieve.
  targets_shifted = common_layers.shift_right(
      targets_flat, pad_value=target_space_emb)
  if hparams.attention_type == "none":
    targets_with_attention = tf.zeros_like(targets_shifted)
  else:
    inputs_padding_bias = (1.0 - mask) * -1e9  # Bias to not attend to padding.
    targets_with_attention = attention(
        targets_shifted,
        inputs_encoded,
        norm_fn,
        hparams,
        bias=inputs_padding_bias)

  # Positional targets: merge attention and raw.
  kernel = (hparams.kernel_height, hparams.kernel_width)
  targets_merged = common_layers.subseparable_conv_block(
      tf.concat([targets_with_attention, targets_shifted], axis=3),
      hparams.model_d, [((1, 1), kernel)],
      normalizer_fn=norm_fn,
      padding="LEFT",
      separability=4,
      name="targets_merge")

  return targets_merged, 0.0
Example #37
def lstm_seq2seq_internal_attention(inputs, targets, hparams, train):
  """LSTM seq2seq model with attention, main step used for training."""
  with tf.variable_scope("lstm_seq2seq_attention"):
    # This is a temporary fix for varying-length sequences within a batch.
    # A more complete fix should pass a length tensor from outside so that
    # all the lstm variants can use it.
    inputs_length = common_layers.length_from_embedding(inputs)
    # Flatten inputs.
    inputs = common_layers.flatten4d3d(inputs)

    # LSTM encoder.
    inputs = tf.reverse_sequence(inputs, inputs_length, seq_axis=1)
    encoder_outputs, final_encoder_state = lstm(
        inputs, inputs_length, hparams, train, "encoder")

    # LSTM decoder with attention.
    shifted_targets = common_layers.shift_right(targets)
    # Add 1 to account for the padding added to the left from shift_right
    targets_length = common_layers.length_from_embedding(shifted_targets) + 1
    decoder_outputs = lstm_attention_decoder(
        common_layers.flatten4d3d(shifted_targets), hparams, train, "decoder",
        final_encoder_state, encoder_outputs, inputs_length, targets_length)
    return tf.expand_dims(decoder_outputs, axis=2)
    def render2cmd_v3_internal(self, features, hparams, train):
        # inputs and targets are both sequences with
        # shape = [batch, seq_len, 1, hparams.problem.feature_dim]
        tf.logging.info("render2cmd_v3_internal")
        all_targets = features['targets']
        all_targets_cls = features['targets_cls']
        all_targets_font_cls = features['targets_fnt']
        all_targets_psr = features['targets_psr']
        all_batch_size = common_layers.shape_list(all_targets)[0]
        batch_size = all_batch_size // 2
        sources = all_targets[:batch_size, ...]
        sources_cls = all_targets_cls[:batch_size, ...]
        sources_fnt = all_targets_font_cls[:batch_size, ...]
        sources_psr = all_targets_psr[:batch_size, ...]
        targets = all_targets[batch_size:, ...]
        targets_cls = all_targets_cls[batch_size:, ...]
        targets_fnt = all_targets_font_cls[batch_size:, ...]
        targets_psr = all_targets_psr[batch_size:, ...]

        losses = {}
        # sampled_bottleneck = self.pretrained_visual_encoder(features, hparams)

        # if hparams.sg_bottleneck:
        #     sampled_bottleneck = tf.stop_gradient(sampled_bottleneck)
        # embd = self.cls_embedding(sources_cls, sources_fnt, targets_cls, targets_fnt)
        vis_embd = self.vis_encoder(sources_psr, targets_psr, targets_cls)
        tf.logging.info("vis_embd shape: %s", vis_embd.shape)
        sampled_bottleneck = vis_embd

        with tf.variable_scope('render2cmd_v3_internal'):
            # override bottleneck, or return it, if requested
            # if 'bottleneck' in features:
            #     if common_layers.shape_list(features['bottleneck'])[0] == 0:
            #         # return sampled_bottleneck,
            #         # set losses['training'] = 0 so self.top() doesn't get called on it
            #         print("RETURNRETURNRETURNRETURNRETURNRETURNRETURNRETURNRETURNRETURNRETURN")
            #         return sampled_bottleneck, {'training': 0.0}
            #     else:
            #         # we want to use the given bottleneck
            #         sampled_bottleneck = features['bottleneck']

            # finalize bottleneck
            unbottleneck_dim = hparams.hidden_size * 2  # twice because using LSTM
            if hparams.twice_decoder:
                unbottleneck_dim = unbottleneck_dim * 2

            dec_initial_state = []

            # LSTM encoder
            _, encoder_output_states = self.lstm_encoder(
                common_layers.flatten4d3d(sources), hparams)

            tf.logging.info("targets shape: %s", targets.shape)
            tf.logging.info("sampled_bottleneck shape: %s",
                            sampled_bottleneck.shape)
            tf.logging.info("sources shape: %s", sources.shape)
            for hi in range(hparams.num_hidden_layers):
                unbottleneck = self.unbottleneck(sampled_bottleneck,
                                                 unbottleneck_dim,
                                                 name_append='_{}'.format(hi))
                c, h = encoder_output_states[hi]
                # print(unbottleneck.shape)
                # print(c.shape, h.shape)
                # first_dim = common_layers.shape_list(unbottleneck)[0]
                # print(first_dim)
                # c = tf.tile(c,[first_dim,1])
                # h = tf.tile(h,[first_dim,1])
                # input()
                dec_initial_state.append(
                    tf.nn.rnn_cell.LSTMStateTuple(
                        c=tf.concat(
                            [unbottleneck[:, :unbottleneck_dim // 2], c], 1),
                        h=tf.concat(
                            [unbottleneck[:, unbottleneck_dim // 2:], h], 1)))

            dec_initial_state = tuple(dec_initial_state)
            # print('checkshape dec_initial_state')
            # print(dec_initial_state)
            # input()
            shifted_targets = common_layers.shift_right(targets)
            # Add 1 to account for the padding added to the left from shift_right
            targets_length = common_layers.length_from_embedding(
                shifted_targets) + 1

            # LSTM decoder
            hparams_decoder = copy.copy(hparams)
            if hparams.twice_decoder:
                hparams_decoder.hidden_size = 2 * hparams.hidden_size

            if hparams.mode == tf.estimator.ModeKeys.PREDICT:
                decoder_outputs, _ = self.lstm_decoder_infer(
                    common_layers.flatten4d3d(shifted_targets),
                    targets_length,
                    hparams_decoder,
                    targets_cls,
                    train,
                    initial_state=dec_initial_state,
                    bottleneck=sampled_bottleneck)
            else:
                decoder_outputs, _ = self.lstm_decoder(
                    common_layers.flatten4d3d(shifted_targets),
                    targets_length,
                    hparams_decoder,
                    targets_cls,
                    train,
                    initial_state=dec_initial_state,
                    bottleneck=sampled_bottleneck)

            ret = tf.expand_dims(decoder_outputs, axis=2)
        return ret, losses
Example #39
    def render2cmd_v3_internal(self, features, hparams, train):
        # inputs and targets are both sequences with
        # shape = [batch, seq_len, 1, hparams.problem.feature_dim]
        targets = features['targets']
        losses = {}

        sampled_bottleneck = self.pretrained_visual_encoder(features, hparams)
        if hparams.sg_bottleneck:
            sampled_bottleneck = tf.stop_gradient(sampled_bottleneck)

        with tf.variable_scope('render2cmd_v3_internal'):
            # override bottleneck, or return it, if requested
            if 'bottleneck' in features:
                if common_layers.shape_list(features['bottleneck'])[0] == 0:
                    # return sampled_bottleneck,
                    # set losses['training'] = 0 so self.top() doesn't get called on it
                    return sampled_bottleneck, {'training': 0.0}
                else:
                    # we want to use the given bottleneck
                    sampled_bottleneck = features['bottleneck']

            # finalize bottleneck
            unbottleneck_dim = hparams.hidden_size * 2  # twice because using LSTM
            if hparams.twice_decoder:
                unbottleneck_dim = unbottleneck_dim * 2

            # unbottleneck back to LSTMStateTuple
            dec_initial_state = []
            for hi in range(hparams.num_hidden_layers):
                unbottleneck = self.unbottleneck(sampled_bottleneck,
                                                 unbottleneck_dim,
                                                 name_append='_{}'.format(hi))
                dec_initial_state.append(
                    rnn.LSTMStateTuple(
                        c=unbottleneck[:, :unbottleneck_dim // 2],
                        h=unbottleneck[:, unbottleneck_dim // 2:]))

            dec_initial_state = tuple(dec_initial_state)

            shifted_targets = common_layers.shift_right(targets)
            # Add 1 to account for the padding added to the left from shift_right
            targets_length = common_layers.length_from_embedding(
                shifted_targets) + 1

            # LSTM decoder
            hparams_decoder = copy.copy(hparams)
            if hparams.twice_decoder:
                hparams_decoder.hidden_size = 2 * hparams.hidden_size

            if hparams.mode == tf.estimator.ModeKeys.PREDICT:
                decoder_outputs, _ = self.lstm_decoder_infer(
                    common_layers.flatten4d3d(shifted_targets),
                    targets_length,
                    hparams_decoder,
                    features['targets_cls'],
                    train,
                    initial_state=dec_initial_state,
                    bottleneck=sampled_bottleneck)
            else:
                decoder_outputs, _ = self.lstm_decoder(
                    common_layers.flatten4d3d(shifted_targets),
                    targets_length,
                    hparams_decoder,
                    features['targets_cls'],
                    train,
                    initial_state=dec_initial_state,
                    bottleneck=sampled_bottleneck)

            ret = tf.expand_dims(decoder_outputs, axis=2)

        return ret, losses