Ejemplo n.º 1
0
def MoS(x, hidden_size, vocab_size, n_experts=10):
    '''
    Mixture-of-Softmaxes output layer, as specified in arXiv:1711.03953.

    Relatively untested.

    x - tensor, dtype = tf.float32,
        shape = [batch_size, sequence_size, hidden_size]; the first two
        dimensions are read with tf.shape and used to shape the result
    hidden_size - the final dimension of x, int
    vocab_size - the size of the vocabulary to produce probabilities for, int
    n_experts - the number of softmax experts that are mixed, int

    returns - tensor of mixed probabilities,
        shape = [batch_size, sequence_size, vocab_size]
    '''
    batch_size = tf.shape(x)[0]
    sequence_size = tf.shape(x)[1]
    with tf.variable_scope('latent'):
        # one tanh-activated latent vector of width hidden_size per expert
        latent = utils.dense(x,
                             output_dim=n_experts * hidden_size,
                             name='latent')
        latent = tf.nn.tanh(latent)

    with tf.variable_scope('decoder'):
        # split the experts into separate rows so a single vocabulary
        # projection is shared by all of them
        latent = tf.reshape(latent, [-1, hidden_size])
        logit = utils.dense(latent, output_dim=vocab_size, name='decoder')

    with tf.variable_scope('prior'):
        # mixture weights over the experts
        prior_logit = utils.dense(x, output_dim=n_experts, name='prior')
        prior_logit = tf.reshape(prior_logit, [-1, n_experts])
        prior = tf.nn.softmax(prior_logit, axis=-1)
        # per-expert distributions over the vocabulary
        prob = tf.reshape(
            tf.nn.softmax(tf.reshape(logit, [-1, vocab_size]), axis=-1),
            [-1, n_experts, vocab_size])
        # [N, n_experts, 1] broadcasts against [N, n_experts, vocab_size],
        # so no tiled copy of the prior has to be materialised
        prob = prob * tf.expand_dims(prior, axis=2)
        prob = tf.reduce_sum(prob, axis=1)
        prob = tf.reshape(prob, [batch_size, sequence_size, vocab_size])
    return prob
Ejemplo n.º 2
0
 def transformer_ffd(self, x):
     '''
     Feed-forward block: dense -> dropout -> activation -> dense.

     x - input tensor handed to the first dense layer
     returns - output of the second dense layer, whose output dimension is
         self.arg.hidden_size
     '''
     expanded = utils.dense(x,
                            output_dim=self.arg.filter_size,
                            use_bias=True,
                            name='ffd_1')
     expanded = self.dropout_fn(expanded)
     # self.arg.use_relu selects ReLU; otherwise GELU is applied
     activation = tf.nn.relu if self.arg.use_relu else utils.gelu
     activated = activation(expanded)
     return utils.dense(activated,
                        output_dim=self.arg.hidden_size,
                        use_bias=True,
                        name='ffd_2')
Ejemplo n.º 3
0
 def __call__(self, inputs, state, scope=None, *args, **kwargs):
     '''
     Run the wrapped cell on inputs extended with an attention context.

     An additive (tanh) score is computed from self.encoder_output and the
     current state, softmax-normalised over axis 1, and the weighted sum of
     self.encoder_output over that axis is concatenated onto inputs before
     self.cell is called.

     inputs - input tensor for the current step
     state - current state; also used as the attention query
     scope, *args, **kwargs - forwarded unchanged to self.cell
     returns - whatever self.cell returns for the extended inputs
     '''
     with tf.variable_scope('attention'):
         # extra axis at position 1, presumably so the state broadcasts
         # over the time axis of self.encoder_output
         query = tf.expand_dims(state, axis=1)
         projected_memory = utils.dense(self.encoder_output,
                                        output_dim=self.state_size,
                                        name='W1')
         projected_query = utils.dense(query,
                                       output_dim=self.state_size,
                                       name='W2')
         score = utils.dense(tf.nn.tanh(projected_memory + projected_query),
                             output_dim=1,
                             name='V')
         attention_weights = tf.nn.softmax(score, axis=1)
         context_vector = tf.reduce_sum(
             attention_weights * self.encoder_output, axis=1)
         inputs = tf.concat([inputs, context_vector], axis=1)
     return self.cell.__call__(inputs, state, scope=scope, *args, **kwargs)
        def body(x, i, halting_probability, remainders, n_updates):
            '''
            One application of the shared decoder layer with adaptive halting.

            x - current decoder representation
            i - step counter; returned incremented by one
            halting_probability, remainders, n_updates - halting bookkeeping,
                handed to `act` together with the pondering value and
                returned as updated by it

            Reads act, halt_threshold, memory, decoder_self_attention_bias
            and encoder_decoder_attention_bias from the enclosing scope.

            returns - (x, i + 1, halting_probability, remainders, n_updates)
            '''
            attention_depth = self.arg.head_size * self.arg.num_heads
            with tf.variable_scope('decoder_layer'):
                # kept so the new value can be blended with the incoming one
                carried_state = x
                x += self.timing_position(x)
                halt_logit = utils.dense(x, output_dim=1, name='pondering')
                halt_value = tf.nn.sigmoid(tf.squeeze(halt_logit, axis=-1))
                (update_weights, halting_probability, remainders,
                 n_updates) = act(halt_value, halt_threshold,
                                  halting_probability, remainders, n_updates)
                with tf.variable_scope('attention'):
                    sublayer = utils.layer_norm(x)
                    sublayer = utils.multihead_attention(
                        query=sublayer,
                        memory=None,
                        bias=decoder_self_attention_bias,
                        total_key_depth=attention_depth,
                        total_value_depth=attention_depth,
                        output_depth=self.arg.hidden_size,
                        num_heads=self.arg.num_heads,
                        deparameterize=self.arg.deparameterize,
                        dropout_keep_prob=self.keep_prob,
                        dropout_type=self.arg.dropout_type,
                        relative_attention=self.arg.relative_attention,
                        max_relative_position=self.arg.max_relative_position)
                    x += self.dropout_fn(sublayer)
                with tf.variable_scope('encoder_attention'):
                    sublayer = utils.layer_norm(x)
                    sublayer = utils.multihead_attention(
                        query=sublayer,
                        memory=memory,
                        bias=encoder_decoder_attention_bias,
                        total_key_depth=attention_depth,
                        total_value_depth=attention_depth,
                        output_depth=self.arg.hidden_size,
                        num_heads=self.arg.num_heads,
                        dropout_keep_prob=self.keep_prob,
                        dropout_type=self.arg.dropout_type,
                        relative_attention=False,
                        max_relative_position=self.arg.max_relative_position)
                    x += self.dropout_fn(sublayer)
                with tf.variable_scope('ffd'):
                    sublayer = self.ffd(utils.layer_norm(x))
                    x += self.dropout_fn(sublayer)

                # interpolate between the freshly computed value and the
                # value that entered this step, weighted by update_weights
                x = (x * update_weights) + (carried_state *
                                            (1 - update_weights))
            return x, i + 1, halting_probability, remainders, n_updates
Ejemplo n.º 5
0
def SRU(x,
        num_layers=2,
        activation=None,
        initial_state=None,
        name=None,
        reuse=None,
        reuse_layer=False):
    '''
    SRU introduced in arXiv:1709.02755
    code based on tensor2tensor

    x - tensor, dtype = tf.float32
        shape = [batch_size, sequence_size, hidden_size];
        the last dimension has to be statically known
    num_layers - number of stacked SRU layers, int
    activation - optional callable applied to the cell states of each layer
    initial_state - optional initial cell state,
        shape = [batch_size, hidden_size]; zeros when None.
        The same initial state is used for every layer
    name - variable scope name, 'SRU' when None
    reuse - reuse flag handed to the outer variable scope
    reuse_layer - when True, all layers share the variables of the first layer

    returns - tensor with the same shape as x

    relies on a `next_state` step function defined outside of this function
    '''
    with tf.variable_scope(name, default_name='SRU', reuse=reuse):
        tf_x_shape = tf.shape(x)
        x_shape = x.shape.as_list()
        # time-major layout: tf.scan iterates over the leading axis
        x = tf.transpose(x, perm=[1, 0, 2], name='input_transpose')
        if initial_state is None:
            initial_state = tf.zeros(
                [tf.shape(x)[1], tf.shape(x)[2]], dtype=x.dtype)
        for i in range(num_layers):
            # Variables can only be reused from a scope of the same name, so
            # sharing layers re-enter the first layer's scope. Opening
            # 'layer_2' with reuse=True would look for variables that were
            # never created.
            if reuse_layer:
                layer_scope = 'layer_1'
            else:
                layer_scope = 'layer_{}'.format(i + 1)
            with tf.variable_scope(layer_scope,
                                   reuse=(i != 0 and reuse_layer)):
                x_orig = x
                # a single projection yields the candidate, the forget gate
                # and the reset gate
                x, f, r = tf.split(utils.dense(x,
                                               output_dim=3 * x_shape[-1],
                                               name='dense'),
                                   num_or_size_splits=3,
                                   axis=-1)
                f, r = tf.sigmoid(f), tf.sigmoid(r)
                x_times_one_minus_f = x * (1.0 - f)
                # the recurrence over time is delegated to next_state
                c_states = tf.scan(next_state, (x_times_one_minus_f, f),
                                   initializer=initial_state,
                                   parallel_iterations=2,
                                   name='scan_{}'.format(i))
                if activation is not None:
                    c_states = activation(c_states)
                # highway connection gated by r
                h = c_states * r + (1.0 - r) * x_orig
                x = h
        # back to batch-major
        x = tf.transpose(x, perm=[1, 0, 2])
        return tf.reshape(x, tf_x_shape)
Ejemplo n.º 6
0
 def decoder(self, inputs, memory, decoder_self_attention_bias,
             encoder_decoder_attention_bias):
     '''
     Stack of self.arg.decoder_layers decoder layers followed by a final
     layer norm.

     Every layer applies, each as a residual update on x:
       1. a widened self-attention plus an encoder attention, summed
       2. two separable-convolution branches (kernel sizes 11 and 7) that
          are added, normalised and fed to a third separable convolution
       3. self-attention
       4. encoder attention
       5. a swish-activated dense block

     inputs - decoder input tensor; the zero padding built below assumes
         shape = [batch_size, sequence_size, hidden_size]
     memory - tensor attended to by the encoder-attention sub-layers
     decoder_self_attention_bias, encoder_decoder_attention_bias - accepted
         but not read here; the attributes of the same name on self are
         handed to the attention calls instead

     When self.arg.adaptive_mask is set, the result of
     utils.multihead_attention is indexed as a pair: element 0 is used as the
     attention output and element 1 is collected in self.decoder_l0.

     returns - layer-normalised output of the last layer
     '''
     x = inputs
     if self.arg.adaptive_mask:
         self.decoder_l0 = []
     attention_depth = self.arg.head_size * self.arg.num_heads
     # head count of the widened self-attention: doubled, capped at 16, but
     # never fewer than self.arg.num_heads
     wide_heads = max(min(self.arg.num_heads * 2, 16), self.arg.num_heads)
     wide_depth = self.arg.head_size * wide_heads
     # channel counts of the two convolution branches
     left_filters = self.arg.hidden_size * 2
     right_filters = int(self.arg.hidden_size / 2)
     for layer in range(1, self.arg.decoder_layers + 1):
         with tf.variable_scope('layer_{}'.format(layer)):
             with tf.variable_scope('16_head_self_attention'):
                 y = utils.layer_norm(x)
                 left_state = utils.multihead_attention(
                     query=y,
                     memory=None,
                     bias=self.decoder_self_attention_bias,
                     total_key_depth=wide_depth,
                     total_value_depth=wide_depth,
                     output_depth=self.arg.hidden_size,
                     num_heads=wide_heads,
                     dropout_keep_prob=self.keep_prob,
                     dropout_type=self.arg.dropout_type,
                     name='self_attention',
                     relative_attention=self.arg.relative_attention,
                     max_relative_position=self.arg.max_relative_position,
                     adaptive_mask=self.arg.adaptive_mask,
                     dynamic_attention_span=self.arg.dynamic_attention_span)
                 if self.arg.adaptive_mask:
                     self.decoder_l0.append(left_state[1])
                     left_state = left_state[0]
                 right_state = utils.multihead_attention(
                     query=y,
                     memory=memory,
                     bias=self.encoder_decoder_attention_bias,
                     total_key_depth=attention_depth,
                     total_value_depth=attention_depth,
                     output_depth=self.arg.hidden_size,
                     num_heads=self.arg.num_heads,
                     dropout_keep_prob=self.keep_prob,
                     dropout_type=self.arg.dropout_type,
                     name='encoder_attention',
                     relative_attention=False,
                     max_relative_position=self.arg.max_relative_position,
                     adaptive_mask=self.arg.adaptive_mask,
                     dynamic_attention_span=self.arg.dynamic_attention_span)
                 if self.arg.adaptive_mask:
                     self.decoder_l0.append(right_state[1])
                     right_state = right_state[0]
                 x += self.dropout_fn(left_state) + self.dropout_fn(
                     right_state)
             with tf.variable_scope('conv_branches'):
                 y = utils.layer_norm(x)
                 # Unidirectional mode: prepend kernel_size - 1 zero steps
                 # and convolve with 'VALID' padding, so the output at step
                 # t only sees steps <= t.
                 if self.arg.unidirectional_decoder:
                     left_state = tf.concat([
                         tf.zeros(
                             [self.batch_size, 10, self.arg.hidden_size]), y
                     ],
                                            axis=1)
                     padding = 'VALID'
                 else:
                     padding = 'SAME'
                     left_state = y
                 left_state = utils.separable_conv(
                     left_state,
                     filters=left_filters,
                     kernel_size=11,
                     padding=padding,
                     name='separable_11x1')
                 if self.arg.use_relu:
                     left_state = tf.nn.relu(left_state)
                 else:
                     left_state = utils.gelu(left_state)
                 left_state = self.dropout_fn(left_state)
                 if self.arg.unidirectional_decoder:
                     right_state = tf.concat([
                         tf.zeros(
                             [self.batch_size, 6, self.arg.hidden_size]), y
                     ],
                                             axis=1)
                     padding = 'VALID'
                 else:
                     padding = 'SAME'
                     right_state = y
                 right_state = utils.separable_conv(
                     right_state,
                     filters=right_filters,
                     kernel_size=7,
                     padding=padding,
                     name='separable_7x1')
                 # Zero-pad the narrow branch up to the channel count of the
                 # wide branch so the two can be added. The width is derived
                 # from the two filter counts so the shapes also agree for
                 # an odd hidden_size.
                 right_state = tf.pad(
                     right_state,
                     paddings=[[0, 0], [0, 0],
                               [0, left_filters - right_filters]],
                     constant_values=0)
                 y = left_state + right_state
                 y = utils.layer_norm(y)
                 if self.arg.unidirectional_decoder:
                     y = tf.concat([
                         tf.zeros([
                             self.batch_size, 6, self.arg.hidden_size * 2
                         ]), y
                     ],
                                   axis=1)
                     padding = 'VALID'
                 else:
                     padding = 'SAME'
                 y = utils.separable_conv(y,
                                          filters=self.arg.hidden_size,
                                          kernel_size=7,
                                          padding=padding,
                                          name='separable_7x1_2')
                 x += self.dropout_fn(y)
             with tf.variable_scope('self_attention'):
                 y = utils.layer_norm(x)
                 y = utils.multihead_attention(
                     query=y,
                     memory=None,
                     bias=self.decoder_self_attention_bias,
                     total_key_depth=attention_depth,
                     total_value_depth=attention_depth,
                     output_depth=self.arg.hidden_size,
                     num_heads=self.arg.num_heads,
                     dropout_keep_prob=self.keep_prob,
                     dropout_type=self.arg.dropout_type,
                     relative_attention=self.arg.relative_attention,
                     max_relative_position=self.arg.max_relative_position,
                     adaptive_mask=self.arg.adaptive_mask,
                     dynamic_attention_span=self.arg.dynamic_attention_span)
                 if self.arg.adaptive_mask:
                     self.decoder_l0.append(y[1])
                     y = y[0]
                 x += self.dropout_fn(y)
             with tf.variable_scope('encoder_attention'):
                 y = utils.layer_norm(x)
                 y = utils.multihead_attention(
                     query=y,
                     memory=memory,
                     bias=self.encoder_decoder_attention_bias,
                     total_key_depth=attention_depth,
                     total_value_depth=attention_depth,
                     output_depth=self.arg.hidden_size,
                     num_heads=self.arg.num_heads,
                     dropout_keep_prob=self.keep_prob,
                     dropout_type=self.arg.dropout_type,
                     relative_attention=False,
                     max_relative_position=self.arg.max_relative_position,
                     adaptive_mask=self.arg.adaptive_mask,
                     dynamic_attention_span=self.arg.dynamic_attention_span)
                 if self.arg.adaptive_mask:
                     self.decoder_l0.append(y[1])
                     y = y[0]
                 x += self.dropout_fn(y)
             with tf.variable_scope('dense_layers'):
                 y = utils.layer_norm(x)
                 y = utils.dense(y,
                                 output_dim=self.arg.hidden_size * 4,
                                 name='dense_1')
                 y = tf.nn.swish(y)
                 y = utils.layer_norm(y)
                 y = utils.dense(y,
                                 output_dim=self.arg.hidden_size,
                                 name='dense_2')
                 x += self.dropout_fn(y)
     return utils.layer_norm(x)
Ejemplo n.º 7
0
 def encoder(self, inputs, encoder_self_attention_bias):
     '''
     Stack of self.arg.encoder_layers encoder layers followed by a final
     layer norm.

     Every layer applies, each as a residual update on x: a gated linear
     unit, a pair of branches (dense and width-3 convolution) followed by a
     separable convolution, self-attention, and a two-layer dense block.

     inputs - encoder input tensor; the zero padding built below assumes
         shape = [batch_size, sequence_size, hidden_size]
     encoder_self_attention_bias - accepted but not read here;
         self.encoder_self_attention_bias is handed to the attention call
     returns - layer-normalised output of the last layer

     When self.arg.adaptive_mask is set, the result of
     utils.multihead_attention is indexed as a pair: element 0 is used as the
     attention output and element 1 is collected in self.encoder_l0.
     '''
     x = inputs
     if self.arg.adaptive_mask:
         self.encoder_l0 = []
     for layer in range(1, self.arg.encoder_layers + 1):
         with tf.variable_scope('layer_{}'.format(layer)):
             with tf.variable_scope('gated_linear_unit'):
                 y = utils.layer_norm(x)
                 y = utils.convolution_gating(
                     y,
                     kernel_size=1,
                     input_dim=y.shape.as_list()[-1],
                     output_dim=y.shape.as_list()[-1])
                 y = self.dropout_fn(y)
                 x += y
             with tf.variable_scope('conv_branches'):
                 y = utils.layer_norm(x)
                 # left branch: dense projection to hidden_size * 4 followed
                 # by the configured activation
                 if self.arg.use_relu:
                     left_state = tf.nn.relu(
                         utils.dense(y,
                                     output_dim=int(self.arg.hidden_size *
                                                    4),
                                     name='left_branch'))
                 else:
                     # NOTE(review): this dense layer is named 'right_branch'
                     # while the ReLU path names it 'left_branch', and
                     # 'right_branch' is also the variable scope opened just
                     # below. Looks like a copy-paste slip; confirm before
                     # renaming, since that changes variable names.
                     left_state = utils.gelu(
                         utils.dense(y,
                                     output_dim=int(self.arg.hidden_size *
                                                    4),
                                     name='right_branch'))
                 left_state = self.dropout_fn(left_state)
                 # right branch: width-3 convolution to hidden_size / 2
                 # channels, using an explicitly created kernel
                 with tf.variable_scope('right_branch'):
                     kernel = tf.get_variable('kernel',
                                              shape=[
                                                  3,
                                                  y.shape.as_list()[-1],
                                                  int(self.arg.hidden_size /
                                                      2)
                                              ],
                                              dtype=tf.float32)
                     '''
         given that the tensor, at this point, is of shape [batch_size, sequence_size, hidden_size], 
         and the kernel size of the convolution is 3, 
         then an unmoderated convolution, at time-step t, would analyze the time-steps (t-1, t, t+1) for the output of time-step t
         If the anaylsis is unidirectional, and that the analysis at time-step t cannot see 'ahead through time', this form of analysis if invalid.
         Therefore, in order to avoid this illegal analysis, a zeros vector is concatenated to the left of the tensor.
         Therefore, at time-step t, the time-steps (t-2, t-1, t) is analyzed, where the tokens at -2 and -1 are 0
         If the analysis is bidirectional, analyzing the time-steps (t-1, t, t+1) is a legal move
         '''
                     if self.arg.unidirectional_encoder:
                         padding = 'VALID'
                         y = tf.concat([
                             tf.zeros([
                                 self.batch_size, 2, self.arg.hidden_size
                             ]), y
                         ],
                                       axis=1)
                     else:
                         padding = 'SAME'
                     right_state = tf.nn.convolution(
                         y,
                         kernel,
                         padding=padding,
                         name='convolution_conv_3x1')
                     if self.arg.use_relu:
                         right_state = tf.nn.relu(right_state)
                     else:
                         right_state = utils.gelu(right_state)
                     right_state = self.dropout_fn(right_state)
                 # zero-pad the last axis of the right branch from
                 # hidden_size / 2 up to hidden_size * 4 so it can be added
                 # to the left branch
                 right_state = tf.pad(right_state,
                                      [[0, 0], [0, 0],
                                       [
                                           0,
                                           int(self.arg.hidden_size * 4) -
                                           int(self.arg.hidden_size / 2)
                                       ]],
                                      constant_values=0)
                 y = left_state + right_state
                 y = utils.layer_norm(y)
                 # same left zero-padding scheme as above, here with
                 # kernel_size - 1 = 8 steps for the width-9 convolution
                 if self.arg.unidirectional_encoder:
                     padding = 'VALID'
                     y = tf.concat([
                         tf.zeros([
                             self.batch_size, 8, self.arg.hidden_size * 4
                         ]), y
                     ],
                                   axis=1)
                 else:
                     padding = 'SAME'
                 y = utils.separable_conv(y,
                                          filters=int(self.arg.hidden_size /
                                                      2),
                                          kernel_size=9,
                                          padding=padding,
                                          name='separable_9x1')
                 # zero-pad the last axis by hidden_size / 2 before the
                 # residual add (reaches hidden_size when it is even)
                 y = tf.pad(
                     y,
                     [[0, 0], [0, 0], [0, int(self.arg.hidden_size / 2)]],
                     constant_values=0)
                 x += self.dropout_fn(y)
             with tf.variable_scope('self_attention'):
                 y = utils.layer_norm(x)
                 y = utils.multihead_attention(
                     query=y,
                     memory=None,
                     bias=self.encoder_self_attention_bias,
                     total_key_depth=self.arg.head_size *
                     self.arg.num_heads,
                     total_value_depth=self.arg.head_size *
                     self.arg.num_heads,
                     output_depth=self.arg.hidden_size,
                     num_heads=self.arg.num_heads,
                     dropout_keep_prob=self.keep_prob,
                     dropout_type=self.arg.dropout_type,
                     relative_attention=self.arg.relative_attention,
                     max_relative_position=self.arg.max_relative_position,
                     adaptive_mask=self.arg.adaptive_mask,
                     dynamic_attention_span=self.arg.dynamic_attention_span)
                 # with adaptive_mask the call result is indexed as a pair:
                 # [0] is the attention output, [1] is collected separately
                 if self.arg.adaptive_mask:
                     self.encoder_l0.append(y[1])
                     y = y[0]
                 x += self.dropout_fn(y)
             with tf.variable_scope('dense_layers'):
                 y = utils.layer_norm(x)
                 y = utils.dense(y,
                                 output_dim=int(self.arg.hidden_size * 4),
                                 name='dense_1')
                 if self.arg.use_relu:
                     y = tf.nn.relu(y)
                 else:
                     y = utils.gelu(y)
                 y = self.dropout_fn(y)
                 y = utils.dense(y,
                                 output_dim=int(self.arg.hidden_size),
                                 name='dense_2')
                 x += self.dropout_fn(y)
     return utils.layer_norm(x)
Ejemplo n.º 8
0
    def encoder(self, inputs, encoder_self_attention_bias):
        '''
        Encoder stack using relative positional attention over cached memory.

        A sinusoidal embedding of the positions and two learned attention
        biases are built once. Each of the self.arg.encoder_layers layers
        then attends over the concatenation of its entry in self.memory and
        the current input, followed by a feed-forward sub-layer. Both
        sub-layers are residual.

        inputs - encoder input tensor
        encoder_self_attention_bias - accepted but not read here;
            self.encoder_self_attention_bias is used instead
        returns - layer-normalised output of the last layer

        side effect - appends one entry per layer to self.new_mems
        '''
        attention_depth = self.arg.head_size * self.arg.num_heads
        with tf.variable_scope('positional_embedding'):
            # positions count down to 0 over the last axis of the bias
            key_length = tf.shape(self.encoder_self_attention_bias)[-1]
            pos_seq = tf.range(key_length - 1, -1, -1.0)
            exponent = (tf.range(0, self.arg.hidden_size, 2.0) /
                        self.arg.hidden_size)
            inv_freq = 1 / (10000**exponent)
            sinusoid_inp = tf.einsum('i,j->ij', pos_seq, inv_freq)
            pos_emb = tf.concat(
                [tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], axis=-1)
            pos_emb = tf.tile(pos_emb[None, :, :], [self.batch_size, 1, 1])
            # tied weights: one bias shared by all layers; otherwise a
            # leading axis holds a separate bias per layer
            bias_shape = [1, self.arg.num_heads, 1, self.arg.head_size]
            if not self.arg.tie_weights:
                bias_shape = [self.arg.encoder_layers] + bias_shape
            r_w_bias = tf.get_variable('r_w_bias',
                                       shape=bias_shape,
                                       dtype=tf.float32)
            r_r_bias = tf.get_variable('r_r_bias',
                                       shape=bias_shape,
                                       dtype=tf.float32)

        x = inputs
        for layer in range(1, self.arg.encoder_layers + 1):
            layer_index = layer - 1
            with tf.variable_scope('layer_{}'.format(layer)):
                x = self.timing_position(x)
                with tf.variable_scope('attention'):

                    self.new_mems.append(
                        self._cache_mem(x, self.memory[layer_index]))

                    # keys and values cover the cached memory plus the
                    # current input
                    memory = tf.concat([self.memory[layer_index], x], axis=1)
                    normed_x = utils.layer_norm(x)
                    memory = utils.layer_norm(memory)
                    q, k, v = utils.compute_qkv(
                        query=normed_x,
                        memory=memory,
                        total_key_depth=attention_depth,
                        total_value_depth=attention_depth,
                        deparameterize=self.arg.deparameterize)
                    r = utils.dense(pos_emb,
                                    output_dim=attention_depth,
                                    use_bias=False,
                                    name='pos_emb')
                    # NOTE(review): r is reshaped directly while q, k and v
                    # go through utils.split_heads; confirm both produce
                    # the same layout.
                    r = tf.reshape(r, [
                        self.batch_size, self.arg.num_heads, -1,
                        self.arg.head_size
                    ])
                    q = utils.split_heads(q, self.arg.num_heads)
                    k = utils.split_heads(k, self.arg.num_heads)
                    v = utils.split_heads(v, self.arg.num_heads)

                    tied = self.arg.tie_weights
                    # content term
                    content_bias = r_w_bias if tied else r_w_bias[layer_index]
                    AD = tf.matmul(q + content_bias, k, transpose_b=True)
                    # position term
                    position_bias = (r_r_bias
                                     if tied else r_r_bias[layer_index])
                    BD = tf.matmul(q + position_bias, r, transpose_b=True)

                    BD = self.rel_shift(BD)

                    # NOTE(review): the scores are divided by the size of
                    # the last axis of k itself, not its square root;
                    # confirm this is intended.
                    logits = (AD + BD) / k.shape.as_list()[-1]
                    logits += self.encoder_self_attention_bias
                    weights = tf.nn.softmax(logits, name='attention_weights')
                    attended = utils.combine_heads(tf.matmul(weights, v))
                    # fix the static size of the last axis for the dense
                    # layer that follows
                    attended.set_shape(attended.shape.as_list()[:-1] +
                                       [attention_depth])
                    with tf.variable_scope('output'):
                        attended = utils.dense(
                            attended,
                            output_dim=self.arg.hidden_size,
                            use_bias=False,
                            name='output_transform')
                    x += self.dropout_fn(attended)

                with tf.variable_scope('ffd'):
                    hidden = self.ffd(utils.layer_norm(x))
                    x += self.dropout_fn(hidden)
        with tf.variable_scope('output'):
            return utils.layer_norm(x)
Ejemplo n.º 9
0
    def __init__(self, arg, name=None):
        '''
        Build the graph of a Seq2Seq model based on arXiv:1804.00946.
        The stop-feature mechanism in particular follows that paper.

        arg - configuration object; read here for mask_loss, loss,
            target_vocab_size, label_smoothing, weight_decay_regularization
            and weight_decay_hyperparameter
        name - model name, 'Seq2Seq' when falsy

        Creates the input placeholders, runs embedding -> encode -> decode
        -> output projection, and wires up the loss, the optimizer and the
        accuracy tensors as attributes.
        '''
        self.name = name if name else 'Seq2Seq'
        # fixed placeholder sizes only when this module is run as a script;
        # otherwise every dimension is left unknown
        if __name__ != '__main__':
            batch_size = input_sequence_size = output_sequence_size = None
        else:
            batch_size, input_sequence_size, output_sequence_size = 32, 10, 12
        input_shape = [batch_size, input_sequence_size]
        target_shape = [batch_size, output_sequence_size]
        self.arg = arg
        self.inputs = tf.placeholder(tf.int32,
                                     shape=input_shape,
                                     name='inputs')
        self.targets = tf.placeholder(tf.int32,
                                      shape=target_shape,
                                      name='targets')
        self.training = tf.placeholder(tf.bool, name='training')
        self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        # one extra feature per token, concatenated onto the embeddings below
        self.input_stop_feature = tf.placeholder(tf.float32,
                                                 shape=input_shape + [1],
                                                 name='input_stop_feature')
        self.target_stop_feature = tf.placeholder(tf.float32,
                                                  shape=target_shape + [1],
                                                  name='target_stop_feature')
        self.batch_size = tf.shape(self.inputs)[0]
        self.input_sequence_size = tf.shape(self.inputs)[1]
        self.target_sequence_size = tf.shape(self.targets)[1]

        # optional per-position weights, (batch_size, output_sequence_size)
        self.loss_mask = None
        if self.arg.mask_loss:
            self.loss_mask = tf.placeholder(tf.float32,
                                            shape=target_shape,
                                            name='loss_mask')

        with tf.variable_scope('embedding'):
            embedded_inputs, embedded_targets = self.embedding()
            embedded_inputs = tf.concat(
                [embedded_inputs, self.input_stop_feature], axis=2)
            embedded_targets = tf.concat(
                [embedded_targets, self.target_stop_feature], axis=2)
        with tf.variable_scope('encode'):
            encoder_output, encoder_state = self.encode(embedded_inputs)
            encoder_output = self.dropout_fn(encoder_output)
        with tf.variable_scope('decode'):
            decoder_output, _ = self.decode(encoder_output, encoder_state,
                                            embedded_targets)
            decoder_output = self.dropout_fn(decoder_output)
        with tf.variable_scope('output'):
            self.logits = utils.dense(decoder_output,
                                      output_dim=self.arg.target_vocab_size,
                                      name='logits')
        with tf.variable_scope('loss'):
            self.loss_cl = loss.Loss(self.logits,
                                     self.targets,
                                     self.arg.loss,
                                     vocab_size=self.arg.target_vocab_size,
                                     label_smoothing=self.arg.label_smoothing)
            cost = self.loss_cl.loss
            if self.arg.mask_loss:
                cost = cost * self.loss_mask
            self.cost = tf.reduce_mean(cost)
            if self.arg.weight_decay_regularization:
                l2_loss = self.loss_cl.l2_loss(tf.trainable_variables())
                self.cost += l2_loss * self.arg.weight_decay_hyperparameter
            self.optimizer = optimize.Optimizer(
                arg, loss=self.cost, learning_rate=self.learning_rate)
            # The optimizer attributes below are read only after the
            # corresponding call, as in the original ordering.
            self.optimizer.accuracy(self.logits,
                                    self.targets,
                                    mask=self.loss_mask)
            self.train_op = self.optimizer.train_op
            self.predict = self.optimizer.predict
            self.correct_prediction = self.optimizer.correct_prediction
            self.accuracy = self.optimizer.accuracy
            self.optimizer.sequential_accuracy(self.logits,
                                               self.targets,
                                               mask=self.loss_mask)
            self.sequential_accuracy = self.optimizer.sequential_accuracy
            self.fetches = [embedded_inputs, encoder_output, self.logits]