Python get_optimizer Examples

Programming Language: Python

Namespace/Package Name: optimization_ops

Method/Function: get_optimizer

Examples at hotexamples.com: 3

Python get_optimizer - 3 examples found. These are the top-rated real-world Python examples of optimization_ops.get_optimizer, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

    def __init__(self,
                 is_training,
                 learning_rate=1.0,
                 optimizer="sgd",
                 max_grad_norm=5,
                 num_layers=2,
                 use_lstm=True,
                 num_steps=35,
                 num_steps_valid=120,
                 proj_size=650,
                 hidden_size=650,
                 hidden_proj=650,
                 num_samples=512,
                 init_scale=0.1,
                 dropout_rate=0.0,
                 lr_decay=0.8,
                 batch_size=20,
                 attentive=False,
                 projection_attention_f=None,
                 output_form=lm_ops.OUTPUT_CONCAT,
                 vocab_size=10000):
        """Build the training and validation graphs of the language model.

        Args:
            is_training: when false, the constructor returns after the costs
                and final states are built; no train op or saver is created.
            learning_rate: initial value of the `self.learning_rate` variable.
            optimizer: optimizer name forwarded to
                `optimization_ops.get_optimizer`.
            max_grad_norm: global-norm threshold used to clip the gradients.
            num_layers, hidden_size, proj_size, use_lstm, dropout_rate:
                forwarded to `cells.build_lm_multicell_rnn`; `proj_size` is
                also the width of the embedding matrix.
            num_steps: number of per-step training placeholders.
            num_steps_valid: number of per-step validation placeholders.
            hidden_proj: when > 0, used as the cell's hidden projection and as
                the input width of the output projection (otherwise
                `hidden_size` is used for the latter).
            num_samples: number of classes drawn by the sampled softmax; the
                sampled loss is used only when `0 < num_samples < vocab_size`.
            init_scale: bound of the uniform initialisers ([-scale, scale]).
            lr_decay: factor applied by `self.learning_rate_decay_op`.
            batch_size: size of the training zero state, increment of
                `samples_seen`, and normaliser of both costs.
            attentive: use `lm_ops.apply_attentive_lm` instead of
                `lm_ops.apply_lm`; requires `projection_attention_f`.
            projection_attention_f, output_form: forwarded to
                `lm_ops.apply_attentive_lm`.
            vocab_size: number of rows of the embedding matrix and number of
                output classes.
        """
        with tf.device("/gpu:0"):

            if attentive:
                assert projection_attention_f is not None, (
                    "projection_attention_f is required when attentive=True")

            self.batch_size = batch_size
            self.num_steps = num_steps
            self.num_steps_valid = num_steps_valid

            self._input_data_train = []
            self._targets_train = []
            self.mask_train = []

            # NOTE(review): `input_data_train` / `targets_train` (and the
            # *_valid counterparts) are read without the leading underscore;
            # presumably the class exposes them as properties over the
            # private lists assigned above - confirm against the class body.
            for i in xrange(num_steps):  # one placeholder set per time step
                self.input_data_train.append(
                    tf.placeholder(tf.int32,
                                   shape=[None],
                                   name="input_train{0}".format(i)))
                self.targets_train.append(
                    tf.placeholder(tf.int32,
                                   shape=[None],
                                   name="target_train{0}".format(i)))
                self.mask_train.append(
                    tf.placeholder(tf.float32,
                                   shape=[None],
                                   name="mask_train{0}".format(i)))

            self._input_data_valid = []
            self._targets_valid = []
            self.mask_valid = []

            for i in xrange(num_steps_valid):  # one set per time step
                self.input_data_valid.append(
                    tf.placeholder(tf.int32,
                                   shape=[None],
                                   name="input_valid{0}".format(i)))
                self.targets_valid.append(
                    tf.placeholder(tf.int32,
                                   shape=[None],
                                   name="target_valid{0}".format(i)))
                self.mask_valid.append(
                    tf.placeholder(tf.float32,
                                   shape=[None],
                                   name="mask_valid{0}".format(i)))

            # A non-positive hidden_proj disables the hidden projection.
            hidden_projection = hidden_proj if hidden_proj > 0 else None

            self.cell = cells.build_lm_multicell_rnn(
                num_layers,
                hidden_size,
                proj_size,
                use_lstm=use_lstm,
                hidden_projection=hidden_projection,
                dropout=dropout_rate)

            self.dropout_feed = tf.placeholder(tf.float32, name="dropout_rate")

            self._initial_state_train = self.cell.zero_state(
                batch_size, tf.float32)
            self._initial_state_valid = self.cell.zero_state(1, tf.float32)

            # learning rate ops
            self.learning_rate = tf.Variable(float(learning_rate),
                                             trainable=False)
            self.learning_rate_decay_op = self.learning_rate.assign(
                self.learning_rate * lr_decay)

            # epoch ops
            self.epoch = tf.Variable(0, trainable=False)
            self.epoch_update_op = self.epoch.assign(self.epoch + 1)

            # samples seen ops
            self.samples_seen = tf.Variable(0, trainable=False)
            self.samples_seen_update_op = self.samples_seen.assign(
                self.samples_seen + batch_size)
            self.samples_seen_reset_op = self.samples_seen.assign(0)

            # global step variable - controlled by the model
            self.global_step = tf.Variable(0.0, trainable=False)

            # average loss ops
            self.current_ppx = tf.Variable(1.0, trainable=False)
            self.current_loss = tf.Variable(0.0, trainable=False)

            # early-stopping bookkeeping
            self.best_eval_ppx = tf.Variable(numpy.inf, trainable=False)
            self.estop_counter = tf.Variable(0, trainable=False)
            self.estop_counter_update_op = self.estop_counter.assign(
                self.estop_counter + 1)
            self.estop_counter_reset_op = self.estop_counter.assign(0)

            # Symmetric range [-init_scale, init_scale]; the original passed
            # +init_scale for both bounds, which is a degenerate range.
            initializer = tf.random_uniform_initializer(minval=-init_scale,
                                                        maxval=init_scale,
                                                        seed=_SEED)

            # Input width of the output projection.
            out_proj = hidden_proj if hidden_proj > 0 else hidden_size

            with tf.device("/cpu:0"):
                w = tf.get_variable("proj_w", [out_proj, vocab_size])
                w_t = tf.transpose(w)
                b = tf.get_variable("proj_b", [vocab_size])
            self.output_projection = (w, b)

            # Sampled softmax only makes sense if we sample less than
            # vocabulary size.
            sampled_softmax = 0 < num_samples < vocab_size
            loss_function = None

            if sampled_softmax:

                def sampled_loss(inputs, labels):
                    # `inputs` are the unprojected RNN outputs of one step.
                    with tf.device("/cpu:0"):
                        labels = tf.reshape(labels, [-1, 1])
                        return tf.nn.sampled_softmax_loss(
                            w_t, b, inputs, labels, num_samples, vocab_size)

                loss_function = sampled_loss

            with tf.device("/cpu:0"):
                embedding = tf.Variable(tf.random_uniform(
                    [vocab_size, proj_size],
                    minval=-init_scale,
                    maxval=init_scale),
                                        name="embedding")

                # One embedded tensor per time-step placeholder.
                inputs_train = [
                    tf.nn.embedding_lookup(embedding, step_ids)
                    for step_ids in self.input_data_train
                ]
                inputs_valid = [
                    tf.nn.embedding_lookup(embedding, step_ids)
                    for step_ids in self.input_data_valid
                ]

            with tf.variable_scope("RNN", initializer=initializer):

                if attentive:
                    outputs_train, state_train, _ = lm_ops.apply_attentive_lm(
                        self.cell,
                        inputs_train,
                        sequence_length=array_ops.squeeze(
                            math_ops.add_n(self.mask_train)),
                        projection_attention_f=projection_attention_f,
                        output_form=output_form,
                        dropout=self.dropout_feed,
                        initializer=initializer,
                        dtype=tf.float32)

                    outputs_valid, state_valid, _ = lm_ops.apply_attentive_lm(
                        self.cell,
                        inputs_valid,
                        sequence_length=array_ops.squeeze(
                            math_ops.add_n(self.mask_valid)),
                        projection_attention_f=projection_attention_f,
                        output_form=output_form,
                        dropout=self.dropout_feed,
                        initializer=initializer,
                        dtype=tf.float32)

                else:
                    outputs_train, state_train = lm_ops.apply_lm(
                        self.cell,
                        inputs_train,
                        sequence_length=math_ops.add_n(self.mask_train),
                        dropout=self.dropout_feed,
                        dtype=tf.float32)

                    outputs_valid, state_valid = lm_ops.apply_lm(
                        self.cell,
                        inputs_valid,
                        sequence_length=math_ops.add_n(self.mask_valid),
                        dropout=self.dropout_feed,
                        dtype=tf.float32)

                if sampled_softmax:
                    # The sampled loss applies the projection itself, so it
                    # is fed the raw RNN outputs.
                    logits_train = outputs_train
                else:
                    logits_train = [
                        tf.nn.xw_plus_b(o, self.output_projection[0],
                                        self.output_projection[1])
                        for o in outputs_train
                    ]

                # Validation always scores against the full vocabulary:
                # sampled softmax is a training-time estimate only.
                logits_valid = [
                    tf.nn.xw_plus_b(o, self.output_projection[0],
                                    self.output_projection[1])
                    for o in outputs_valid
                ]

            # Previously `loss_function` was built but never passed on, so
            # with sampled softmax the unprojected outputs were scored as if
            # they were vocabulary logits.
            loss_train = seq2seq.sequence_loss_by_example(
                logits_train,
                self.targets_train,
                self.mask_train,
                average_across_timesteps=True,
                softmax_loss_function=loss_function)

            loss_valid = seq2seq.sequence_loss_by_example(
                logits_valid,
                self.targets_valid,
                self.mask_valid,
                average_across_timesteps=True)

            self._cost_train = cost = tf.reduce_sum(loss_train) / float(
                batch_size)
            self._final_state_train = state_train

            # NOTE(review): the validation cost is normalised by the training
            # batch_size although the validation zero state has batch size 1;
            # confirm this is intended.
            self._cost_valid = tf.reduce_sum(loss_valid) / float(batch_size)
            self._final_state_valid = state_valid

            if not is_training:
                return

            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                              max_grad_norm)

            # Hand the optimizer the learning-rate *variable* so that running
            # learning_rate_decay_op actually changes the step size; the
            # Python float would freeze it at its initial value.
            opt = optimization_ops.get_optimizer(optimizer,
                                                 self.learning_rate)
            self._train_op = opt.apply_gradients(zip(grads, tvars),
                                                 global_step=self.global_step)
            self._valid_op = tf.no_op()

            self.saver = tf.train.Saver(tf.all_variables())
            self.saver_best = tf.train.Saver(tf.all_variables())

Example #2

Show file

File: nmt_models.py Project: yeab/tsf_nmt

    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 source_proj_size,
                 target_proj_size,
                 encoder_size,
                 decoder_size,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 optimizer='sgd',
                 input_feeding=False,
                 combine_inp_attn=False,
                 dropout=0.0,
                 attention_f=None,
                 window_size=10,
                 content_function=None,
                 decoder_attention_f="None",
                 num_samples=512,
                 forward_only=False,
                 max_len=100,
                 cpu_only=False,
                 early_stop_patience=0,
                 save_best_model=True,
                 dtype=tf.float32):
        """Build the translation model graph.

        Args:
            source_vocab_size: number of rows of the source embedding.
            target_vocab_size: number of rows of the target embedding and
                number of output classes.
            buckets: list of (source_length, target_length) pairs; the last
                one determines how many placeholders are created.
            source_proj_size, target_proj_size: embedding widths, also
                forwarded to `cells.build_nmt_bidirectional_cell`.
            encoder_size, decoder_size: cell sizes forwarded to
                `cells.build_nmt_bidirectional_cell`; `decoder_size` is also
                the input width of the output projection.
            max_gradient_norm: global-norm threshold used to clip gradients.
            batch_size: stored on the model and used as the increment of
                `samples_seen`.
            learning_rate: initial value of the `self.learning_rate` variable.
            learning_rate_decay_factor: factor applied by
                `self.learning_rate_decay_op`.
            optimizer: optimizer name forwarded to
                `optimization_ops.get_optimizer`.
            input_feeding, combine_inp_attn, attention_f, window_size,
            content_function: stored on the model and forwarded to
                `attention_decoder_nmt` in forward-only mode.
            dropout: stored on the model and forwarded to the cell builder.
            decoder_attention_f: the string "None" is stored as None on
                `self.decoder_attention_f`.
            num_samples: number of classes drawn by the sampled softmax; used
                only when `0 < num_samples < target_vocab_size`.
            forward_only: build the encode / single-step decode graph instead
                of the bucketed training graph.
            max_len: in forward-only mode, encoder placeholders are extended
                up to this count.
            cpu_only: place the graph on "/cpu:0" instead of "/gpu:0".
            early_stop_patience, save_best_model: when either is enabled the
                early-stopping variables are created, otherwise they are None.
            dtype: stored on the model and forwarded to the decoder.
        """
        super(NMTModel, self).__init__()

        device = "/cpu:0" if cpu_only else "/gpu:0"

        with tf.device(device):

            self.source_vocab_size = source_vocab_size
            self.target_vocab_size = target_vocab_size
            self.buckets = buckets
            self.batch_size = batch_size
            self.attention_f = attention_f
            self.content_function = content_function
            self.window_size = window_size

            self.combine_inp_attn = combine_inp_attn

            # The string "None" is the "disabled" sentinel for this option.
            if decoder_attention_f == "None":
                self.decoder_attention_f = None
            else:
                self.decoder_attention_f = decoder_attention_f

            # learning rate ops
            self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
            self.learning_rate_decay_op = self.learning_rate.assign(
                self.learning_rate * learning_rate_decay_factor)

            # epoch ops
            self.epoch = tf.Variable(0, trainable=False)
            self.epoch_update_op = self.epoch.assign(self.epoch + 1)

            # samples seen ops
            self.samples_seen = tf.Variable(0, trainable=False)
            self.samples_seen_update_op = self.samples_seen.assign(self.samples_seen + batch_size)
            self.samples_seen_reset_op = self.samples_seen.assign(0)

            # global step variable - controlled by the model
            self.global_step = tf.Variable(0.0, trainable=False)

            # average loss ops
            self.current_loss = tf.Variable(0.0, trainable=False)
            self.current_loss_update_op = None
            self.avg_loss = tf.Variable(0.0, trainable=False)
            # NOTE(review): global_step starts at 0.0, so running this op
            # before the first update divides by zero.
            self.avg_loss_update_op = self.avg_loss.assign(tf.div(self.current_loss, self.global_step))

            if early_stop_patience > 0 or save_best_model:
                self.best_eval_loss = tf.Variable(numpy.inf, trainable=False)
                self.estop_counter = tf.Variable(0, trainable=False)
                self.estop_counter_update_op = self.estop_counter.assign(self.estop_counter + 1)
                self.estop_counter_reset_op = self.estop_counter.assign(0)
            else:
                self.best_eval_loss = None
                self.estop_counter = None
                self.estop_counter_update_op = None
                self.estop_counter_reset_op = None

            self.source_proj_size = source_proj_size
            self.target_proj_size = target_proj_size
            self.encoder_size = encoder_size
            self.decoder_size = decoder_size

            self.input_feeding = input_feeding

            self.max_len = max_len
            self.dropout = dropout
            self.dropout_feed = tf.placeholder(tf.float32, name="dropout_rate")
            self.step_num = tf.Variable(0, trainable=False)

            self.dtype = dtype

            # If we use sampled softmax, we need an output projection.
            loss_function = None

            with tf.device("/cpu:0"):
                w = tf.get_variable("proj_w", [decoder_size, self.target_vocab_size])
                w_t = tf.transpose(w)
                b = tf.get_variable("proj_b", [self.target_vocab_size])
            self.output_projection = (w, b)

            # Sampled softmax only makes sense if we sample less than vocabulary size.
            self.sampled_softmax = 0 < num_samples < self.target_vocab_size

            if self.sampled_softmax:
                def sampled_loss(inputs, labels):
                    with tf.device("/cpu:0"):
                        labels = tf.reshape(labels, [-1, 1])
                        return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples,
                                                          self.target_vocab_size)

                loss_function = sampled_loss

            # create the embedding matrix - this must be done in the CPU for now
            with tf.device("/cpu:0"):
                self.src_embedding = tf.Variable(
                    tf.truncated_normal(
                        [source_vocab_size, source_proj_size], stddev=0.01
                    ),
                    name='embedding_src'
                )

                # decoder with attention
                with tf.name_scope('decoder_with_attention') as scope:
                    # create this variable to be used inside the embedding_attention_decoder
                    self.tgt_embedding = tf.Variable(
                        tf.truncated_normal(
                            [target_vocab_size, target_proj_size], stddev=0.01
                        ),
                        name='embedding'
                    )

            # Create the internal multi-layer cell for our RNN.
            self.encoder_cell_fw, self.encoder_cell_bw, self.decoder_cell = cells.build_nmt_bidirectional_cell(
                encoder_size, decoder_size, source_proj_size, target_proj_size, dropout=dropout)

            # The seq2seq function: we use embedding for the input and attention.
            def seq2seq_f(encoder_inputs, decoder_inputs):
                return self.inference(encoder_inputs, decoder_inputs)

            # Feeds for inputs.
            self.encoder_inputs = []
            self.decoder_inputs = []
            self.target_weights = []

            for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
                self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))

            # One extra decoder step so that targets can be the inputs shifted by one.
            for i in xrange(buckets[-1][1] + 1):
                self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None, ], name="decoder{0}".format(i)))
                self.target_weights.append(tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i)))

            # Our targets are decoder inputs shifted by one.
            targets = self.decoder_inputs[1:]

            self.decoder_states_holders = None

            if forward_only:

                # Extend the encoder feeds up to max_len placeholders.
                for i in xrange(len(self.encoder_inputs), self.max_len):
                    self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))

                b_size = array_ops.shape(self.encoder_inputs[0])[0]

                # context, decoder_initial_state, attention_states, input_length
                self.ret0, self.ret1, self.ret2 = self.encode(self.encoder_inputs, b_size)

                self.decoder_init_plcholder = tf.placeholder(tf.float32,
                                                             shape=[None, (target_proj_size) * 2],
                                                             name="decoder_init")

                # shape of this placeholder: the first None indicate the batch size and the second the input length
                self.attn_plcholder = tf.placeholder(tf.float32,
                                                     shape=[None, self.ret2.get_shape()[1], target_proj_size],
                                                     name="attention_states")

                if self.decoder_attention_f is not None:
                    self.decoder_states_holders = tf.placeholder(tf.float32, shape=[None, None, 1, decoder_size],
                                                                 name="decoder_state")

                # NOTE(review): the raw `decoder_attention_f` argument (possibly
                # the string "None") is forwarded here rather than the
                # normalised self.decoder_attention_f - confirm the decoder
                # handles the sentinel.
                self.logits, self.states = attention_decoder_nmt(
                    decoder_inputs=[self.decoder_inputs[0]], initial_state=self.decoder_init_plcholder,
                    attention_states=self.attn_plcholder, cell=self.decoder_cell,
                    num_symbols=target_vocab_size, attention_f=attention_f,
                    window_size=window_size, content_function=content_function,
                    decoder_attention_f=decoder_attention_f, combine_inp_attn=combine_inp_attn,
                    input_feeding=input_feeding, dropout=self.dropout_feed, initializer=None,
                    dtype=dtype
                )

                # If we use output projection, we need to project outputs for decoding.
                self.logits = tf.nn.xw_plus_b(self.logits[-1], self.output_projection[0], self.output_projection[1])
                self.logits = nn_ops.softmax(self.logits)

            else:

                tf_version = pkg_resources.get_distribution("tensorflow").version

                # The two oldest releases use the library's bucketed model,
                # whose signature differs from the local model_with_buckets.
                if tf_version in ("0.6.0", "0.5.0"):

                    self.outputs, self.losses = seq2seq.model_with_buckets(
                        encoder_inputs=self.encoder_inputs, decoder_inputs=self.decoder_inputs,
                        targets=targets, weights=self.target_weights, num_decoder_symbols=self.target_vocab_size,
                        buckets=buckets, seq2seq=seq2seq_f, softmax_loss_function=loss_function)

                else:

                    self.outputs, self.losses = model_with_buckets(
                        encoder_inputs=self.encoder_inputs, decoder_inputs=self.decoder_inputs,
                        targets=targets, weights=self.target_weights, buckets=buckets,
                        seq2seq_f=seq2seq_f, softmax_loss_function=loss_function)

                # Gradients and update operation for training the model.
                params = tf.trainable_variables()
                self.gradient_norms = []
                self.updates = []
                # Hand the optimizer the learning-rate *variable* so that
                # learning_rate_decay_op actually changes the step size.
                opt = optimization_ops.get_optimizer(optimizer, self.learning_rate)
                # `bucket_id` instead of `b`: `b` is the projection bias
                # captured by the sampled_loss closure and must not be rebound.
                for bucket_id in xrange(len(buckets)):
                    gradients = tf.gradients(self.losses[bucket_id], params)
                    clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                                     max_gradient_norm)
                    self.gradient_norms.append(norm)
                    self.updates.append(opt.apply_gradients(
                        zip(clipped_gradients, params), global_step=self.global_step))

            self.saver = tf.train.Saver(tf.all_variables())
            self.saver_best = tf.train.Saver(tf.all_variables())

Example #3

Show file

File: lm_models.py Project: chagge/attentive_lm

    def __init__(self,
                 is_training,
                 learning_rate=1.0,
                 optimizer="sgd",
                 max_grad_norm=5,
                 num_layers=2,
                 use_lstm=True,
                 num_steps=35,
                 num_steps_valid=120,
                 proj_size=650,
                 hidden_size=650,
                 hidden_proj=650,
                 num_samples=512,
                 init_scale=0.1,
                 dropout_rate=0.0,
                 lr_decay=0.8,
                 batch_size=20,
                 attentive=False,
                 projection_attention_f=None,
                 output_form=lm_ops.OUTPUT_CONCAT,
                 vocab_size=10000):
        """Construct the train / validation graphs of the language model.

        Args:
            is_training: if false, stop after the costs and final states are
                defined (no train op, no savers).
            learning_rate: initial value stored in `self.learning_rate`.
            optimizer: name passed to `optimization_ops.get_optimizer`.
            max_grad_norm: clipping threshold for the global gradient norm.
            num_layers, hidden_size, proj_size, use_lstm, dropout_rate:
                passed to `cells.build_lm_multicell_rnn`; `proj_size` is also
                the embedding width.
            num_steps: how many per-step training placeholders to create.
            num_steps_valid: how many per-step validation placeholders to
                create.
            hidden_proj: if > 0, the cell's hidden projection size and the
                input width of the output projection; otherwise the latter
                falls back to `hidden_size`.
            num_samples: sample count of the sampled softmax, which is active
                only when `0 < num_samples < vocab_size`.
            init_scale: uniform initialisation bound ([-scale, scale]).
            lr_decay: multiplier applied by `self.learning_rate_decay_op`.
            batch_size: training zero-state size, `samples_seen` increment
                and divisor of both costs.
            attentive: select `lm_ops.apply_attentive_lm` over
                `lm_ops.apply_lm`; needs `projection_attention_f`.
            projection_attention_f, output_form: passed to
                `lm_ops.apply_attentive_lm`.
            vocab_size: embedding rows and number of output classes.
        """
        with tf.device("/gpu:0"):

            if attentive:
                assert projection_attention_f is not None, \
                    "projection_attention_f is required when attentive=True"

            self.batch_size = batch_size
            self.num_steps = num_steps
            self.num_steps_valid = num_steps_valid

            self._input_data_train = []
            self._targets_train = []
            self.mask_train = []

            # NOTE(review): the non-underscore names used below are presumably
            # properties over the private lists - confirm in the class body.
            for i in xrange(num_steps):  # one placeholder set per time step
                self.input_data_train.append(tf.placeholder(tf.int32, shape=[None], name="input_train{0}".format(i)))
                self.targets_train.append(tf.placeholder(tf.int32, shape=[None], name="target_train{0}".format(i)))
                self.mask_train.append(tf.placeholder(tf.float32, shape=[None], name="mask_train{0}".format(i)))

            self._input_data_valid = []
            self._targets_valid = []
            self.mask_valid = []

            for i in xrange(num_steps_valid):  # one placeholder set per time step
                self.input_data_valid.append(tf.placeholder(tf.int32, shape=[None], name="input_valid{0}".format(i)))
                self.targets_valid.append(tf.placeholder(tf.int32, shape=[None], name="target_valid{0}".format(i)))
                self.mask_valid.append(tf.placeholder(tf.float32, shape=[None], name="mask_valid{0}".format(i)))

            # hidden_proj <= 0 switches the hidden projection off.
            hidden_projection = hidden_proj if hidden_proj > 0 else None

            self.cell = cells.build_lm_multicell_rnn(num_layers, hidden_size, proj_size, use_lstm=use_lstm,
                                                     hidden_projection=hidden_projection, dropout=dropout_rate)

            self.dropout_feed = tf.placeholder(tf.float32, name="dropout_rate")

            self._initial_state_train = self.cell.zero_state(batch_size, tf.float32)
            self._initial_state_valid = self.cell.zero_state(1, tf.float32)

            # learning rate ops
            self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
            self.learning_rate_decay_op = self.learning_rate.assign(self.learning_rate * lr_decay)

            # epoch ops
            self.epoch = tf.Variable(0, trainable=False)
            self.epoch_update_op = self.epoch.assign(self.epoch + 1)

            # samples seen ops
            self.samples_seen = tf.Variable(0, trainable=False)
            self.samples_seen_update_op = self.samples_seen.assign(self.samples_seen + batch_size)
            self.samples_seen_reset_op = self.samples_seen.assign(0)

            # global step variable - controlled by the model
            self.global_step = tf.Variable(0.0, trainable=False)

            # average loss ops
            self.current_ppx = tf.Variable(1.0, trainable=False)
            self.current_loss = tf.Variable(0.0, trainable=False)

            # early-stopping bookkeeping
            self.best_eval_ppx = tf.Variable(numpy.inf, trainable=False)
            self.estop_counter = tf.Variable(0, trainable=False)
            self.estop_counter_update_op = self.estop_counter.assign(self.estop_counter + 1)
            self.estop_counter_reset_op = self.estop_counter.assign(0)

            # minval must be negative: passing +init_scale for both bounds
            # (as before) collapses the range to a single value.
            initializer = tf.random_uniform_initializer(minval=-init_scale, maxval=init_scale, seed=_SEED)

            # Input width of the output projection.
            out_proj = hidden_proj if hidden_proj > 0 else hidden_size

            with tf.device("/cpu:0"):
                w = tf.get_variable("proj_w", [out_proj, vocab_size])
                w_t = tf.transpose(w)
                b = tf.get_variable("proj_b", [vocab_size])
            self.output_projection = (w, b)

            # Sampled softmax only makes sense if we sample less than vocabulary size.
            sampled_softmax = 0 < num_samples < vocab_size
            loss_function = None

            if sampled_softmax:

                def sampled_loss(inputs, labels):
                    # `inputs` are the unprojected RNN outputs of one step.
                    with tf.device("/cpu:0"):
                        labels = tf.reshape(labels, [-1, 1])
                        return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples, vocab_size)

                loss_function = sampled_loss

            with tf.device("/cpu:0"):
                embedding = tf.Variable(
                    tf.random_uniform(
                        [vocab_size, proj_size],
                        minval=-init_scale, maxval=init_scale
                    ),
                    name="embedding"
                )

                # One embedded tensor per time-step placeholder.
                inputs_train = [tf.nn.embedding_lookup(embedding, ids) for ids in self.input_data_train]
                inputs_valid = [tf.nn.embedding_lookup(embedding, ids) for ids in self.input_data_valid]

            with tf.variable_scope("RNN", initializer=initializer):

                if attentive:
                    outputs_train, state_train, _ = lm_ops.apply_attentive_lm(
                        self.cell, inputs_train, sequence_length=array_ops.squeeze(math_ops.add_n(self.mask_train)),
                        projection_attention_f=projection_attention_f, output_form=output_form,
                        dropout=self.dropout_feed, initializer=initializer, dtype=tf.float32
                    )

                    outputs_valid, state_valid, _ = lm_ops.apply_attentive_lm(
                        self.cell, inputs_valid, sequence_length=array_ops.squeeze(math_ops.add_n(self.mask_valid)),
                        projection_attention_f=projection_attention_f, output_form=output_form,
                        dropout=self.dropout_feed, initializer=initializer, dtype=tf.float32
                    )

                else:
                    outputs_train, state_train = lm_ops.apply_lm(
                        self.cell, inputs_train, sequence_length=math_ops.add_n(self.mask_train),
                        dropout=self.dropout_feed, dtype=tf.float32
                    )

                    outputs_valid, state_valid = lm_ops.apply_lm(
                        self.cell, inputs_valid, sequence_length=math_ops.add_n(self.mask_valid),
                        dropout=self.dropout_feed, dtype=tf.float32
                    )

                if sampled_softmax:
                    # The sampled loss projects internally; feed raw outputs.
                    logits_train = outputs_train
                else:
                    logits_train = [tf.nn.xw_plus_b(o, self.output_projection[0], self.output_projection[1])
                                    for o in outputs_train]

                # Validation is always scored with the full softmax; sampled
                # softmax is only a training-time estimate.
                logits_valid = [tf.nn.xw_plus_b(o, self.output_projection[0], self.output_projection[1])
                                for o in outputs_valid]

            # `loss_function` used to be defined but never handed over, which
            # scored unprojected outputs as vocabulary logits.
            loss_train = seq2seq.sequence_loss_by_example(
                logits_train, self.targets_train, self.mask_train, average_across_timesteps=True,
                softmax_loss_function=loss_function
            )

            loss_valid = seq2seq.sequence_loss_by_example(
                logits_valid, self.targets_valid, self.mask_valid, average_across_timesteps=True
            )

            self._cost_train = cost = tf.reduce_sum(loss_train) / float(batch_size)
            self._final_state_train = state_train

            # NOTE(review): divided by the training batch_size even though the
            # validation zero state has batch size 1 - confirm intent.
            self._cost_valid = tf.reduce_sum(loss_valid) / float(batch_size)
            self._final_state_valid = state_valid

            if not is_training:
                return

            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                              max_grad_norm)

            # Use the learning-rate variable (not the Python float) so that
            # learning_rate_decay_op affects subsequent updates.
            opt = optimization_ops.get_optimizer(optimizer, self.learning_rate)
            self._train_op = opt.apply_gradients(zip(grads, tvars), global_step=self.global_step)
            self._valid_op = tf.no_op()

            self.saver = tf.train.Saver(tf.all_variables())
            self.saver_best = tf.train.Saver(tf.all_variables())