def _createStackBidirectionalDynamicRNN(self,
                                            use_gpu,
                                            use_shape,
                                            use_state_tuple,
                                            initial_states_fw=None,
                                            initial_states_bw=None,
                                            scope=None):
        """Build a stacked bidirectional dynamic LSTM graph for the tests.

        Two layers (2 and 3 units) are created per direction and stored on
        `self.cells_fw` / `self.cells_bw`; `self.layers` is set as well.

        Args:
          use_gpu: Accepted for signature compatibility; not read here.
          use_shape: If true, the input placeholder gets a static batch
            dimension; otherwise the batch dimension is left as None.
          use_state_tuple: Accepted for signature compatibility; not read
            here (the cells are always built with `state_is_tuple=False`).
          initial_states_fw: Forwarded to `stack_bidirectional_dynamic_rnn`.
          initial_states_bw: Forwarded to `stack_bidirectional_dynamic_rnn`.
          scope: Forwarded to `stack_bidirectional_dynamic_rnn`.

        Returns:
          A tuple `(input_value, inputs, outputs, st_fw, st_bw,
          sequence_length)`.
        """
        self.layers = [2, 3]
        input_size, batch_size, max_length = 5, 2, 8

        initializer = init_ops.random_uniform_initializer(
            -0.01, 0.01, seed=self._seed)
        sequence_length = array_ops.placeholder(dtypes.int64)

        def build_cells():
            # One non-tuple-state LSTM cell per entry of self.layers.
            return [
                core_rnn_cell_impl.LSTMCell(units,
                                            input_size,
                                            initializer=initializer,
                                            state_is_tuple=False)
                for units in self.layers
            ]

        self.cells_fw = build_cells()
        self.cells_bw = build_cells()

        # The same placeholder object is used for every time step.
        batch_dim = batch_size if use_shape else None
        step_input = array_ops.placeholder(dtypes.float32,
                                           shape=(batch_dim, input_size))
        inputs = [step_input] * max_length

        # Stack along time, then swap the first two axes.
        batch_major_inputs = array_ops.transpose(
            array_ops.stack(inputs), [1, 0, 2])
        outputs, st_fw, st_bw = rnn.stack_bidirectional_dynamic_rnn(
            self.cells_fw,
            self.cells_bw,
            batch_major_inputs,
            initial_states_fw=initial_states_fw,
            initial_states_bw=initial_states_bw,
            dtype=dtypes.float32,
            sequence_length=sequence_length,
            scope=scope)

        # Outputs has shape (batch_size, max_length, 2 * layers[-1]).
        expected_shape = [batch_dim, max_length, 2 * self.layers[-1]]
        self.assertAllEqual(outputs.get_shape().as_list(), expected_shape)

        input_value = np.random.randn(batch_size, input_size)

        return input_value, inputs, outputs, st_fw, st_bw, sequence_length
    def _createStackBidirectionalRNN(self,
                                     use_gpu,
                                     use_shape,
                                     use_sequence_length,
                                     initial_states_fw=None,
                                     initial_states_bw=None,
                                     scope=None):
        """Build a stacked bidirectional static LSTM graph for the tests.

        Two layers (2 and 3 units) are created per direction and stored on
        `self.cells_fw` / `self.cells_bw`; `self.layers` is set as well.

        Args:
          use_gpu: Accepted for signature compatibility; not read here.
          use_shape: If true, the input placeholder gets a static batch
            dimension; otherwise the batch dimension is left as None.
          use_sequence_length: If true, an int64 placeholder is created and
            passed as `sequence_length`; otherwise None is passed.
          initial_states_fw: Forwarded to `stack_bidirectional_rnn`.
          initial_states_bw: Forwarded to `stack_bidirectional_rnn`.
          scope: Forwarded to `stack_bidirectional_rnn`.

        Returns:
          A tuple `(input_value, inputs, outputs, state_fw, state_bw,
          sequence_length)` where `outputs` is the per-step outputs stacked
          into one tensor.
        """
        self.layers = [2, 3]
        input_size = 5
        batch_size = 2
        max_length = 8

        initializer = init_ops.random_uniform_initializer(-0.01,
                                                          0.01,
                                                          seed=self._seed)
        sequence_length = array_ops.placeholder(
            dtypes.int64) if use_sequence_length else None

        self.cells_fw = [
            core_rnn_cell_impl.LSTMCell(num_units,
                                        input_size,
                                        initializer=initializer,
                                        state_is_tuple=False)
            for num_units in self.layers
        ]
        self.cells_bw = [
            core_rnn_cell_impl.LSTMCell(num_units,
                                        input_size,
                                        initializer=initializer,
                                        state_is_tuple=False)
            for num_units in self.layers
        ]

        # The same placeholder object is used for every time step.
        inputs = max_length * [
            array_ops.placeholder(
                dtypes.float32,
                shape=(batch_size, input_size) if use_shape else
                (None, input_size))
        ]
        outputs, state_fw, state_bw = rnn.stack_bidirectional_rnn(
            self.cells_fw,
            self.cells_bw,
            inputs,
            initial_states_fw,
            initial_states_bw,
            dtype=dtypes.float32,
            sequence_length=sequence_length,
            scope=scope)

        self.assertEqual(len(outputs), len(inputs))
        expected_step_shape = [
            batch_size if use_shape else None, 2 * self.layers[-1]
        ]
        for out in outputs:
            # Shapes are lists (possibly containing None), so compare with
            # assertEqual: assertAlmostEqual would try to subtract the lists
            # on a mismatch and raise TypeError instead of a test failure.
            self.assertEqual(out.get_shape().as_list(), expected_step_shape)

        input_value = np.random.randn(batch_size, input_size)
        outputs = array_ops.stack(outputs)

        return input_value, inputs, outputs, state_fw, state_bw, sequence_length
Exemple #3
0
 def testUsingSecondCellInScopeWithExistingVariablesFails(self):
     """Calling a second LSTMCell in the same scope must raise ValueError."""
     # This test should go away when this behavior is no longer an
     # error (Approx. May 2017)
     first_cell = core_rnn_cell_impl.LSTMCell(3)
     second_cell = core_rnn_cell_impl.LSTMCell(3)
     step_input = array_ops.zeros([1, 3])
     # Both halves of the state tuple are the same zeros tensor.
     zero_state = array_ops.zeros([1, 3])
     state = core_rnn_cell_impl.LSTMStateTuple(zero_state, zero_state)
     first_cell(step_input, state)
     expected_message = r"LSTMCell\(..., reuse=True\)"
     with self.assertRaisesRegexp(ValueError, expected_message):
         second_cell(step_input, state)
Exemple #4
0
    def testCompatibleNames(self):
        """Variable names/shapes must match across the three LSTM variants.

        Builds a plain and a peephole cell for `LSTMCell`, `LSTMBlockCell`
        and `LSTMBlockFusedCell`, each in a fresh graph, and compares the
        resulting `{variable name: shape}` mappings.
        """

        def trainable_shapes():
            # Snapshot of the current graph's trainable variables.
            return {
                var.name: var.get_shape()
                for var in variables.trainable_variables()
            }

        def make_inputs():
            # Six references to one zeros tensor of shape [4, 5].
            return [array_ops.zeros([4, 5])] * 6

        def static_rnn_shapes(plain_cell, peephole_cell):
            steps = make_inputs()
            for rnn_cell_obj, scope_name in ((plain_cell, "basic"),
                                             (peephole_cell, "peephole")):
                core_rnn.static_rnn(rnn_cell_obj,
                                    steps,
                                    dtype=dtypes.float32,
                                    scope=scope_name)
            return trainable_shapes()

        with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()):
            basic_names = static_rnn_shapes(
                core_rnn_cell_impl.LSTMCell(10),
                core_rnn_cell_impl.LSTMCell(10, use_peepholes=True))

        with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()):
            block_names = static_rnn_shapes(
                lstm_ops.LSTMBlockCell(10),
                lstm_ops.LSTMBlockCell(10, use_peephole=True))

        with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()):
            fused_cell = lstm_ops.LSTMBlockFusedCell(10)
            fused_peephole_cell = lstm_ops.LSTMBlockFusedCell(
                10, use_peephole=True)
            steps = make_inputs()
            # The fused cells are invoked directly on the whole sequence.
            fused_cell(steps, dtype=dtypes.float32, scope="basic/lstm_cell")
            fused_peephole_cell(
                steps, dtype=dtypes.float32, scope="peephole/lstm_cell")
            fused_names = trainable_shapes()

        self.assertEqual(basic_names, block_names)
        self.assertEqual(basic_names, fused_names)
Exemple #5
0
def _create_multi_lstm_cell_ops(batch_size, num_units, input_depth,
                                num_layers, max_time, compiled):
  """Build a multi-layer LSTM dynamic_rnn graph plus its gradient ops.

  Args:
    batch_size: Batch dimension of the random input variable and zero state.
    num_units: Units of every LSTM layer.
    input_depth: Last dimension of the input variable.
    num_layers: Number of stacked LSTM layers.
    max_time: Leading (time) dimension of the input variable.
    compiled: If true, each layer is wrapped in `rnn_cell.CompiledWrapper`.

  Returns:
    A dict with keys "outputs", "final_state" (flattened), "outputs_grad"
    and "final_state_grad".
  """

  def build_layer():
    layer = core_rnn_cell_impl.LSTMCell(num_units)
    if compiled:
      return rnn_cell.CompiledWrapper(layer)
    return layer

  weight_init = init_ops.random_uniform_initializer(-0.1, 0.1, seed=2)
  with variable_scope.variable_scope("root", initializer=weight_init):
    input_values = random_ops.random_uniform(
        (max_time, batch_size, input_depth), seed=1)
    inputs = variable_scope.get_variable("inputs", initializer=input_values)

    layers = [build_layer() for _ in range(num_layers)]
    cell = core_rnn_cell_impl.MultiRNNCell(layers)
    initial_state = cell.zero_state(
        batch_size=batch_size, dtype=dtypes.float32)
    outputs, final_state = rnn.dynamic_rnn(
        cell=cell, inputs=inputs, initial_state=initial_state,
        time_major=True)
    flat_final_state = nest.flatten(final_state)

    # Differentiate with respect to weights, the inputs and the zero state.
    trainable_variables = variables.trainable_variables()
    wrt = trainable_variables + [inputs] + nest.flatten(initial_state)
    outputs_grad = gradients_impl.gradients([outputs], wrt)
    final_state_grad = gradients_impl.gradients(flat_final_state, wrt)

    result = {}
    result["outputs"] = outputs
    result["final_state"] = flat_final_state
    result["outputs_grad"] = outputs_grad
    result["final_state_grad"] = final_state_grad
    return result
 def testLSTMCell(self):
     """Smoke-test one step of a projected LSTMCell with concatenated state."""
     num_units, num_proj = 8, 6
     state_size = num_units + num_proj
     batch_size, input_size = 3, 2
     with self.test_session() as sess:
         const_init = init_ops.constant_initializer(0.5)
         with variable_scope.variable_scope("root", initializer=const_init):
             x = array_ops.zeros([batch_size, input_size])
             m = array_ops.zeros([batch_size, state_size])
             cell = core_rnn_cell_impl.LSTMCell(num_units=num_units,
                                                num_proj=num_proj,
                                                forget_bias=1.0,
                                                state_is_tuple=False)
             output, state = cell(x, m)
             sess.run([variables_lib.global_variables_initializer()])
             # The zeros tensors are overridden by name with real values.
             feeds = {
                 x.name: np.array([[1., 1.], [2., 2.], [3., 3.]]),
                 m.name: 0.1 * np.ones((batch_size, state_size)),
             }
             res = sess.run([output, state], feeds)
             self.assertEqual(len(res), 2)
             # The numbers in results were not calculated, this is mostly just a
             # smoke test.
             self.assertEqual(res[0].shape, (batch_size, num_proj))
             self.assertEqual(res[1].shape, (batch_size, state_size))
             # Different inputs so different outputs and states
             for row in range(1, batch_size):
                 for values in res:
                     distance = np.linalg.norm(values[0, :] - values[row, :])
                     self.assertTrue(float(distance) > 1e-6)
  def benchmarkTfRNNLSTMTraining(self):
    """Benchmark a gradient step through a stacked static LSTM on "/gpu:0".

    For every configuration returned by `self._GetTestConfig()` a fresh
    graph is built with `num_layers` LSTM layers, gradients of the outputs
    and final state with respect to all trainable variables are grouped
    into one op, and that op is handed to `self._BenchmarkOp`.
    """
    test_configs = self._GetTestConfig()
    for config_name, config in test_configs.items():
      num_layers = config["num_layers"]
      num_units = config["num_units"]
      batch_size = config["batch_size"]
      seq_length = config["seq_length"]

      with ops.Graph().as_default(), ops.device("/gpu:0"):
        inputs = seq_length * [
            array_ops.zeros([batch_size, num_units], dtypes.float32)
        ]
        initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=127)

        def make_cell():
          # Each layer needs its own cell object. The previous code built a
          # single LSTMCell instance and then "called" it with no arguments,
          # which invokes the cell's __call__ without inputs/state and fails.
          return core_rnn_cell_impl.LSTMCell(
              num_units=num_units, initializer=initializer,
              state_is_tuple=True)

        multi_cell = core_rnn_cell_impl.MultiRNNCell(
            [make_cell() for _ in range(num_layers)])
        outputs, final_state = core_rnn.static_rnn(
            multi_cell, inputs, dtype=dtypes.float32)
        trainable_variables = ops.get_collection(
            ops.GraphKeys.TRAINABLE_VARIABLES)
        gradients = gradients_impl.gradients([outputs, final_state],
                                             trainable_variables)
        training_op = control_flow_ops.group(*gradients)
        self._BenchmarkOp(training_op, "tf_rnn_lstm %s %s" %
                          (config_name, self._GetConfigDesc(config)))
Exemple #8
0
 def _testDropoutWrapper(self, batch_size=None, time_steps=None,
                         parallel_iterations=None, **kwargs):
   """Run a DropoutWrapper around a 3-unit LSTMCell through dynamic_rnn.

   When neither `batch_size` nor `time_steps` is given, a fixed input of
   2 time steps, batch size 1 and depth 3 is used; otherwise the input is
   random with shape (time_steps, batch_size, 3). Extra keyword arguments
   are forwarded to `DropoutWrapper`.

   Returns:
     The evaluated `[outputs, final_state]` pair.
   """
   with self.test_session() as sess:
     const_init = init_ops.constant_initializer(0.5)
     with variable_scope.variable_scope("root", initializer=const_init):
       use_fixed_input = batch_size is None and time_steps is None
       if use_fixed_input:
         # 2 time steps, batch size 1, depth 3
         batch_size, time_steps = 1, 2
         x = constant_op.constant(
             [[[2., 2., 2.]], [[1., 1., 1.]]], dtype=dtypes.float32)
       else:
         random_input = np.random.randn(time_steps, batch_size, 3)
         x = constant_op.constant(random_input.astype(np.float32))
       # One constant is shared by both halves of the initial state.
       state_part = constant_op.constant(
           [[0.1, 0.1, 0.1]] * batch_size, dtype=dtypes.float32)
       m = core_rnn_cell_impl.LSTMStateTuple(state_part, state_part)

       wrapped_cell = core_rnn_cell_impl.DropoutWrapper(
           core_rnn_cell_impl.LSTMCell(3),
           dtype=x.dtype,
           **kwargs)
       outputs, final_state = rnn.dynamic_rnn(
           cell=wrapped_cell,
           time_major=True,
           parallel_iterations=parallel_iterations,
           inputs=x, initial_state=m)
       sess.run([variables_lib.global_variables_initializer()])
       res = sess.run([outputs, final_state])
       output_val, state_val = res[0], res[1]
       self.assertEqual(output_val.shape, (time_steps, batch_size, 3))
       self.assertEqual(state_val.c.shape, (batch_size, 3))
       self.assertEqual(state_val.h.shape, (batch_size, 3))
       return res
Exemple #9
0
  def testLSTMBasicToBlockCellPeeping(self):
    """Compare a peephole `LSTMCell` stack with an `LSTMBlockCell` stack.

    A two-layer `MultiRNNCell` is built from each cell type under its own
    variable scope ("basic" and "block") with the same seeded initializer.
    Both are run for one step on the same fed input and state values, and
    the output and the four state tensors are asserted to be close.
    """
    with self.test_session(use_gpu=self._use_gpu) as sess:
      # `x` and the `m*` zeros tensors are only feed targets; the actual
      # values are supplied by tensor name in the `sess.run` calls below.
      x = array_ops.zeros([1, 2])
      x_values = np.random.randn(1, 2)

      m0_val = 0.1 * np.ones([1, 2])
      m1_val = -0.1 * np.ones([1, 2])
      m2_val = -0.2 * np.ones([1, 2])
      m3_val = 0.2 * np.ones([1, 2])

      initializer = init_ops.random_uniform_initializer(
          -0.01, 0.01, seed=19890212)
      with variable_scope.variable_scope("basic", initializer=initializer):
        m0 = array_ops.zeros([1, 2])
        m1 = array_ops.zeros([1, 2])
        m2 = array_ops.zeros([1, 2])
        m3 = array_ops.zeros([1, 2])
        # NOTE(review): `[...] * 2` puts the same cell object in both layers;
        # confirm the targeted TensorFlow version allows reusing one cell
        # instance across MultiRNNCell layers.
        g, ((out_m0, out_m1),
            (out_m2, out_m3)) = core_rnn_cell_impl.MultiRNNCell(
                [
                    core_rnn_cell_impl.LSTMCell(
                        2, use_peepholes=True, state_is_tuple=True)
                ] * 2,
                state_is_tuple=True)(x, ((m0, m1), (m2, m3)))
        sess.run([variables.global_variables_initializer()])
        basic_res = sess.run([g, out_m0, out_m1, out_m2, out_m3], {
            x.name: x_values,
            m0.name: m0_val,
            m1.name: m1_val,
            m2.name: m2_val,
            m3.name: m3_val
        })

      with variable_scope.variable_scope("block", initializer=initializer):
        # Fresh state feed targets for the block graph; the names m0..m3 and
        # g/out_m* are rebound here, the "basic" results are already stored.
        m0 = array_ops.zeros([1, 2])
        m1 = array_ops.zeros([1, 2])
        m2 = array_ops.zeros([1, 2])
        m3 = array_ops.zeros([1, 2])
        g, ((out_m0, out_m1),
            (out_m2, out_m3)) = core_rnn_cell_impl.MultiRNNCell(
                [lstm_ops.LSTMBlockCell(
                    2, use_peephole=True)] * 2,
                state_is_tuple=True)(x, ((m0, m1), (m2, m3)))
        sess.run([variables.global_variables_initializer()])
        block_res = sess.run([g, out_m0, out_m1, out_m2, out_m3], {
            x.name: x_values,
            m0.name: m0_val,
            m1.name: m1_val,
            m2.name: m2_val,
            m3.name: m3_val
        })

      # Element-wise comparison of output and the four state tensors.
      self.assertEqual(len(basic_res), len(block_res))
      for basic, block in zip(basic_res, block_res):
        self.assertAllClose(basic, block)
Exemple #10
0
 def testUsingCellInDifferentScopeFromFirstCallFails(self):
     """Re-calling one LSTMCell under another variable scope must raise."""
     # This test should go away when this behavior is no longer an
     # error (Approx. May 2017)
     lstm = core_rnn_cell_impl.LSTMCell(3)
     step_input = array_ops.zeros([1, 3])
     # Both halves of the state tuple are the same zeros tensor.
     zero_state = array_ops.zeros([1, 3])
     state = core_rnn_cell_impl.LSTMStateTuple(zero_state, zero_state)
     with variable_scope.variable_scope("scope1"):
         lstm(step_input, state)
     expected_message = r"Attempt to reuse RNNCell"
     with variable_scope.variable_scope("scope2"):
         with self.assertRaisesRegexp(ValueError, expected_message):
             lstm(step_input, state)
Exemple #11
0
    def get_rnncell(cell_type, cell_size, keep_prob, num_layer):
        """Build a (possibly multi-layer) GRU or LSTM cell.

        Args:
          cell_type: "gru" selects `GRUCell`; any other value selects
            `LSTMCell` (no peepholes, forget bias 1.0).
          cell_size: Number of units of every layer.
          keep_prob: When below 1.0, each layer is wrapped in a
            `DropoutWrapper` with this output keep probability.
          num_layer: When greater than 1, the layers are stacked in a
            `MultiRNNCell` with tuple state.

        Returns:
          The single (optionally dropout-wrapped) cell, or a `MultiRNNCell`
          containing `num_layer` such cells.
        """

        def build_layer():
            # One independent cell (plus optional dropout wrapper) per call.
            if cell_type == "gru":
                layer = rnn_cell.GRUCell(cell_size)
            else:
                layer = rnn_cell.LSTMCell(
                    cell_size, use_peepholes=False, forget_bias=1.0)
            if keep_prob < 1.0:
                layer = rnn_cell.DropoutWrapper(
                    layer, output_keep_prob=keep_prob)
            return layer

        if num_layer > 1:
            # Build a distinct cell object for every layer. `[cell] * n`
            # would place the same object in each layer, which newer
            # TensorFlow releases reject or treat as weight sharing.
            return rnn_cell.MultiRNNCell(
                [build_layer() for _ in range(num_layer)],
                state_is_tuple=True)

        return build_layer()
Exemple #12
0
    def testStateTupleDictConversion(self):
        """Test `state_tuple_to_dict` and `dict_to_state_tuple`."""
        cell_sizes = [5, 3, 7]
        # A MultiRNNCell of LSTMCells is both a common choice and an interesting
        # test case, because it has two levels of nesting, with an inner class that
        # is not a plain tuple.
        lstm_layers = [core_rnn_cell_impl.LSTMCell(size) for size in cell_sizes]
        cell = core_rnn_cell_impl.MultiRNNCell(lstm_layers)

        # Two state components per layer, hence every size appears twice.
        component_sizes = [5, 5, 3, 3, 7, 7]
        state_dict = {}
        for index, component_size in enumerate(component_sizes):
            name = dynamic_rnn_estimator._get_state_name(index)
            state_dict[name] = array_ops.expand_dims(
                math_ops.range(component_size), 0)

        def _row(size):
            # A 1 x size array holding 0 .. size-1.
            return np.reshape(np.arange(size), [1, -1])

        expected_state = tuple(
            core_rnn_cell_impl.LSTMStateTuple(_row(size), _row(size))
            for size in cell_sizes)

        actual_state = dynamic_rnn_estimator.dict_to_state_tuple(
            state_dict, cell)
        flattened_state = dynamic_rnn_estimator.state_tuple_to_dict(
            actual_state)

        with self.test_session() as sess:
            fetched = sess.run([state_dict, actual_state, flattened_state])
        state_dict_val, actual_state_val, flattened_state_val = fetched

        def _recursive_assert_equal(x, y):
            self.assertEqual(type(x), type(y))
            if isinstance(x, (list, tuple)):
                self.assertEqual(len(x), len(y))
                for left, right in zip(x, y):
                    _recursive_assert_equal(left, right)
                return
            if isinstance(x, np.ndarray):
                np.testing.assert_array_equal(x, y)
                return
            self.fail('Unexpected type: {}'.format(type(x)))

        # The dict -> tuple -> dict round trip must preserve every component.
        for key, original_value in state_dict_val.items():
            np.testing.assert_array_almost_equal(
                original_value,
                flattened_state_val[key],
                err_msg='Wrong value for state component {}.'.format(key))
        _recursive_assert_equal(expected_state, actual_state_val)
 def testLSTMCellVariables(self):
     """Check the names of the variables a projected LSTMCell creates."""
     with self.test_session():
         num_units = 8
         num_proj = 6
         state_size = num_units + num_proj
         batch_size = 3
         input_size = 2
         with variable_scope.variable_scope(
                 "root", initializer=init_ops.constant_initializer(0.5)):
             x = array_ops.zeros([batch_size, input_size])
             m = array_ops.zeros([batch_size, state_size])
             cell = core_rnn_cell_impl.LSTMCell(num_units=num_units,
                                                num_proj=num_proj,
                                                forget_bias=1.0,
                                                state_is_tuple=False)
             cell(x, m)  # Execute to create variables
         # Named `created_vars` so the local does not shadow a `variables`
         # module name; `assertEqual` replaces the deprecated `assertEquals`
         # alias.
         created_vars = variables_lib.global_variables()
         self.assertEqual(created_vars[0].op.name, "root/lstm_cell/weights")
         self.assertEqual(created_vars[1].op.name, "root/lstm_cell/biases")
         self.assertEqual(created_vars[2].op.name,
                          "root/lstm_cell/projection/weights")
Exemple #14
0
  def testLSTMBasicToBlockPeeping(self):
    """Compare peephole `LSTMCell` with `block_lstm` and `LSTMBlockFusedCell`.

    The same random input sequence is run through three implementations,
    each under its own variable scope ("basic", "block", "fused") with the
    same seeded initializer. Outputs and input gradients must be close;
    weight gradients are compared with a looser tolerance.
    """
    with self.test_session(use_gpu=self._use_gpu) as sess:
      batch_size = 2
      input_size = 3
      cell_size = 4
      sequence_length = 5

      # One random float32 tensor per time step, shared by all three graphs.
      inputs = []
      for _ in range(sequence_length):
        inp = ops.convert_to_tensor(
            np.random.randn(batch_size, input_size), dtype=dtypes.float32)
        inputs.append(inp)

      initializer = init_ops.random_uniform_initializer(
          -0.01, 0.01, seed=19890212)
      with variable_scope.variable_scope("basic", initializer=initializer):
        cell = core_rnn_cell_impl.LSTMCell(
            cell_size, use_peepholes=True, state_is_tuple=True)
        outputs, state = core_rnn.static_rnn(cell, inputs, dtype=dtypes.float32)

        sess.run([variables.global_variables_initializer()])
        # Only the first element of the final state tuple is kept.
        basic_outputs, basic_state = sess.run([outputs, state[0]])
        basic_grads = sess.run(gradients_impl.gradients(outputs, inputs))
        # At this point only the "basic" variables exist in the graph.
        basic_wgrads = sess.run(
            gradients_impl.gradients(outputs, variables.trainable_variables()))

      with variable_scope.variable_scope("block", initializer=initializer):
        # Weights for the raw `block_lstm` op are created by hand.
        w = variable_scope.get_variable(
            "w",
            shape=[input_size + cell_size, cell_size * 4],
            dtype=dtypes.float32)
        b = variable_scope.get_variable(
            "b",
            shape=[cell_size * 4],
            dtype=dtypes.float32,
            initializer=init_ops.zeros_initializer())

        # Peephole weight vectors.
        wci = variable_scope.get_variable(
            "wci", shape=[cell_size], dtype=dtypes.float32)
        wcf = variable_scope.get_variable(
            "wcf", shape=[cell_size], dtype=dtypes.float32)
        wco = variable_scope.get_variable(
            "wco", shape=[cell_size], dtype=dtypes.float32)

        # Only the last of the seven values returned by `block_lstm` is used.
        _, _, _, _, _, _, outputs = block_lstm(
            ops.convert_to_tensor(
                sequence_length, dtype=dtypes.int64),
            inputs,
            w,
            b,
            wci=wci,
            wcf=wcf,
            wco=wco,
            cell_clip=0,
            use_peephole=True)

        sess.run([variables.global_variables_initializer()])
        block_outputs = sess.run(outputs)
        block_grads = sess.run(gradients_impl.gradients(outputs, inputs))
        block_wgrads = sess.run(
            gradients_impl.gradients(outputs, [w, b, wci, wcf, wco]))

      self.assertAllClose(basic_outputs, block_outputs)
      self.assertAllClose(basic_grads, block_grads)
      # Weight gradients are paired positionally and use a looser tolerance.
      for basic, block in zip(basic_wgrads, block_wgrads):
        self.assertAllClose(basic, block, rtol=1e-2, atol=1e-2)

      with variable_scope.variable_scope("fused", initializer=initializer):
        cell = lstm_ops.LSTMBlockFusedCell(
            cell_size, cell_clip=0, use_peephole=True)
        outputs, state = cell(inputs, dtype=dtypes.float32)

        sess.run([variables.global_variables_initializer()])
        fused_outputs, fused_state = sess.run([outputs, state[0]])
        fused_grads = sess.run(gradients_impl.gradients(outputs, inputs))
        # Restrict to this scope's variables; the graph now also holds the
        # "basic" and "block" variables.
        fused_vars = [
            v for v in variables.trainable_variables()
            if v.name.startswith("fused/")
        ]
        fused_wgrads = sess.run(gradients_impl.gradients(outputs, fused_vars))

      self.assertAllClose(basic_outputs, fused_outputs)
      self.assertAllClose(basic_state, fused_state)
      self.assertAllClose(basic_grads, fused_grads)
      for basic, fused in zip(basic_wgrads, fused_wgrads):
        self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)
# Script fragment: builds an embedding seq2seq graph with an LSTM cell, then
# defines loss, optimizer and accuracy ops on its outputs.
batch_size = 2
display_step = 10

# Network Parameters
n_input = 8  # data input
n_hidden = 5  # hidden layer num of features

# tf Graph input
# NOTE(review): `word_lenght` is not defined in this fragment; presumably it
# is set earlier in the script -- confirm.
x = tf.placeholder("int32", [None, word_lenght, n_input])
y = tf.placeholder("int32", [None, n_input])
with tf.variable_scope("train_test", reuse=None):
    # Split the placeholder along axis 1 into `word_lenght` tensors; `x` is
    # rebound from the placeholder to that list.
    x = tf.unstack(x, word_lenght, 1)
    outputs, states = embedding_rnn_seq2seq(
        encoder_inputs=x,
        decoder_inputs=[0] * 20,
        cell=core_rnn_cell_impl.LSTMCell(n_hidden),
        num_encoder_symbols=256,
        num_decoder_symbols=256,
        embedding_size=100,
        output_projection=None,
        feed_previous=False)

# Define loss and optimizer
# NOTE(review): `learning_rate` is not defined in this fragment; presumably it
# is set earlier in the script -- confirm.
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=outputs, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Evaluate model
correct_pred = tf.equal(tf.argmax(outputs, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
  def __init__(self,
               source_vocab_size,
               target_vocab_size,
               buckets,
               size,
               num_layers,
               max_gradient_norm,
               batch_size,
               learning_rate,
               learning_rate_decay_factor,
               use_lstm=False,
               use_rnn=False,
               use_mtgru=False,
               use_mtlstm=False,
               num_samples=512,
               forward_only=False,
               dtype=tf.float32):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model. Only consulted when neither
        use_mtgru nor use_mtlstm is set; those modes build a fixed stack.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      use_rnn: if true, we use basic RNN cells; takes precedence over use_lstm.
      use_mtgru: if true, a fixed stack of three MTGRU cells wrapped in
        MultiMTRNNCell is used; takes precedence over every other cell flag.
      use_mtlstm: if true (and use_mtgru is false), a fixed stack of two LSTM
        cells wrapped in MultiMTRNNCell is used.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the model.
      dtype: the data type to use to store internal variables.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(
        float(learning_rate), trainable=False, dtype=dtype)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if 0 < num_samples < self.target_vocab_size:
      w_t = tf.get_variable("proj_w", [self.target_vocab_size, size], dtype=dtype)
      w = tf.transpose(w_t)
      b = tf.get_variable("proj_b", [self.target_vocab_size], dtype=dtype)
      output_projection = (w, b)

      def sampled_loss(labels, inputs):
        labels = tf.reshape(labels, [-1, 1])
        # We need to compute the sampled_softmax_loss using 32bit floats to
        # avoid numerical instabilities.
        local_w_t = tf.cast(w_t, tf.float32)
        local_b = tf.cast(b, tf.float32)
        local_inputs = tf.cast(inputs, tf.float32)
        return tf.cast(
            tf.nn.sampled_softmax_loss(
                weights=local_w_t,
                biases=local_b,
                labels=labels,
                inputs=local_inputs,
                num_sampled=num_samples,
                num_classes=self.target_vocab_size),
            dtype)
      softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    if use_mtgru:
      cell1 = core_rnn_cell_impl.MTGRUCell(size)
      cell2 = core_rnn_cell_impl.MTGRUCell(size, tau=0.999)
      cell3 = core_rnn_cell_impl.MTGRUCell(size, tau=0.998)
      cell = core_rnn_cell_impl.MultiMTRNNCell([cell1, cell2, cell3])
    elif use_mtlstm:
      # NOTE(review): `tau` is passed to LSTMCell here; confirm that this
      # project's LSTMCell accepts that keyword.
      cell1 = core_rnn_cell_impl.LSTMCell(size)
      cell2 = core_rnn_cell_impl.LSTMCell(size, tau=0.999)
      cell = core_rnn_cell_impl.MultiMTRNNCell([cell1, cell2])
    else:
      # Later flags win: use_rnn overrides use_lstm, which overrides GRU.
      def single_cell():
        return tf.contrib.rnn.GRUCell(size)
      if use_lstm:
        def single_cell():
          return tf.contrib.rnn.BasicLSTMCell(size)
      if use_rnn:
        def single_cell():
          return tf.contrib.rnn.BasicRNNCell(size)
      if num_layers > 1:
        cell = tf.contrib.rnn.MultiRNNCell(
            [single_cell() for _ in range(num_layers)])
      else:
        cell = single_cell()

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
      return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
          encoder_inputs,
          decoder_inputs,
          cell,
          num_encoder_symbols=source_vocab_size,
          num_decoder_symbols=target_vocab_size,
          embedding_size=size,
          output_projection=output_projection,
          feed_previous=do_decode,
          dtype=dtype)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
      self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                name="encoder{0}".format(i)))
    for i in xrange(buckets[-1][1] + 1):
      self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                name="decoder{0}".format(i)))
      self.target_weights.append(tf.placeholder(dtype, shape=[None],
                                                name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = self.decoder_inputs[1:]

    # Outputs and losses. When decoding (forward_only) the previous output is
    # fed back in; when training the provided decoder inputs are used.
    do_decode = bool(forward_only)
    self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
        self.encoder_inputs, self.decoder_inputs, targets,
        self.target_weights, buckets,
        lambda x, y: seq2seq_f(x, y, do_decode),
        softmax_loss_function=softmax_loss_function)
    # If we use output projection, we need to project outputs for decoding.
    if forward_only and output_projection is not None:
      # `bucket_id` (not `b`) so the projection bias `b` captured by
      # `sampled_loss` is never rebound to an integer.
      for bucket_id in xrange(len(buckets)):
        self.outputs[bucket_id] = [
            tf.matmul(output, output_projection[0]) + output_projection[1]
            for output in self.outputs[bucket_id]
        ]

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
      self.gradient_norms = []
      self.updates = []
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
      for bucket_id in xrange(len(buckets)):
        gradients = tf.gradients(self.losses[bucket_id], params)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                         max_gradient_norm)
        self.gradient_norms.append(norm)
        self.updates.append(opt.apply_gradients(
            zip(clipped_gradients, params), global_step=self.global_step))

    self.saver = tf.train.Saver(tf.global_variables())