Example #1
 def _CreateModel(self,
                  rnn_mode,
                  num_layers,
                  num_units,
                  input_size,
                  input_mode="linear_input",
                  dropout=0.):
     if rnn_mode == cudnn_rnn_ops.CUDNN_LSTM:
         model = cudnn_rnn_ops.CudnnLSTM(num_layers,
                                         num_units,
                                         input_size,
                                         dropout=dropout)
     elif rnn_mode == cudnn_rnn_ops.CUDNN_GRU:
         model = cudnn_rnn_ops.CudnnGRU(num_layers,
                                        num_units,
                                        input_size,
                                        dropout=dropout)
     elif rnn_mode == cudnn_rnn_ops.CUDNN_RNN_TANH:
         model = cudnn_rnn_ops.CudnnRNNTanh(num_layers,
                                            num_units,
                                            input_size,
                                            dropout=dropout)
     elif rnn_mode == cudnn_rnn_ops.CUDNN_RNN_RELU:
         model = cudnn_rnn_ops.CudnnRNNRelu(num_layers,
                                            num_units,
                                            input_size,
                                            dropout=dropout)
     else:
         raise ValueError("Invalid rnn_mode: %s" % rnn_mode)
     return model
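A minimal end-to-end sketch of how a model returned by this factory is typically driven (assuming a GPU build of TF 1.x with `contrib.cudnn_rnn` available; shapes and sizes are illustrative):

import tensorflow as tf
from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops

seq_len, batch, input_size, num_units = 20, 32, 128, 128
model = cudnn_rnn_ops.CudnnLSTM(1, num_units, input_size)

# The opaque cuDNN buffer is sized by a Tensor, hence validate_shape=False.
params = tf.Variable(
    tf.random_uniform([model.params_size()], -0.1, 0.1),
    validate_shape=False)
output, output_h, output_c = model(
    input_data=tf.zeros([seq_len, batch, input_size]),  # time-major input
    input_h=tf.zeros([1, batch, num_units]),
    input_c=tf.zeros([1, batch, num_units]),
    params=params,
    is_training=False)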
Example #2
    def build(self, input_shape):
        super(CuDNNLSTM, self).build(input_shape)
        if isinstance(input_shape, list):
            input_shape = input_shape[0]
        input_dim = input_shape[-1]

        from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
        self._cudnn_lstm = cudnn_rnn_ops.CudnnLSTM(
            num_layers=1,
            num_units=self.units,
            input_size=input_dim,
            input_mode='linear_input')

        self.kernel = self.add_weight(shape=(input_dim, self.units * 4),
                                      name='kernel',
                                      initializer=self.kernel_initializer,
                                      regularizer=self.kernel_regularizer,
                                      constraint=self.kernel_constraint)
        self.recurrent_kernel = self.add_weight(
            shape=(self.units, self.units * 4),
            name='recurrent_kernel',
            initializer=self.recurrent_initializer,
            regularizer=self.recurrent_regularizer,
            constraint=self.recurrent_constraint)

        if self.unit_forget_bias:
            def bias_initializer(shape, *args, **kwargs):
                return K.concatenate([
                    self.bias_initializer((self.units * 5,), *args, **kwargs),
                    initializers.Ones()((self.units,), *args, **kwargs),
                    self.bias_initializer((self.units * 2,), *args, **kwargs),
                ])
        else:
            bias_initializer = self.bias_initializer
        self.bias = self.add_weight(shape=(self.units * 8,),
                                    name='bias',
                                    initializer=bias_initializer,
                                    regularizer=self.bias_regularizer,
                                    constraint=self.bias_constraint)

        self.kernel_i = self.kernel[:, :self.units]
        self.kernel_f = self.kernel[:, self.units: self.units * 2]
        self.kernel_c = self.kernel[:, self.units * 2: self.units * 3]
        self.kernel_o = self.kernel[:, self.units * 3:]

        self.recurrent_kernel_i = self.recurrent_kernel[:, :self.units]
        self.recurrent_kernel_f = self.recurrent_kernel[:, self.units: self.units * 2]
        self.recurrent_kernel_c = self.recurrent_kernel[:, self.units * 2: self.units * 3]
        self.recurrent_kernel_o = self.recurrent_kernel[:, self.units * 3:]

        self.bias_i_i = self.bias[:self.units]
        self.bias_f_i = self.bias[self.units: self.units * 2]
        self.bias_c_i = self.bias[self.units * 2: self.units * 3]
        self.bias_o_i = self.bias[self.units * 3: self.units * 4]
        self.bias_i = self.bias[self.units * 4: self.units * 5]
        self.bias_f = self.bias[self.units * 5: self.units * 6]
        self.bias_c = self.bias[self.units * 6: self.units * 7]
        self.bias_o = self.bias[self.units * 7:]

        self.built = True
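For completeness, a sketch of how these Keras-side views could be packed into the opaque cuDNN buffer with `canonical_to_params`. It assumes cuDNN's single-layer canonical layout of eight weight matrices stored `[out, in]` (hence the transposes) and eight biases, both halves ordered i, f, c, o, which is consistent with the forget-bias indices (1 and 5) used in the other examples on this page; the helper name is illustrative:

    def _pack_params(self):
        # Assumes `import tensorflow as tf`. cuDNN stores weights transposed
        # relative to Keras kernels, and keeps separate input-side biases
        # (the "_i" slices above) and recurrent-side biases.
        weights = [tf.transpose(w) for w in
                   (self.kernel_i, self.kernel_f, self.kernel_c, self.kernel_o,
                    self.recurrent_kernel_i, self.recurrent_kernel_f,
                    self.recurrent_kernel_c, self.recurrent_kernel_o)]
        biases = [self.bias_i_i, self.bias_f_i, self.bias_c_i, self.bias_o_i,
                  self.bias_i, self.bias_f, self.bias_c, self.bias_o]
        return self._cudnn_lstm.canonical_to_params(weights, biases)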
Example #3
  def benchmarkCudnnLSTMTraining(self):
    test_configs = self._GetTestConfig()
    for config_name, config in test_configs.items():
      num_layers = config["num_layers"]
      num_units = config["num_units"]
      batch_size = config["batch_size"]
      seq_length = config["seq_length"]

      with ops.Graph().as_default(), ops.device("/gpu:0"):
        model = cudnn_rnn_ops.CudnnLSTM(num_layers, num_units, num_units)
        params_size_t = model.params_size()
        input_data = variables.Variable(
            array_ops.ones([seq_length, batch_size, num_units]))
        input_h = variables.Variable(
            array_ops.ones([num_layers, batch_size, num_units]))
        input_c = variables.Variable(
            array_ops.ones([num_layers, batch_size, num_units]))
        params = variables.Variable(
            array_ops.ones([params_size_t]), validate_shape=False)
        output, output_h, output_c = model(
            is_training=True,
            input_data=input_data,
            input_h=input_h,
            input_c=input_c,
            params=params)
        all_grads = gradients_impl.gradients(
            [output, output_h, output_c],
            [params, input_data, input_h, input_c])
        training_op = control_flow_ops.group(*all_grads)
        self._BenchmarkOp(training_op, "cudnn_lstm %s %s" %
                          (config_name, self._GetConfigDesc(config)))
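`_BenchmarkOp` and `_GetConfigDesc` are helpers on the benchmark class that are not shown here. A plausible sketch of the former (hypothetical, not from the source), which warms up the op and reports mean wall-clock time per iteration:

import time

from tensorflow.python.client import session as session_lib
from tensorflow.python.ops import variables

  def _BenchmarkOp(self, op, desc, burn_iters=3, num_iters=10):
    # Hypothetical helper: run `op` a few times to warm up (cuDNN
    # autotuning, memory allocation), then time the steady state.
    with session_lib.Session() as sess:
      sess.run(variables.global_variables_initializer())
      for _ in range(burn_iters):
        sess.run(op)
      start = time.time()
      for _ in range(num_iters):
        sess.run(op)
      print("%s: %.4f sec/iter" % (desc, (time.time() - start) / num_iters))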
Example #4
 def _CreateModel(self,
                  rnn_mode,
                  num_layers,
                  num_units,
                  input_size,
                  input_mode="linear_input",
                  dropout=0.):
     if rnn_mode == "lstm":
         model = cudnn_rnn_ops.CudnnLSTM(num_layers,
                                         num_units,
                                         input_size,
                                         dropout=dropout)
     elif rnn_mode == "gru":
         model = cudnn_rnn_ops.CudnnGRU(num_layers,
                                        num_units,
                                        input_size,
                                        dropout=dropout)
     elif rnn_mode == "rnn_tanh":
         model = cudnn_rnn_ops.CudnnRNNTanh(num_layers,
                                            num_units,
                                            input_size,
                                            dropout=dropout)
     elif rnn_mode == "rnn_relu":
         model = cudnn_rnn_ops.CudnnRNNRelu(num_layers,
                                            num_units,
                                            input_size,
                                            dropout=dropout)
     else:
         raise ValueError("Invalid rnn_mode: %s" % rnn_mode)
     return model
Example #5
    def _apply_transposed(self, is_train, x, initial_states=None):
        w_init = TruncatedNormal(stddev=0.05)
        x_size = x.shape.as_list()[-1]
        if x_size is None:
            raise ValueError("Last dimension must be defined (have shape %s)" %
                             str(x.shape))

        cell = cudnn_rnn_ops.CudnnLSTM(1,
                                       self.n_out,
                                       x_size,
                                       input_mode="linear_input")

        # We need to know the mapping of weights/biases -> CudnnLSTM parameters,
        # so build a `cudnn_layers.CudnnLSTM` and read its canonical shape fields
        c = cudnn_layers.CudnnLSTM(1, self.n_out)
        c._input_size = x.shape.as_list()[-1]
        w_shapes = c.canonical_weight_shapes
        b_shapes = c.canonical_bias_shapes
        weights = [w_init(s, tf.float32) for s in w_shapes]
        biases = [tf.zeros(s, tf.float32) for s in b_shapes]
        biases[1] = tf.constant(self.lstm_bias / 2.0, tf.float32, b_shapes[1])
        biases[5] = tf.constant(self.lstm_bias / 2.0, tf.float32, b_shapes[5])

        opaque_params_t = cell.canonical_to_params(weights, biases)
        parameters = tf.get_variable("opaque_kernel",
                                     initializer=opaque_params_t,
                                     validate_shape=False)

        p = 1.0 - self.dropout

        if is_train and self.dropout > 0:
            mult_bias = [tf.ones_like(x) for x in biases]
            mult_w = [tf.ones_like(x) for x in weights]

            bias_mask = tf.floor(tf.random_uniform(
                (self.n_out, ), p, 1 + p)) / p

            for j in range(4, 8):
                mult_w[j] *= tf.expand_dims(bias_mask, 0)

            mult_mask = cell.canonical_to_params(mult_w, mult_bias)
            parameters = parameters * mult_mask

        if initial_states is None:
            # Default to zero initial states: [num_layers=1, batch, n_out].
            batch_size = tf.shape(x)[1]
            initial_states = (tf.zeros((1, batch_size, self.n_out), tf.float32),
                              tf.zeros((1, batch_size, self.n_out), tf.float32))
        initial_state_h, initial_state_c = initial_states
        out = cell(x, initial_state_h, initial_state_c, parameters, True)[0]

        return out
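The mask built above is the standard inverted-dropout identity: `tf.floor(tf.random_uniform(shape, p, 1 + p))` is 1 with probability `p` and 0 otherwise, and dividing by `p` keeps the expected magnitude of the masked weights unchanged. The same trick in isolation:

import tensorflow as tf

def inverted_dropout_mask(shape, keep_prob):
    # Uniform in [keep_prob, 1 + keep_prob): floor() yields 1 with
    # probability keep_prob, else 0. Dividing by keep_prob makes
    # E[mask] == 1, so no rescaling is needed at inference time.
    return tf.floor(tf.random_uniform(shape, keep_prob, 1.0 + keep_prob)) / keep_prob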
Example #6
 def _build_rnn_graph_cudnn(self, inputs, config, is_training):
     """Build the inference graph using CUDNN cell."""
     inputs = tf.transpose(inputs, [1, 0, 2])
     from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
     self._cell = cudnn_rnn_ops.CudnnLSTM(
         num_layers=config.num_layers,
         num_units=config.hidden_size,
         input_size=config.hidden_size,
         dropout=1 - config.keep_prob if is_training else 0)
     params_size_t = self._cell.params_size()
     self._rnn_params = tf.get_variable(
         "lstm_params",
         initializer=tf.random_uniform([params_size_t], -config.init_scale,
                                       config.init_scale),
         validate_shape=False)
     c = tf.zeros([config.num_layers, self.batch_size, config.hidden_size],
                  tf.float32)
     h = tf.zeros([config.num_layers, self.batch_size, config.hidden_size],
                  tf.float32)
     self._initial_state = (tf.contrib.rnn.LSTMStateTuple(h=h, c=c), )
     outputs, h, c = self._cell(inputs, h, c, self._rnn_params, is_training)
     outputs = tf.transpose(outputs, [1, 0, 2])
     outputs = tf.reshape(outputs, [-1, config.hidden_size])
     return outputs, (tf.contrib.rnn.LSTMStateTuple(h=h, c=c), )
Example #7
    def _apply_transposed(self, is_train, x):
        w_init = get_keras_initialization(self.w_init)
        r_init = None if self.recurrent_init is None else get_keras_initialization(self.recurrent_init)
        x_size = x.shape.as_list()[-1]
        if x_size is None:
            raise ValueError("Last dimension must be defined (have shape %s)" % str(x.shape))

        if self._kind == "GRU":
            cell = cudnn_rnn_ops.CudnnGRU(self.n_layers, self.n_units, x_size, input_mode="linear_input")
        elif self._kind == "LSTM":
            cell = cudnn_rnn_ops.CudnnLSTM(self.n_layers, self.n_units, x_size, input_mode="linear_input")
        else:
            raise ValueError()

        n_params = cell.params_size().eval()
        weights, biases = cell.params_to_canonical(tf.zeros([n_params]))

        def init(shape, dtype=None, partition_info=None):
            # This is a bit hacky, since the API for these models is awkward. We have
            # to compute the shapes of the weights/biases by calling
            # `cell.params_to_canonical` with an unused tensor, use `.eval()` to get
            # the concrete shapes, and only then apply the user-requested initializers
            if self._kind == "LSTM":
                is_recurrent = [False, False, False, False, True, True, True, True]
                is_forget_bias = [False, True, False, False, False, True, False, False]
            else:
                is_recurrent = [False, False, False, True, True, True]
                is_forget_bias = [False] * 6

            init_biases = [tf.constant(self.lstm_bias/2.0, tf.float32, (self.n_units,)) if z else tf.zeros(self.n_units)
                           for z in is_forget_bias]
            init_weights = []

            for w, r in zip(weights, is_recurrent):
                if r and r_init is not None:
                    init_weights.append(tf.reshape(r_init((self.n_units, self.n_units), w.dtype), tf.shape(w)))
                else:
                    init_weights.append(w_init(tf.shape(w).eval(), w.dtype))
            out = cell.canonical_to_params(init_weights, init_biases)
            out.set_shape((n_params, ))

            return out

        parameters = tf.get_variable(
            "gru_parameters",
            n_params,
            tf.float32,
            initializer=init
        )

        if self.keep_recurrent < 1:
            # Not thoroughly tested: figure out which indices in `parameters` are
            # recurrent weights and drop them; this implements drop-connect for the
            # recurrent weights
            is_recurrent = weights[:len(weights) // 2] + [tf.ones_like(w) for w in weights[len(weights) // 2:]]
            recurrent_mask = cell.canonical_to_params(is_recurrent, biases)  # ones at recurrent weights
            recurrent_mask = 1 - recurrent_mask * (1 - self.keep_recurrent)  # ones are non-recurrent param, keep_prob elsewhere
            parameters = tf.cond(is_train,
                                 lambda: tf.floor(tf.random_uniform((n_params, )) + recurrent_mask) * parameters,
                                 lambda: parameters)

        if self._kind == "LSTM":
            if self.learn_initial_states:
                raise NotImplementedError()
            else:
                initial_state_h = tf.zeros((self.n_layers, tf.shape(x)[1], self.n_units), tf.float32)
                initial_state_c = tf.zeros((self.n_layers, tf.shape(x)[1], self.n_units), tf.float32)
            out = cell(x, initial_state_h, initial_state_c, parameters, True)
        else:
            if self.learn_initial_states:
                initial_state = tf.get_variable("initial_state", self.n_units,
                                                tf.float32, tf.zeros_initializer())
                initial_state = tf.tile(tf.expand_dims(tf.expand_dims(initial_state, 0), 0),
                                        [self.n_layers, tf.shape(x)[1], 1])
            else:
                initial_state = tf.zeros((self.n_layers, tf.shape(x)[1], self.n_units), tf.float32)
            out = cell(x, initial_state, parameters, True)
        return out
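One caveat with this initializer: `params_size().eval()` and `tf.shape(w).eval()` run at graph-construction time, so `_apply_transposed` must be called under an active default session. A usage sketch (`layer`, `is_train`, and `x` are illustrative names, not from the source):

import tensorflow as tf

# `.eval()` inside `_apply_transposed` needs a default session while the
# graph (and the `gru_parameters` variable) is being constructed.
sess = tf.Session()
with sess.as_default():
    out = layer._apply_transposed(is_train, x)  # x: time-major [T, B, D]
sess.run(tf.global_variables_initializer())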
Example #8
    def __init__(self, is_training, config, input_):
        self._input = input_

        batch_size = input_.batch_size
        num_steps = input_.num_steps
        size = config.hidden_size
        vocab_size = config.vocab_size
        num_layers = config.num_layers

        if is_training and config.keep_prob < 1:
            dropout_rate = 1 - config.keep_prob
        else:
            dropout_rate = 0.0

        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, size],
                                        dtype=data_type())
            inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

        from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
        from tensorflow.python.framework import constant_op
        from tensorflow.python.ops import random_ops

        model = cudnn_rnn_ops.CudnnLSTM(num_layers,
                                        size,
                                        size,
                                        dropout=dropout_rate)
        params_size_t = model.params_size()

        init_state = tf.zeros([num_layers, batch_size, size],
                              dtype=data_type(),
                              name="Kit_Init_State")

        params = tf.get_variable(
            'Kit_Parameters',
            initializer=tf.random_uniform([params_size_t], -config.init_scale,
                                          config.init_scale),
            validate_shape=False)

        inputs = tf.transpose(inputs, perm=[1, 0, 2])
        output, output_h, output_c = model(is_training=is_training,
                                           input_data=inputs,
                                           input_h=init_state,
                                           input_c=init_state,
                                           params=params)

        output = tf.transpose(output, [1, 0, 2])
        output = tf.reshape(output, [-1, size])
        softmax_w = tf.get_variable("softmax_w", [size, vocab_size],
                                    dtype=data_type())
        softmax_b = tf.get_variable("softmax_b", [vocab_size],
                                    dtype=data_type())
        logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)

        # Reshape logits to be 3-D tensor for sequence loss
        logits = tf.reshape(logits, [batch_size, num_steps, vocab_size])

        # use the contrib sequence loss and average over the batches
        loss = tf.contrib.seq2seq.sequence_loss(logits,
                                                input_.targets,
                                                tf.ones(
                                                    [batch_size, num_steps],
                                                    dtype=data_type()),
                                                average_across_timesteps=False,
                                                average_across_batch=True)

        # update the cost variables
        self._cost = cost = tf.reduce_sum(loss)
        self._final_state = (tf.contrib.rnn.LSTMStateTuple(h=output_h,
                                                           c=output_c))

        if not is_training:
            return

        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                          config.max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        self._train_op = optimizer.apply_gradients(
            zip(grads, tvars),
            global_step=tf.contrib.framework.get_or_create_global_step())

        self._new_lr = tf.placeholder(tf.float32,
                                      shape=[],
                                      name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)
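The `_new_lr` placeholder and `_lr_update` op are normally driven by a small method on the model, following the standard PTB pattern (a sketch; the method name is illustrative):

    def assign_lr(self, session, lr_value):
        # Push a new learning rate into the graph-side variable.
        session.run(self._lr_update, feed_dict={self._new_lr: lr_value})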
Example #9
def cudnn_rnn_wrapper(input_data,
                      rnn_mode,
                      num_layers,
                      num_units,
                      input_size,
                      variable_name,
                      direction="unidirectional",
                      time_major=False,
                      dropout=0.0):

    if rnn_mode == "lstm":
        model = cudnn_rnn_ops.CudnnLSTM(num_layers,
                                        num_units,
                                        input_size,
                                        direction=direction,
                                        dropout=dropout)
    elif rnn_mode == "gru":
        model = cudnn_rnn_ops.CudnnGRU(num_layers,
                                       num_units,
                                       input_size,
                                       direction=direction,
                                       dropout=dropout)
    else:
        raise ValueError("Invalid rnn_mode: %s" % rnn_mode)

    # Compute the total size of RNN params (Tensor)
    params_size_ts = model.params_size()
    params = tf.Variable(tf.random_uniform([params_size_ts],
                                           minval=-0.1,
                                           maxval=0.1),
                         validate_shape=False,
                         name=variable_name)

    if not time_major:
        batch_size_ts = tf.shape(input_data)[0]  # batch size Tensor
        input_data = tf.transpose(input_data, [1, 0, 2])
    else:
        batch_size_ts = tf.shape(input_data)[1]  # batch size Tensor
    # NB: from here on, input_data is time-major: [num_timestep, batch_size, d]

    if direction == "unidirectional":
        dir_count = 1
    elif direction == "bidirectional":
        dir_count = 2
    else:
        raise ValueError("Invalid direction: %s" % direction)

    init_h = tf.zeros(
        tf.stack([num_layers * dir_count, batch_size_ts, num_units]))
    has_input_c = (rnn_mode == "lstm")

    # Call the CudnnRNN
    if has_input_c:
        init_c = tf.zeros(
            tf.stack([num_layers * dir_count, batch_size_ts, num_units]))
        output, output_h, output_c = model(input_data=input_data,
                                           input_h=init_h,
                                           input_c=init_c,
                                           params=params)
    else:
        output, output_h = model(input_data=input_data,
                                 input_h=init_h,
                                 params=params)

    # output:     [num_timestep, batch_size, num_units * dir_count]
    # output_h/c: [num_layers * dir_count, batch_size, num_units]
    return output, output_h
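A minimal usage sketch for this wrapper, assuming batch-major inputs (the default `time_major=False`); shapes are illustrative:

import tensorflow as tf

inputs = tf.placeholder(tf.float32, [None, 50, 300])  # [batch, time, d]
output, output_h = cudnn_rnn_wrapper(inputs,
                                     rnn_mode="lstm",
                                     num_layers=2,
                                     num_units=256,
                                     input_size=300,
                                     variable_name="rnn_params")
# `output` comes back time-major; transpose if batch-major is needed.
output = tf.transpose(output, [1, 0, 2])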
Example #10
    def __init__(self,
                 is_training,
                 batch_size,
                 num_unrollings,
                 vocab_size,
                 hidden_size,
                 max_grad_norm,
                 embedding_size,
                 num_layers,
                 learning_rate,
                 model,
                 dropout=0.0,
                 input_dropout=0.0,
                 use_batch=True):
        self.batch_size = batch_size
        self.num_unrollings = num_unrollings
        if not use_batch:
            self.batch_size = 1
            self.num_unrollings = 1
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.max_grad_norm = max_grad_norm
        self.num_layers = num_layers
        self.embedding_size = embedding_size
        self.model = model
        self.dropout = dropout
        self.input_dropout = input_dropout
        if embedding_size <= 0:
            self.input_size = vocab_size
            # Don't do dropout on one hot representation.
            self.input_dropout = 0.0
        else:
            self.input_size = embedding_size
        self.model_size = (
            embedding_size * vocab_size +  # embedding parameters
            # lstm parameters
            4 * hidden_size * (hidden_size + self.input_size + 1) +
            # softmax parameters
            vocab_size * (hidden_size + 1) +
            # multilayer lstm parameters for extra layers.
            (num_layers - 1) * 4 * hidden_size *
            (hidden_size + hidden_size + 1))
        # self.decay_rate = decay_rate

        # Placeholder to feed in input and targets/labels data.
        self.input_data = tf.placeholder(
            tf.int64, [self.batch_size, self.num_unrollings], name='inputs')
        self.targets = tf.placeholder(tf.int64,
                                      [self.batch_size, self.num_unrollings],
                                      name='targets')

        #################################################
        # NEED TO REPLACE ALL CELL CODE

        # if self.model == 'rnn':
        #   cell_fn = tf.contrib.rnn.BasicRNNCell
        # elif self.model == 'lstm':
        #   cell_fn = tf.contrib.rnn.BasicLSTMCell
        # elif self.model == 'gru':
        #   cell_fn = tf.contrib.rnn.GRUCell

        # # params = {'input_size': self.input_size}
        # params = {}
        # if self.model == 'lstm':
        #   # add bias to forget gate in lstm.
        #   params['forget_bias'] = 0.0
        #   params['state_is_tuple'] = True
        # # Create multilayer cell.
        # cell = cell_fn(
        #     self.hidden_size, reuse=tf.get_variable_scope().reuse,
        #     **params)

        # cells = [cell]
        # # params['input_size'] = self.hidden_size
        # # more explicit way to create cells for MultiRNNCell than
        # # [higher_layer_cell] * (self.num_layers - 1)
        # for i in range(self.num_layers-1):
        #   higher_layer_cell = cell_fn(
        #       self.hidden_size, reuse=tf.get_variable_scope().reuse,
        #       **params)
        #   cells.append(higher_layer_cell)

        # if is_training and self.dropout > 0:
        #   cells = [tf.contrib.rnn.DropoutWrapper(
        #     cell,
        #     output_keep_prob=1.0-self.dropout)
        #            for cell in cells]

        # multi_cell = tf.contrib.rnn.MultiRNNCell(cells)

        # with tf.name_scope('initial_state'):
        #   # zero_state is used to compute the intial state for cell.
        #   self.zero_state = multi_cell.zero_state(self.batch_size, tf.float32)
        #   # Placeholder to feed in initial state.
        #   # self.initial_state = tf.placeholder(
        #   #   tf.float32,
        #   #   [self.batch_size, multi_cell.state_size],
        #   #   'initial_state')

        # self.initial_state = create_tuple_placeholders_with_default(
        #   multi_cell.zero_state(batch_size, tf.float32),
        #   extra_dims=(None,),
        #   shape=multi_cell.state_size)

        ######## MIGHT NEED THIS STUFF ##################

        # Embeddings layers.
        with tf.name_scope('embedding_layer'):
            if embedding_size > 0:
                self.embedding = tf.get_variable(
                    'embedding', [self.vocab_size, self.embedding_size])
            else:
                self.embedding = tf.constant(np.eye(self.vocab_size),
                                             dtype=tf.float32)

            inputs = tf.nn.embedding_lookup(self.embedding, self.input_data)
            if is_training and self.input_dropout > 0:
                inputs = tf.nn.dropout(inputs, 1 - self.input_dropout)

        with tf.name_scope('slice_inputs'):
            # Slice inputs into a list of shape [batch_size, 1] data columns.
            sliced_inputs = [
                tf.squeeze(input_, [1])
                for input_ in tf.split(axis=1,
                                       num_or_size_splits=self.num_unrollings,
                                       value=inputs)
            ]

        # Copy cell to do unrolling and collect outputs.
        # outputs, final_state = tf.contrib.rnn.static_rnn(
        #   multi_cell, sliced_inputs,
        #   initial_state=self.initial_state)

        ########################

        # Insert MIOPEN
        # The RNN input size is self.input_size (the embedding size, or the
        # vocab size when one-hot inputs are used), not self.embedding_size.
        if self.model == 'lstm':
            model = cudnn_rnn_ops.CudnnLSTM(self.num_layers,
                                            self.hidden_size,
                                            self.input_size,
                                            dropout=self.dropout)
        elif self.model == 'gru':
            model = cudnn_rnn_ops.CudnnGRU(self.num_layers,
                                           self.hidden_size,
                                           self.input_size,
                                           dropout=self.dropout)
        elif self.model == 'rnn':
            model = cudnn_rnn_ops.CudnnRNNTanh(self.num_layers,
                                               self.hidden_size,
                                               self.input_size,
                                               dropout=self.dropout)
        else:
            raise ValueError("Invalid model: %s" % self.model)

        # cuDNN expects time-major input: [num_unrollings, batch, input_size].
        inputs = tf.transpose(inputs, perm=[1, 0, 2])

        # Zero-initialized input states: [num_layers, batch_size, hidden_size].
        input_h = constant_op.constant(np.zeros(
            [self.num_layers, self.batch_size, self.hidden_size]),
                                       dtype=tf.float32)
        has_input_c = (self.model == 'lstm')
        if has_input_c:
            input_c = constant_op.constant(np.zeros(
                [self.num_layers, self.batch_size, self.hidden_size]),
                                           dtype=tf.float32)

        # Set RNN params. The opaque buffer is sized by a Tensor, so the
        # variable's shape can't be validated at graph-build time.
        params_size_t = model.params_size()
        rand_params = random_ops.random_uniform([params_size_t])
        print("PARAMS size:", params_size_t)
        print("Input state:", input_h)
        print("Batch size:", batch_size)
        print("Hidden size:", self.hidden_size)
        params = variables.Variable(rand_params, validate_shape=False)
        args = {
            "input_data": inputs,
            "input_h": input_h,
            "params": params,
            "is_training": is_training
        }
        # Build cell
        if has_input_c:
            # LSTM returns (output, output_h, output_c).
            args["input_c"] = input_c
            outputs, final_state, final_cell = model(**args)
        else:
            # GRU / RNN-tanh return (output, output_h).
            outputs, final_state = model(**args)

        self.zero_state = state_ops.assign(
            params, array_ops.zeros([params_size_t]), validate_shape=False)

        self.initial_state = create_tuple_placeholders_with_default(
            self.zero_state, extra_dims=(None, ), shape=params_size_t.shape)

        print("Initial State:", self.initial_state)

        ########################

        self.final_state = final_state

        with tf.name_scope('flatten_outputs'):
            # Back to batch-major so rows align with the flattened targets,
            # then flatten into one [batch * time, hidden_size] matrix.
            outputs = tf.transpose(outputs, [1, 0, 2])
            flat_outputs = tf.reshape(outputs, [-1, hidden_size])

        with tf.name_scope('flatten_targets'):
            # Flatten the targets too.
            flat_targets = tf.reshape(tf.concat(axis=1, values=self.targets),
                                      [-1])

        # Create softmax parameters, weights and bias.
        with tf.variable_scope('softmax') as sm_vs:
            softmax_w = tf.get_variable("softmax_w", [hidden_size, vocab_size])
            softmax_b = tf.get_variable("softmax_b", [vocab_size])
            self.logits = tf.matmul(flat_outputs, softmax_w) + softmax_b
            self.probs = tf.nn.softmax(self.logits)

        with tf.name_scope('loss'):
            # Compute mean cross entropy loss for each output.
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits, labels=flat_targets)
            self.mean_loss = tf.reduce_mean(loss)

        with tf.name_scope('loss_monitor'):
            # Count the number of elements and the sum of mean_loss
            # from each batch to compute the average loss.
            count = tf.Variable(1.0, name='count')
            sum_mean_loss = tf.Variable(1.0, name='sum_mean_loss')

            self.reset_loss_monitor = tf.group(sum_mean_loss.assign(0.0),
                                               count.assign(0.0),
                                               name='reset_loss_monitor')
            self.update_loss_monitor = tf.group(
                sum_mean_loss.assign(sum_mean_loss + self.mean_loss),
                count.assign(count + 1),
                name='update_loss_monitor')
            with tf.control_dependencies([self.update_loss_monitor]):
                self.average_loss = sum_mean_loss / count
                self.ppl = tf.exp(self.average_loss)

            # Monitor the loss.
            loss_summary_name = "average loss"
            ppl_summary_name = "perplexity"

            average_loss_summary = tf.summary.scalar(loss_summary_name,
                                                     self.average_loss)
            ppl_summary = tf.summary.scalar(ppl_summary_name, self.ppl)

        # Monitor the loss.
        self.summaries = tf.summary.merge([average_loss_summary, ppl_summary],
                                          name='loss_monitor')

        self.global_step = tf.get_variable(
            'global_step', [], initializer=tf.constant_initializer(0.0))

        self.learning_rate = tf.constant(learning_rate)
        if is_training:
            # learning_rate = tf.train.exponential_decay(1.0, self.global_step,
            #                                            5000, 0.1, staircase=True)
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(
                tf.gradients(self.mean_loss, tvars), self.max_grad_norm)
            # optimizer = tf.train.GradientDescentOptimizer(learning_rate)
            # optimizer = tf.train.RMSPropOptimizer(learning_rate, decay_rate)
            optimizer = tf.train.AdamOptimizer(self.learning_rate)

            self.train_op = optimizer.apply_gradients(
                zip(grads, tvars), global_step=self.global_step)
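A training-step sketch for this model (hypothetical driver code, not from the source): feed one minibatch of ids and run the loss monitors together with the train op:

def run_train_step(session, model, x_batch, y_batch):
    # x_batch, y_batch: int64 arrays of shape [batch_size, num_unrollings].
    fetches = [model.mean_loss, model.update_loss_monitor, model.train_op]
    feed = {model.input_data: x_batch, model.targets: y_batch}
    loss, _, _ = session.run(fetches, feed)
    return loss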