Ejemplo n.º 1
0
    def create_model(self):
        """Build the RNN language-model graph and attach its pieces to ``self``.

        Reads ``batch_size``, ``seq_length``, ``rnn_size``, ``vocab_size``,
        ``num_layers``, ``is_training``, ``infer`` and ``grad_clip`` from
        ``self``. Creates the attributes ``input_data``, ``target_data``,
        ``keep_prob``, ``lr``, ``initial_state``, ``embedding``, ``logits``,
        ``probs``, ``cost``, ``final_state`` and ``train_op``.
        """
        self.input_data = tf.placeholder(tf.int32, [self.batch_size, self.seq_length], name="input_data")
        self.target_data = tf.placeholder(tf.int32, [self.batch_size, self.seq_length], name="target_data")

        # Hyper-parameters are kept as non-trainable graph variables.
        self.keep_prob = tf.Variable(0.3, trainable=False, name='keep_prob')
        # NOTE(review): self.lr is created here but the optimizer below uses a
        # fixed 0.01 learning rate -- confirm which one is intended.
        self.lr = tf.Variable(0.0, trainable=False, name="lr")

        # Output projection from the RNN output to vocabulary logits.
        softmax_weights = tf.get_variable("softmax_weights", [self.rnn_size, self.vocab_size])
        softmax_biases = tf.get_variable("softmax_biases", [self.vocab_size])

        # Only the input embeddings get dropout; no DropoutWrapper is applied
        # to the cell itself.
        lstm_cell = rnn_cell.BasicLSTMCell(self.rnn_size)
        multilayer_cell = rnn_cell.MultiRNNCell([lstm_cell] * self.num_layers)
        self.initial_state = multilayer_cell.zero_state(self.batch_size, tf.float32)

        with tf.device("/cpu:0"):
            # Embedding matrix for the whole vocabulary.
            self.embedding = tf.get_variable("embeddings", [self.vocab_size, self.rnn_size])
            # Vector representation of every input token.
            embeds = tf.nn.embedding_lookup(self.embedding, self.input_data)

        # self.keep_prob is a tf.Variable, so `self.keep_prob < 1` would build
        # a graph tensor rather than a Python bool and cannot drive an `if`.
        # Dropout is therefore applied whenever the model is training; the
        # keep probability itself stays adjustable through the variable.
        if self.is_training:
            embeds = tf.nn.dropout(embeds, self.keep_prob)

        def loop(prev, _):
            # Project the previous output to logits, pick the most likely
            # symbol (no gradient through the argmax) and embed it.
            prev = tf.nn.xw_plus_b(prev, softmax_weights, softmax_biases)
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(self.embedding, prev_symbol)

        # Turn the embedded batch into a list of seq_length per-step tensors.
        inputs = tf.split(1, self.seq_length, embeds)
        # Each split keeps a singleton axis 1; drop it.
        inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

        output, states = seq2seq.rnn_decoder(
            inputs, self.initial_state, multilayer_cell,
            loop_function=loop if self.infer else None, scope='rnnlm')

        # Concatenate the per-step outputs and flatten to rows of rnn_size.
        output = tf.reshape(tf.concat(1, output), [-1, self.rnn_size])

        self.logits = tf.nn.xw_plus_b(output, softmax_weights, softmax_biases)
        self.probs = tf.nn.softmax(self.logits, name="probability")

        # Every position carries the same weight (1.0) in the loss.
        loss = seq2seq.sequence_loss_by_example(
            [self.logits],
            [tf.reshape(self.target_data, [-1])],
            [tf.ones([self.batch_size * self.seq_length])],
            self.vocab_size)
        self.cost = tf.reduce_sum(loss) / (self.batch_size * self.seq_length)

        # Assumes rnn_decoder returns a per-step list of states -- TODO confirm
        # against the installed TensorFlow version.
        self.final_state = states[-1]

        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), self.grad_clip)

        optimizer = tf.train.AdamOptimizer(0.01)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
Ejemplo n.º 2
0
def seq2seq_f(cell, encoder_inputs, decoder_inputs, loop_output):
    """Run an encoder/decoder RNN pair over the given inputs.

    Args:
        cell: the RNNCell object used for both the encoder and the decoder
        encoder_inputs: a list of Tensors to feed the encoder
        decoder_inputs: a list of Tensors to feed the decoder
        loop_output: when True, the decoder is given a loop function that
            returns the previous output unchanged as the next input

    Returns:
        The pair (outputs, states) produced by the underlying decoder call.
    """
    if not loop_output:
        # Standard path: the decoder consumes decoder_inputs as given.
        outputs, states = seq2seq.basic_rnn_seq2seq(
            encoder_inputs, decoder_inputs, cell)
        return outputs, states

    def feed_back(prev, i):
        # Simplest construction: the previous output is the next input.
        return prev

    # The encoder is run by hand so the decoder can take a loop function.
    # Per the original author's note, rnn.rnn returns all hidden states, so
    # the decoder is seeded with the last one.
    _, encoder_states = rnn.rnn(cell, encoder_inputs, dtype=tf.float32)
    outputs, states = seq2seq.rnn_decoder(
        decoder_inputs, encoder_states[-1], cell, feed_back)
    return outputs, states
Ejemplo n.º 3
0
    def __init__(self, args, infer=False):
        """Assemble the language-model graph described by ``args``.

        ``args`` supplies ``model``, ``rnn_size``, ``num_layers``,
        ``batch_size``, ``seq_length``, ``vocab_size`` and ``grad_clip``.
        With ``infer=True`` the batch size and sequence length on ``args``
        are overwritten with 1 and the decoder is given a loop function that
        embeds the argmax of the previous projected output.

        Raises:
            Exception: if ``args.model`` names an unsupported cell type.
        """
        self.args = args
        if infer:
            # This mutates the caller's args object.
            args.batch_size = 1
            args.seq_length = 1

        # Pick the cell constructor and any extra keyword arguments for it.
        cell_kwargs = {}
        if args.model == 'rnn':
            cell_class = rnn_cell.BasicRNNCell
        elif args.model == 'gru':
            cell_class = rnn_cell.GRUCell
        elif args.model == 'lstm':
            cell_class = rnn_cell.BasicLSTMCell
        elif args.model == 'gridlstm':
            cell_class = grid_rnn.Grid2LSTMCell
            cell_kwargs = {'use_peepholes': True, 'forget_bias': 1.0}
        elif args.model == 'gridgru':
            cell_class = grid_rnn.Grid2GRUCell
        else:
            raise Exception("model type not supported: {}".format(args.model))

        single_layer = cell_class(args.rnn_size, **cell_kwargs)
        stacked = rnn_cell.MultiRNNCell([single_layer] * args.num_layers)
        self.cell = stacked

        io_shape = [args.batch_size, args.seq_length]
        self.input_data = tf.placeholder(tf.int32, io_shape)
        self.targets = tf.placeholder(tf.int32, io_shape)
        self.initial_state = stacked.zero_state(args.batch_size, tf.float32)

        with tf.variable_scope('rnnlm'):
            proj_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
            proj_b = tf.get_variable("softmax_b", [args.vocab_size])
            with tf.device("/cpu:0"):
                table = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
                embedded = tf.nn.embedding_lookup(table, self.input_data)
                # One tensor per time step, with the singleton axis removed.
                step_inputs = [tf.squeeze(piece, [1])
                               for piece in tf.split(1, args.seq_length, embedded)]

        def feed_argmax(prev, _):
            # Project to logits, take the argmax without gradient, embed it.
            scores = tf.nn.xw_plus_b(prev, proj_w, proj_b)
            best = tf.stop_gradient(tf.argmax(scores, 1))
            return tf.nn.embedding_lookup(table, best)

        if infer:
            loop_function = feed_argmax
        else:
            loop_function = None

        outputs, last_state = seq2seq.rnn_decoder(
            step_inputs, self.initial_state, stacked,
            loop_function=loop_function, scope='rnnlm')
        flat_output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
        self.logits = tf.nn.xw_plus_b(flat_output, proj_w, proj_b)
        self.probs = tf.nn.softmax(self.logits)

        # All positions are weighted equally in the loss.
        flat_targets = tf.reshape(self.targets, [-1])
        uniform_weights = tf.ones([args.batch_size * args.seq_length])
        loss = seq2seq.sequence_loss_by_example(
            [self.logits], [flat_targets], [uniform_weights], args.vocab_size)
        self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
        self.final_state = last_state

        # The learning rate lives in a non-trainable variable, initially 0.0.
        self.lr = tf.Variable(0.0, trainable=False)
        trainable = tf.trainable_variables()
        clipped, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost, trainable), args.grad_clip)
        self.train_op = tf.train.AdamOptimizer(self.lr).apply_gradients(
            zip(clipped, trainable))
Ejemplo n.º 4
0
Archivo: util.py Proyecto: hans/rlcomp
def embedding_rnn_decoder(decoder_inputs, initial_state, cell, num_symbols,
                          output_projection=None, feed_previous=False,
                          scope=None, embedding=None):
  """Embed integer decoder inputs and run them through ``seq2seq.rnn_decoder``.

  Args:
    decoder_inputs: a list of 1D batch-sized int32-Tensors (decoder inputs).
    initial_state: 2D Tensor [batch_size x cell.state_size].
    cell: rnn_cell.RNNCell defining the cell function.
    num_symbols: integer, how many symbols come into the embedding.
    output_projection: None or a pair (W, B) of output projection weights and
      biases; W must be compatible with [cell.output_size x num_symbols] and
      B with [num_symbols]. When given together with feed_previous=True,
      each fed-back output is first multiplied by W and has B added.
    feed_previous: Boolean; if True a loop function is handed to the decoder
      that computes
        next = embedding_lookup(embedding, argmax(previous_output)),
      i.e. greedy decoding (see http://arxiv.org/pdf/1506.03099v2.pdf for
      using this during training). If False, no loop function is used and
      decoder_inputs are consumed as given.
    scope: VariableScope for the created subgraph; defaults to
      "embedding_rnn_decoder".
    embedding: optional embedding matrix to look symbols up in. When None, a
      variable named "embedding" of shape [num_symbols, cell.input_size] is
      created on the CPU inside the scope.

  Returns:
    The (outputs, states) result of seq2seq.rnn_decoder applied to the
    embedded inputs: per the decoder's contract, outputs has one
    [batch_size x cell.output_size] Tensor per decoder input and states holds
    the decoder cell state for each time-step.

  Raises:
    ValueError: when output_projection has the wrong shape.
  """
  has_projection = output_projection is not None
  if has_projection:
    # Shape validation only; the converted tensors are not used afterwards.
    tf.convert_to_tensor(
        output_projection[0], dtype=tf.float32
    ).get_shape().assert_is_compatible_with([cell.output_size, num_symbols])
    tf.convert_to_tensor(
        output_projection[1], dtype=tf.float32
    ).get_shape().assert_is_compatible_with([num_symbols])

  with tf.variable_scope(scope or "embedding_rnn_decoder"):
    if embedding is None:
      with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [num_symbols, cell.input_size])

    def embed_best_symbol(prev, _):
      """Loop function: embed the argmax symbol of the previous output."""
      scores = prev
      if has_projection:
        scores = tf.nn.xw_plus_b(
            prev, output_projection[0], output_projection[1])
      # No gradient flows through the discrete symbol choice.
      symbol = tf.stop_gradient(tf.argmax(scores, 1))
      return tf.nn.embedding_lookup(embedding, symbol)

    embedded_inputs = [tf.nn.embedding_lookup(embedding, symbols)
                       for symbols in decoder_inputs]
    return seq2seq.rnn_decoder(
        embedded_inputs, initial_state, cell,
        loop_function=embed_best_symbol if feed_previous else None)
Ejemplo n.º 5
0
    def _init_seq2seq(self, encoder_inputs, decoder_inputs, cell, feed_previous):
        """Encode ``encoder_inputs`` and decode from the resulting state.

        When ``feed_previous`` is true the decoder is given a loop function
        that projects the previous output with ``self.w_softmax`` /
        ``self.b_softmax`` and returns a float mask that is 1.0 wherever a
        value equals its row maximum and 0.0 elsewhere.

        Returns:
            Whatever ``seq2seq.rnn_decoder`` returns.
        """

        def mask_of_row_max(prev, _):
            logits = tf.nn.xw_plus_b(prev, self.w_softmax, self.b_softmax)
            row_max = tf.reduce_max(logits, reduction_indices=[1], keep_dims=True)
            return tf.to_float(tf.equal(logits, row_max))

        if feed_previous:
            loop_function = mask_of_row_max
        else:
            loop_function = None

        with variable_scope.variable_scope('seq2seq'):
            _, encoder_state = rnn.rnn(cell, encoder_inputs, dtype=dtypes.float32)
            return seq2seq.rnn_decoder(
                decoder_inputs, encoder_state, cell, loop_function=loop_function)
Ejemplo n.º 6
0
    def __init__(self,
                 rnn_size,
                 num_layers,
                 vocab_size,
                 grad_clip,
                 batch_size=1,
                 seq_length=1):
        """Build a multi-layer LSTM language model graph.

        Stores ``cell``, ``input_data``, ``targets``, ``initial_state``,
        ``logits``, ``probs``, ``cost``, ``final_state``, ``lr`` and
        ``train_op`` on the instance. When both ``batch_size`` and
        ``seq_length`` are 1 the decoder is given a loop function that embeds
        the argmax of the previous projected output.
        """
        base_cell = rnn_cell.BasicLSTMCell(rnn_size)
        stacked = rnn_cell.MultiRNNCell([base_cell] * num_layers)
        self.cell = stacked

        self.input_data = tf.placeholder(tf.int32, [batch_size, seq_length])
        self.targets = tf.placeholder(tf.int32, [batch_size, seq_length])
        self.initial_state = stacked.zero_state(batch_size, tf.float32)

        with tf.variable_scope('rnnlm'):
            proj_w = tf.get_variable('softmax_w', [rnn_size, vocab_size])
            proj_b = tf.get_variable('softmax_b', [vocab_size])
            with tf.device('/cpu:0'):
                table = tf.get_variable('embedding', [vocab_size, rnn_size])
                embedded = tf.nn.embedding_lookup(table, self.input_data)
                # One tensor per time step, singleton axis removed.
                step_inputs = [tf.squeeze(piece, [1])
                               for piece in tf.split(1, seq_length, embedded)]

        def feed_argmax(prev, _):
            scores = tf.nn.xw_plus_b(prev, proj_w, proj_b)
            best = tf.stop_gradient(tf.argmax(scores, 1))
            return tf.nn.embedding_lookup(table, best)

        # The loop function is enabled only for the 1x1 configuration.
        single_step = batch_size == 1 and seq_length == 1
        if single_step:
            loop_fn = feed_argmax
        else:
            loop_fn = None

        outputs, last_state = seq2seq.rnn_decoder(
            step_inputs, self.initial_state, stacked,
            loop_function=loop_fn, scope='rnnlm')
        flat_output = tf.reshape(tf.concat(1, outputs), [-1, rnn_size])
        self.logits = tf.nn.xw_plus_b(flat_output, proj_w, proj_b)
        self.probs = tf.nn.softmax(self.logits)

        # Every position has weight 1.0 in the loss.
        flat_targets = tf.reshape(self.targets, [-1])
        uniform_weights = tf.ones([batch_size * seq_length])
        loss = seq2seq.sequence_loss_by_example(
            [self.logits], [flat_targets], [uniform_weights], vocab_size)
        self.cost = tf.reduce_sum(loss) / batch_size / seq_length
        self.final_state = last_state

        # Learning rate is a non-trainable variable, initially 0.0.
        self.lr = tf.Variable(0.0, trainable=False)
        trainable = tf.trainable_variables()
        clipped, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost, trainable), grad_clip)
        self.train_op = tf.train.AdamOptimizer(self.lr).apply_gradients(
            zip(clipped, trainable))
Ejemplo n.º 7
0
Archivo: model.py Proyecto: lacker/ai
  def __init__(self, args, sampling=False):
    """Build the multi-layer LSTM language model graph described by ``args``.

    ``args`` supplies ``rnn_size``, ``num_layers``, ``batch_size``,
    ``seq_length``, ``vocab_size`` and ``grad_clip``. With ``sampling=True``
    the batch size and sequence length on ``args`` are overwritten with 1
    and the decoder is given a loop function that embeds the argmax of the
    previous projected output.
    """
    self.args = args
    if sampling:
      # This mutates the caller's args object.
      args.batch_size = 1
      args.seq_length = 1

    single_layer = rnn_cell.BasicLSTMCell(args.rnn_size)
    self.cell = rnn_cell.MultiRNNCell([single_layer] * args.num_layers)

    io_shape = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.int32, io_shape)
    self.targets = tf.placeholder(tf.int32, io_shape)
    self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
      proj_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
      proj_b = tf.get_variable("softmax_b", [args.vocab_size])
      with tf.device("/cpu:0"):
        table = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
        embedded = tf.nn.embedding_lookup(table, self.input_data)
        # One tensor per time step, singleton axis removed.
        step_inputs = [tf.squeeze(piece, [1])
                       for piece in tf.split(1, args.seq_length, embedded)]

    def feed_argmax(prev, _):
      scores = tf.nn.xw_plus_b(prev, proj_w, proj_b)
      best = tf.stop_gradient(tf.argmax(scores, 1))
      return tf.nn.embedding_lookup(table, best)

    if sampling:
      loop_function = feed_argmax
    else:
      loop_function = None

    outputs, last_state = seq2seq.rnn_decoder(
      step_inputs, self.initial_state, self.cell,
      loop_function=loop_function, scope='rnnlm')

    flat_output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    self.logits = tf.nn.xw_plus_b(flat_output, proj_w, proj_b)
    self.probs = tf.nn.softmax(self.logits)

    # Every position has weight 1.0 in the loss.
    flat_targets = tf.reshape(self.targets, [-1])
    uniform_weights = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
      [self.logits], [flat_targets], [uniform_weights], args.vocab_size)
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    # Learning rate is a non-trainable variable, initially 0.0.
    self.lr = tf.Variable(0.0, trainable=False)
    trainable = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(
      tf.gradients(self.cost, trainable), args.grad_clip)
    self.train_op = tf.train.AdamOptimizer(self.lr).apply_gradients(
      zip(clipped, trainable))
Ejemplo n.º 8
0
def generator(input_data, args, reuse=False):
	'''
	Produce a probability sequence from the provided input sequence.

	args:
		input_data:  ids looked up in the embedding matrix and split into
			args.seq_length steps along axis 1 (presumably integer token
			ids -- verify against the caller)
		args:  settings object; reads model ('rnn', 'gru' or 'lstm'),
			rnn_size, num_layers, batch_size, seq_length and vocab_size
		reuse:  forwarded to the enclosing 'generator' variable scope

	returns:
		probs:   [args.batch_size, args.seq_length, args.vocab_size]

	raises:
		Exception: if args.model is not one of the supported cell types
	'''
	# NOTE(review): `args` is passed positionally as the second argument of
	# tf.variable_scope; confirm this matches the installed TF signature.
	with tf.variable_scope('generator', args, reuse = reuse):
		# The branches must be mutually exclusive. With independent `if`
		# statements the trailing `else` belonged only to the 'lstm' test,
		# so 'rnn' and 'gru' always raised.
		if args.model == 'rnn':
			cell = rnn_cell.BasicRNNCell(args.rnn_size)
		elif args.model == 'gru':
			cell = rnn_cell.GRUCell(args.rnn_size)
		elif args.model == 'lstm':
			cell = rnn_cell.BasicLSTMCell(args.rnn_size)
		else:
			raise Exception('model type not supported: {}'.format(args.model))
		cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)
		initial_state = cell.zero_state(args.batch_size, tf.float32)

		with tf.variable_scope('rnn'):
			softmax_w = tf.get_variable('softmax_w', [args.rnn_size, args.vocab_size])
			softmax_b = tf.get_variable('softmax_b', [args.vocab_size])

			with tf.device('/cpu:0'):
				embedding  = tf.get_variable('embedding', [args.vocab_size, args.rnn_size])
				# One tensor per time step, singleton axis removed.
				inputs = tf.split(1, args.seq_length, tf.nn.embedding_lookup(embedding, input_data))
				inputs = [tf.squeeze(i, [1]) for i in inputs]

		def loop(prev, _):
			# Project to logits, take the argmax without gradient, embed it.
			prev = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
			prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
			return tf.nn.embedding_lookup(embedding, prev_symbol)

		# NOTE(review): `is_training` is not a parameter of this function; it
		# is presumably a module-level name -- confirm it is defined.
		outputs, last_state = seq2seq.rnn_decoder(inputs, initial_state, cell,
				loop_function=None if is_training else loop, scope='rnn')

		#  Dim: [args.batch_size * args.seq_length, args.rnn_size]
		output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
		#  Dim: [args.batch_size * args.seq_length, args.vocab_size]
		logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
		probs  = tf.nn.softmax(logits)
		# Dim:  [args.batch_size, args.seq_length, args.vocab_size]
		probs  = tf.reshape(probs, [args.batch_size, args.seq_length, args.vocab_size])
		return probs
Ejemplo n.º 9
0
def model():
    """Unroll an LSTM over ``glimpses`` steps and return the per-step outputs.

    Uses the module-level names ``batch_size``, ``cell_size``, ``g_size``,
    ``cell_out_size``, ``glimpses``, ``get_glimpse`` and ``get_next_input``.
    """
    # Random starting location, uniformly drawn from [-1, 1) per coordinate.
    start_loc = tf.random_uniform((batch_size, 2), minval=-1, maxval=1)
    first_glimpse = get_glimpse(start_loc)

    cell = rnn_cell.LSTMCell(cell_size, g_size, num_proj=cell_out_size)
    zero_state = cell.zero_state(batch_size, tf.float32)

    # Only the first input is a real tensor; the remaining slots are filled
    # with 0 (presumably replaced by the loop function -- TODO confirm).
    step_inputs = [first_glimpse] + [0] * (glimpses - 1)

    outputs, _ = seq2seq.rnn_decoder(
        step_inputs, zero_state, cell, loop_function=get_next_input)
    # The return value is discarded; presumably called for its side effects
    # on the last output -- verify against get_next_input.
    get_next_input(outputs[-1], 0)

    return outputs
Ejemplo n.º 10
0
  def testRNNDecoder(self):
    """Check the counts and shapes of rnn_decoder's outputs and states."""
    with self.test_session() as sess:
      with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)):
        # Encoder: a GRU with 2 units run over two constant [2, 2] inputs.
        encoder_inputs = [tf.constant(0.5, shape=[2, 2]) for _ in xrange(2)]
        encoder_cell = rnn_cell.GRUCell(2)
        _, encoder_states = rnn.rnn(encoder_cell, encoder_inputs, dtype=tf.float32)

        # Decoder: a GRU projected to 4 outputs, run over three inputs.
        decoder_inputs = [tf.constant(0.4, shape=[2, 2]) for _ in xrange(3)]
        decoder_cell = rnn_cell.OutputProjectionWrapper(rnn_cell.GRUCell(2), 4)
        decoded, memory = seq2seq.rnn_decoder(
            decoder_inputs, encoder_states[-1], decoder_cell)
        sess.run([tf.initialize_all_variables()])

        decoded_values = sess.run(decoded)
        self.assertEqual(len(decoded_values), 3)
        self.assertEqual(decoded_values[0].shape, (2, 4))

        memory_values = sess.run(memory)
        self.assertEqual(len(memory_values), 4)
        self.assertEqual(memory_values[0].shape, (2, 2))
Ejemplo n.º 11
0
    def __init__(self, config, is_training):
        """Build an LSTM graph with a one-unit sigmoid cross-entropy output.

        ``config`` supplies ``batch_size``, ``num_steps``, ``hidden_size``,
        ``num_layers``, ``keep_prob`` and ``max_grad_norm``. The training
        attributes ``lr`` and ``train_op`` are created only when
        ``is_training`` is true.
        """
        batch_size = config.batch_size
        self.batch_size = batch_size
        num_steps = config.num_steps
        self.num_steps = num_steps
        hidden = config.hidden_size

        base_cell = rnn_cell.BasicLSTMCell(hidden, forget_bias=0.0)
        stacked = rnn_cell.MultiRNNCell([base_cell] * config.num_layers)
        # Output dropout is added only when training with keep_prob below 1.
        if is_training and config.keep_prob < 1:
            stacked = rnn_cell.DropoutWrapper(stacked, output_keep_prob=config.keep_prob)
        self.cell = stacked

        self.input_data = tf.placeholder(dtype=tf.float32, shape=[None, num_steps, 1])
        self.target_data = tf.placeholder(dtype=tf.float32, shape=[None, num_steps, 1])
        self.initial_state = stacked.zero_state(batch_size=config.batch_size, dtype=tf.float32)

        # One tensor per time step, with the step axis squeezed away.
        step_inputs = [tf.squeeze(piece, [1])
                       for piece in tf.split(1, num_steps, self.input_data)]

        with tf.variable_scope('rnnvm'):
            output_w = tf.get_variable("output_w", [hidden, 1])
            output_b = tf.get_variable("output_b", [1])

        outputs, states = seq2seq.rnn_decoder(
            step_inputs, self.initial_state, stacked, scope='rnnvm')

        flat_output = tf.reshape(tf.concat(1, outputs), [-1, hidden])
        logits = tf.nn.xw_plus_b(flat_output, output_w, output_b)

        flat_targets = tf.reshape(self.target_data, shape=[num_steps * batch_size, 1])
        entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits, flat_targets)

        cost = tf.reduce_mean(entropy)
        self.cost = cost
        self.final_state = states[-1]

        if is_training:
            # Learning rate is a non-trainable variable, initially 0.0.
            self.lr = tf.Variable(0.0, trainable=False)
            trainable = tf.trainable_variables()
            clipped, _ = tf.clip_by_global_norm(
                tf.gradients(cost, trainable), config.max_grad_norm)
            self.train_op = tf.train.AdamOptimizer(self.lr).apply_gradients(
                zip(clipped, trainable))
Ejemplo n.º 12
0
    def inference(self, input_data):
        """
        Build out the graph enough to make predictions.

        input_data - a batch of sequences to predict.  Tensor of size
            [batch_size, input_channels, sequence_length]

        Stores the decoder results on ``self.outputs`` / ``self.states`` and
        their last elements on ``self.final_output`` / ``self.final_state``.

        :return: the tuple (self.outputs, self.states)
        """
        # Slice along the last axis into one tensor per step, dropping the
        # size-1 axis each slice is left with.
        step_inputs = [
            tf.squeeze(piece, squeeze_dims=[2])
            for piece in tf.split(2, self.sequence_length, input_data)
        ]

        # The fourth positional argument (the loop function) is None.
        self.outputs, self.states = seq2seq.rnn_decoder(
            step_inputs, self.initial_state, self.cell, None, scope='inference')

        # TODO: clean up organization
        self.final_state = self.states[-1]
        self.final_output = self.outputs[-1]

        return self.outputs, self.states
Ejemplo n.º 13
0
############
# Generator graph: embeds input_data, runs it through cell_gen and turns the
# RNN outputs into per-step probabilities over the vocabulary.
# Relies on args, input_data, initial_state_gen, cell_gen and is_training
# being defined outside this fragment.
with tf.variable_scope('rnn_generator'):
	# Output projection from RNN output to vocabulary logits.
	softmax_w = tf.get_variable('softmax_w', [args.rnn_size, args.vocab_size])
	softmax_b = tf.get_variable('softmax_b', [args.vocab_size])
	
	with tf.device('/cpu:0'):
		embedding  = tf.get_variable('embedding', [args.vocab_size, args.rnn_size])
		# Split the embedded batch into args.seq_length per-step tensors,
		# then drop the singleton axis each split keeps.
		inputs_gen = tf.split(1, args.seq_length, tf.nn.embedding_lookup(embedding, input_data))
		inputs_gen = [tf.squeeze(i, [1]) for i in inputs_gen]

def loop(prev, _):
	# Loop function for the decoder: project the previous output to logits,
	# take the argmax (no gradient through it) and return its embedding.
	prev = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
	prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
	return tf.nn.embedding_lookup(embedding, prev_symbol)

# The loop function is only used when is_training is false.
outputs_gen, last_state = seq2seq.rnn_decoder(inputs_gen, initial_state_gen, 
	cell_gen, loop_function=None if is_training else loop, scope='rnn_generator')

#  Dim: [args.batch_size * args.seq_length, args.rnn_size]
output_gen = tf.reshape(tf.concat(1, outputs_gen), [-1, args.rnn_size])

#  Dim: [args.batch_size * args.seq_length, args.vocab_size]
logits_gen = tf.nn.xw_plus_b(output_gen, softmax_w, softmax_b)
gen_probs  = tf.nn.softmax(logits_gen)
# Dim: [args.batch_size, args.seq_length, args.vocab_size]
gen_probs  = tf.reshape(gen_probs, [args.batch_size, args.seq_length, args.vocab_size])

################
# Discriminator
################
# Pass a tensor of *probabilities* over the characters to the Discriminator
with tf.variable_scope('rnn_discriminator'):
	softmax_w = tf.get_variable('softmax_w', [args.rnn_size, 2], trainable = False)
Ejemplo n.º 14
0
  return(X,y)
  
# Graph inputs: one [batch_size, 1] placeholder per time step, one target
# value per batch element, and the dropout keep probability.
# Relies on batch_size, seq_len, hidden_size, num_layers and input_size being
# defined outside this fragment.
with tf.name_scope("Placeholders") as scope:
  inputs = [tf.placeholder(tf.float32,shape=[batch_size,1]) for _ in range(seq_len)]
  target = tf.placeholder(tf.float32, shape=[batch_size])
  keep_prob = tf.placeholder("float")		
		
# Stacked LSTM with dropout applied to the output of the whole stack.
with tf.name_scope("Cell") as scope:
  cell = rnn_cell.BasicLSTMCell(hidden_size)
  cell = rnn_cell.MultiRNNCell([cell] * num_layers)
  cell = rnn_cell.DropoutWrapper(cell,output_keep_prob=keep_prob)			
  initial_state = cell.zero_state(batch_size, tf.float32)


with tf.name_scope("RNN") as scope:
  outputs, states = seq2seq.rnn_decoder(inputs, initial_state, cell)
  # Only the output of the last time step feeds the prediction.
  final = outputs[-1]

# Linear read-out from the last RNN output.
with tf.name_scope("Output") as scope:
  W_o = tf.Variable(tf.random_normal([hidden_size,input_size], stddev=0.01))     
  b_o = tf.Variable(tf.random_normal([input_size], stddev=0.01))
  prediction = tf.matmul(final, W_o) + b_o

with tf.name_scope("Optimization") as scope:
  # cost is the element-wise squared error (not reduced to a scalar);
  # loss is its sum.
  cost = tf.pow(tf.sub(tf.reshape(prediction, [-1]), target),2)
  train_op = tf.train.RMSPropOptimizer(0.005, 0.2).minimize(cost)
  loss = tf.reduce_sum(cost)

#Validation Data
X_val,y_val = generate_data(5,seq_len,batch_size)
# Split the squeezed validation inputs into seq_len pieces along axis 1,
# presumably one per input placeholder -- verify against the feed code.
X_val = np.split(np.squeeze(X_val),seq_len,axis=1)
Ejemplo n.º 15
0
# Builds the embedding -> RNN decoder part of a language-model graph.
# Relies on x, cell, batch_size, seq_size, hidden_size and input_size being
# defined outside this fragment.
y = tf.placeholder(tf.int32, [None, seq_size])
initial_state = cell.zero_state(batch_size, tf.float32)

#with tf.variable_scope('rnn'):
# Output projection used by the loop function below.
w = tf.get_variable('softmax_w', [hidden_size, input_size])
b = tf.get_variable('softmax_b', [input_size])
#with tf.device('/cpu:0'):

# [input_size x hidden_size]
embed = tf.get_variable('embed', [input_size, hidden_size])

# [batch_size x seq_size x hidden_size]
input_set = tf.nn.embedding_lookup(embed, x)
# [batch_size x 1 x hidden_size] x seq_size
inputs = tf.split(1, seq_size, input_set)
# [batch_size x hidden_size] x seq_size
inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

def loop(prev, _):
    # Loop function for the decoder: project the previous output to logits,
    # take the argmax (no gradient through it) and return its embedding.
    prev = tf.nn.xw_plus_b(prev, w, b)
    prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
    # [batch_size x hidden_size]
    return tf.nn.embedding_lookup(embed, prev_symbol)

# With infer False the decoder gets no loop function.
infer = False

# outputs : [batch_size x hidden_size]
# states : [batch_size x state_size]
outputs, states = seq2seq.rnn_decoder(inputs, initial_state, cell, loop_function=loop if infer else None, scope='rnn')
# Concatenate the per-step outputs and flatten to rows of hidden_size.
output = tf.reshape(tf.concat(1, outputs), [-1, hidden_size])
Ejemplo n.º 16
0
    def __init__(self, args):
        """Build the graph for a multi-layer recurrent sequence model.

        Args:
            args: configuration object. This method reads ``rnn_size``,
                ``n_steps``, ``batch_size``, ``input_dim``, ``num_layers``,
                ``cell_type`` ("srnn", "lstm", "lstmp" or "cw") and
                ``rnn_type`` ("rnn" or "seq2seq").

        Raises:
            ValueError: if ``args.cell_type`` or ``args.rnn_type`` is not one
                of the supported values.
        """
        self.size = args.rnn_size
        self.n_steps = args.n_steps
        self.batch_size = args.batch_size
        self.input_dim = args.input_dim
        self.num_layers = args.num_layers

        # Only the "lstmp" cell below consumes this initializer.
        initializer = tf.random_uniform_initializer(-0.8, 0.8)

        # Time-major input sequence, provided at run time.
        self.seq_input = tf.placeholder(tf.float32, [self.n_steps, self.batch_size, self.input_dim])
        # Time step to stop at (handed to rnn.rnn as sequence_length).
        self.early_stop = tf.placeholder(tf.int32)

        # The RNN helpers need a list with one [batch_size, input_dim] tensor
        # per time step; tf.split keeps the split axis, so reshape it away.
        self.inputs = [tf.reshape(i, (self.batch_size, self.input_dim)) for i in tf.split(0, self.n_steps, self.seq_input)]
        self.result = tf.placeholder(tf.float32, [None, self.input_dim])

        if args.cell_type == "srnn":
            cell = BasicRNNCell(self.size)
        elif args.cell_type == "lstm":
            cell = BasicLSTMCell(self.size, forget_bias=1.0)
        elif args.cell_type == "lstmp":
            cell = LSTMCell(self.size, self.input_dim, initializer=initializer)
        elif args.cell_type == "cw":
            cell = CWRNNCell(self.size, [1, 4, 16, 64])
        else:
            # Previously an unknown type left `cell` unbound and failed later
            # with an unrelated NameError.
            raise ValueError("cell type not supported: {}".format(args.cell_type))

        self.cell = cell = rnn_cell.MultiRNNCell([cell] * self.num_layers)

        # Start from a small random state rather than cell.zero_state(...).
        self.initial_state = tf.random_uniform([self.batch_size, self.cell.state_size], -0.1, 0.1)

        # network type
        if args.rnn_type == "rnn":
            self.outputs, self.states = rnn.rnn(self.cell, self.inputs,
                                                initial_state=self.initial_state,
                                                sequence_length=self.early_stop)
        elif args.rnn_type == "seq2seq":
            # No feedback loop: the decoder is always fed the given inputs.
            self.outputs, self.states = seq2seq.rnn_decoder(self.inputs,
                                                            self.initial_state,
                                                            self.cell,
                                                            loop_function=None)
        else:
            raise ValueError("rnn type not supported: {}".format(args.rnn_type))
        self.final_state = self.states[-1]

        # Linear read-out producing one value per (example, time step).
        self.W_o = tf.Variable(tf.random_normal([self.size, 1], stddev=0.01))
        self.b_o = tf.Variable(tf.random_normal([1], stddev=0.01))

        # Single-argument print: same text under Python 2 and Python 3.
        print("type(outputs) {}".format(type(self.outputs)))
        self.output_cat = tf.reshape(tf.concat(1, self.outputs), [-1, self.size])
        self.output = tf.nn.xw_plus_b(self.output_cat, self.W_o, self.b_o)
        # NOTE(review): the read-out has a single column, so this reshape only
        # matches element counts when input_dim == 1 -- confirm with callers.
        self.output2 = tf.reshape(self.output, [self.batch_size, self.n_steps, self.input_dim])
        # Add Gaussian noise to the reshaped output.
        self.output2 = self.output2 + tf.random_normal([self.batch_size, self.n_steps, self.input_dim], stddev=0.05)
        # Back to time-major layout: [n_steps, batch_size, input_dim].
        self.output2 = tf.transpose(self.output2, [1, 0, 2])
Ejemplo n.º 17
0
    def __init__(self, args):
        """Build a token-classification RNN graph with loss and update ops.

        Args:
            args: configuration object. This method reads ``model`` ('rnn' or
                'lstm'), ``rnn_size``, ``num_layers``, ``batch_size``,
                ``seq_length``, ``emb_vocab`` (``None`` or a dict whose values
                are ``(index, embedding_vector)`` pairs), ``vocab_size``,
                ``num_classes``, ``grad_clip``, ``learning_rate`` and
                ``l2_limit``.

        An unsupported ``args.model`` prints "Invalid cell" and exits the
        process via ``sys.exit()``.
        """
        # define cell
        if args.model == 'rnn':
            cell_fn = rnn_cell.BasicRNNCell
        elif args.model == 'lstm':
            cell_fn = rnn_cell.BasicLSTMCell
        else:
            # Single-argument print: same text under Python 2 and Python 3.
            print("Invalid cell")
            sys.exit()

        cell = cell_fn(args.rnn_size)
        cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)

        # define inputs and targets, initialize state
        self.inputs = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
        self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        self.initial_state = cell.zero_state(args.batch_size, tf.float32)

        # prepare word embedding, reshape inputs
        with tf.name_scope("embedding"):
            with tf.device("/cpu:0"):
                if args.emb_vocab is None:
                    E = tf.get_variable("E", [args.vocab_size, args.rnn_size])
                else:
                    # list(...) keeps this working on both Python 2 lists and
                    # Python 3 dict views.
                    vocab_entries = list(args.emb_vocab.values())
                    emb_dim = len(vocab_entries[0][1])
                    # Rows without a pre-trained vector keep their random init.
                    emb_mat = np.random.rand(args.vocab_size, emb_dim)
                    for idx, emb_vec in vocab_entries:
                        emb_mat[idx] = emb_vec
                    E = tf.Variable(tf.convert_to_tensor(emb_mat, dtype=tf.float32), name="E")

                # One [batch_size, emb] tensor per time step.
                inputs = tf.split(1, args.seq_length, tf.nn.embedding_lookup(E, self.inputs))
                inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

        # feed inputs into rnn
        with tf.name_scope("rnn"):
            outputs, states = seq2seq.rnn_decoder(inputs, self.initial_state, cell, loop_function=None, scope='rnnlm')
            self.output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])

        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.output, self.dropout_keep_prob)

        # output layer
        with tf.name_scope("output"):
            W = tf.Variable(tf.truncated_normal([args.rnn_size, args.num_classes], stddev=0.1), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[args.num_classes]), name="b")
            self.logits = tf.nn.xw_plus_b(self.h_drop, W, b)
            self.probs = tf.nn.softmax(self.logits)
            self.predictions = tf.cast(tf.argmax(self.logits, 1), tf.int32)

        # accuracy
        with tf.name_scope("accuracy"):
            # calculate token-level accuracy
            self.reshaped_targets = tf.reshape(self.targets, [-1])
            correct_predictions = tf.equal(self.predictions, self.reshaped_targets)
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))

            # calculate sentence-level accuracy: a sentence counts as correct
            # only when all seq_length of its tokens are correct.
            self.predictions_sentence = tf.reshape(self.predictions, [-1, args.seq_length])  # batch_size * seq_length
            correct_predictions_sentence_tokens = tf.equal(self.predictions_sentence, self.targets)  # batch_size X seq_length
            # Multiplying by a column of ones sums the correct tokens per row.
            multiply_mat = tf.constant(1, shape=[args.seq_length, 1])
            sentence_accuracy_mat = tf.matmul(tf.cast(correct_predictions_sentence_tokens, tf.int32), multiply_mat)  # batch_size X 1
            correct_predictions_sentence = \
                tf.equal(sentence_accuracy_mat, tf.constant(args.seq_length, shape=[args.batch_size, 1]))  # batch_size X 1
            self.accuracy_sentence = tf.reduce_mean(tf.cast(correct_predictions_sentence, "float"))

        # calculate loss
        with tf.name_scope("loss"):
            self.loss = seq2seq.sequence_loss_by_example(
                    [self.logits],  # TODO: should I use a list of 2D tensors ?
                    [self.reshaped_targets],  # TODO: correct ???
                    [tf.ones([args.batch_size * args.seq_length])],
                    args.num_classes)
            self.cost = tf.reduce_sum(self.loss) / args.batch_size / args.seq_length

        # train and update
        with tf.name_scope("update"):
            tvars = tf.trainable_variables()
            self.grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), args.grad_clip)  # TODO: correct ???
            optimizer = tf.train.AdamOptimizer(args.learning_rate)
            self.global_step = tf.Variable(0, name="global_step", trainable=False)
            self.train_op = optimizer.apply_gradients(zip(self.grads, tvars), global_step=self.global_step)

            # l2 norm clipping: ops that clip the output-layer weight matrix
            # to args.l2_limit. Reuses the list fetched above.
            self.weight_clipping_op = []
            for var in tvars:
                if var.name.startswith('output/W'):
                    updated_var = tf.clip_by_norm(var, args.l2_limit)
                    self.weight_clipping_op.append(tf.assign(var, updated_var))
Ejemplo n.º 18
0
  def __init__(self, args, infer=False):
    """Assemble the graph of a one-dimensional mixture-density RNN.

    The network emits, per step, `num_mixture` mixture weights followed by a
    mean and a log-sigma block for each of the `self.dim` output dimensions,
    and is trained on the negative log-likelihood of the targets.

    Args:
      args: configuration object; reads `model` ('rnn', 'gru' or 'lstm'),
        `rnn_size`, `num_layers`, `keep_prob`, `batch_size`, `seq_length`,
        `num_mixture` and `grad_clip`. When `infer` is true its `batch_size`
        and `seq_length` are overwritten with 1.
      infer: build the single-step graph and skip dropout.

    Raises:
      Exception: if `args.model` is not a supported cell type.
    """
    self.dim = 1
    self.args = args
    if infer:
      args.batch_size = 1
      args.seq_length = 1

    if args.model == 'rnn':
      cell_fn = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
      cell_fn = rnn_cell.GRUCell
    elif args.model == 'lstm':
      cell_fn = rnn_cell.BasicLSTMCell
    else:
      raise Exception("model type not supported: {}".format(args.model))

    # The same cell object is repeated for every layer of the stack.
    stacked_cell = rnn_cell.MultiRNNCell([cell_fn(args.rnn_size)] * args.num_layers)

    training = (infer == False)
    if training and args.keep_prob < 1:
      stacked_cell = rnn_cell.DropoutWrapper(stacked_cell, output_keep_prob=args.keep_prob)

    self.cell = stacked_cell

    sequence_shape = [None, args.seq_length, self.dim]
    self.input_data = tf.placeholder(dtype=tf.float32, shape=sequence_shape)
    self.target_data = tf.placeholder(dtype=tf.float32, shape=sequence_shape)
    self.initial_state = stacked_cell.zero_state(batch_size=args.batch_size, dtype=tf.float32)

    self.num_mixture = args.num_mixture
    # Per mixture component: one weight plus (mu, sig) for every dimension.
    n_out = self.num_mixture * (1 + 2 * self.dim)

    with tf.variable_scope('rnnlm'):
      output_w = tf.get_variable("output_w", [args.rnn_size, n_out])
      output_b = tf.get_variable("output_b", [n_out])

    self.w = output_w

    # One [batch, dim] tensor per time step for the decoder.
    inputs = []
    for step_slice in tf.split(1, args.seq_length, self.input_data):
      inputs.append(tf.squeeze(step_slice, [1]))

    outputs, states = seq2seq.rnn_decoder(inputs, self.initial_state, stacked_cell, loop_function=None, scope='rnnlm')
    hidden = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    output = tf.nn.xw_plus_b(hidden, output_w, output_b)
    self.final_state = states

    # Flatten the targets so their rows line up with the prediction rows.
    x_data = tf.reshape(self.target_data, [-1, self.dim])

    def gaussian_pdf(x, mu, sig):
      """Density of N(mu, sig^2) evaluated at x, element-wise."""
      exponent = -tf.square(x - mu) / (2 * tf.square(sig))
      return tf.exp(exponent) / (sig * tf.sqrt(2 * np.pi))

    # Mixture weights: softmax with the row maximum subtracted first.
    pi_logits = output[:, 0:self.num_mixture]
    row_max = tf.reduce_max(pi_logits, 1, keep_dims=True)
    pi_unnormalized = tf.exp(tf.sub(pi_logits, row_max))
    pi_scale = tf.inv(tf.reduce_sum(pi_unnormalized, 1, keep_dims=True))
    self.pi = pi_scale * pi_unnormalized

    output_each_dim = tf.split(1, self.dim, output[:, self.num_mixture:])

    mu_parts = []
    sig_parts = []
    total_cost = 0

    for i in range(self.dim):
      o_mu, o_sig = tf.split(1, 2, output_each_dim[i])
      # The network predicts log-sigma; exponentiate to keep sigma positive.
      o_sig = tf.exp(o_sig)

      mu_parts.append(o_mu)
      sig_parts.append(o_sig)

      # Negative log-likelihood of the targets under the mixture; the
      # likelihood is floored at 1e-20 before the log.
      component_density = gaussian_pdf(x_data[:, i:i+1], o_mu, o_sig)
      mixture_density = tf.reduce_sum(component_density * self.pi, 1, keep_dims=True)
      neg_log_lik = -tf.log(tf.maximum(mixture_density, 1e-20))
      lossfunc = tf.reduce_sum(neg_log_lik)
      total_cost += lossfunc / (args.batch_size * args.seq_length * self.dim)

    self.cost = total_cost
    self.mu = tf.concat(1, mu_parts)
    self.sig = tf.concat(1, sig_parts)

    self.loss_summary = tf.scalar_summary("loss", self.cost)
    self.summary = tf.merge_all_summaries()

    # Non-trainable learning-rate variable, initialised to 0.0.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
Ejemplo n.º 19
0
  def __init__(self, args, infer=False):
    """Build a mixture-density RNN over (x1, x2, end-of-stroke) triples.

    Follows eq. 18-26 of http://arxiv.org/abs/1308.0850: per step the network
    outputs one end-of-stroke logit plus, for each of `num_mixture`
    components, a weight, two means, two log-sigmas and a correlation.

    Args:
      args: configuration object; reads `model` ('rnn', 'gru' or 'lstm'),
        `rnn_size`, `num_layers`, `keep_prob`, `batch_size`, `seq_length`,
        `num_mixture` and `grad_clip`. When `infer` is true its `batch_size`
        and `seq_length` are overwritten with 1.
      infer: build the single-step graph and skip dropout.

    Raises:
      Exception: if `args.model` is not a supported cell type.
    """
    self.args = args
    if infer:
      args.batch_size = 1
      args.seq_length = 1

    if args.model == 'rnn':
      cell_fn = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
      cell_fn = rnn_cell.GRUCell
    elif args.model == 'lstm':
      cell_fn = rnn_cell.BasicLSTMCell
    else:
      raise Exception("model type not supported: {}".format(args.model))

    cell = cell_fn(args.rnn_size)

    cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)

    if (infer == False and args.keep_prob < 1): # training mode
      cell = rnn_cell.DropoutWrapper(cell, output_keep_prob = args.keep_prob)

    self.cell = cell

    self.input_data = tf.placeholder(dtype=tf.float32, shape=[None, args.seq_length, 3])
    self.target_data = tf.placeholder(dtype=tf.float32, shape=[None, args.seq_length, 3])
    self.initial_state = cell.zero_state(batch_size=args.batch_size, dtype=tf.float32)

    self.num_mixture = args.num_mixture
    NOUT = 1 + self.num_mixture * 6 # end_of_stroke + prob + 2*(mu + sig) + corr

    with tf.variable_scope('rnnlm'):
      output_w = tf.get_variable("output_w", [args.rnn_size, NOUT])
      output_b = tf.get_variable("output_b", [NOUT])

    # One [batch, 3] tensor per time step for the decoder.
    inputs = tf.split(1, args.seq_length, self.input_data)
    inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    outputs, last_state = seq2seq.rnn_decoder(inputs, self.initial_state, cell, loop_function=None, scope='rnnlm')
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    output = tf.nn.xw_plus_b(output, output_w, output_b)
    self.final_state = last_state

    # reshape target data so that it is compatible with prediction shape
    flat_target_data = tf.reshape(self.target_data,[-1, 3])
    [x1_data, x2_data, eos_data] = tf.split(1, 3, flat_target_data)

    def tf_2d_normal(x1, x2, mu1, mu2, s1, s2, rho):
      """Bivariate normal density with correlation rho (eq. 24 and 25)."""
      norm1 = tf.sub(x1, mu1)
      norm2 = tf.sub(x2, mu2)
      s1s2 = tf.mul(s1, s2)
      z = tf.square(tf.div(norm1, s1))+tf.square(tf.div(norm2, s2))-2*tf.div(tf.mul(rho, tf.mul(norm1, norm2)), s1s2)
      negRho = 1-tf.square(rho)
      result = tf.exp(tf.div(-z,2*negRho))
      denom = 2*np.pi*tf.mul(s1s2, tf.sqrt(negRho))
      result = tf.div(result, denom)
      return result

    def get_lossfunc(z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr, z_eos, x1_data, x2_data, eos_data):
      """Summed negative log-likelihood of the targets (eq. 26)."""
      result0 = tf_2d_normal(x1_data, x2_data, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr)
      # Floor applied before every log so an exactly-zero likelihood cannot
      # produce an infinite loss.
      epsilon = 1e-20
      result1 = tf.mul(result0, z_pi)
      result1 = tf.reduce_sum(result1, 1, keep_dims=True)
      result1 = -tf.log(tf.maximum(result1, epsilon)) # at the beginning, some errors are exactly zero.

      # Bernoulli likelihood of the end-of-stroke flag. A saturated sigmoid
      # can make it exactly zero as well, so it gets the same floor.
      result2 = tf.mul(z_eos, eos_data) + tf.mul(1-z_eos, 1-eos_data)
      result2 = -tf.log(tf.maximum(result2, epsilon))

      result = result1 + result2
      return tf.reduce_sum(result)

    # below is where we need to do MDN splitting of distribution params
    def get_mixture_coef(output):
      """Slice the raw output into MDN parameters (eq. 18 -> 23).

      Returns [pi, mu1, mu2, sigma1, sigma2, corr, eos] after applying the
      softmax / exp / tanh / sigmoid transforms.
      """
      z = output
      z_eos = z[:, 0:1]
      z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr = tf.split(1, 6, z[:, 1:])

      # end of stroke signal
      z_eos = tf.sigmoid(z_eos) # should be negated, but doesn't matter.

      # softmax all the pi's (row maximum subtracted before exponentiating):
      max_pi = tf.reduce_max(z_pi, 1, keep_dims=True)
      z_pi = tf.sub(z_pi, max_pi)
      z_pi = tf.exp(z_pi)
      normalize_pi = tf.inv(tf.reduce_sum(z_pi, 1, keep_dims=True))
      z_pi = tf.mul(normalize_pi, z_pi)

      # exponentiate the sigmas and also make corr between -1 and 1.
      z_sigma1 = tf.exp(z_sigma1)
      z_sigma2 = tf.exp(z_sigma2)
      z_corr = tf.tanh(z_corr)

      return [z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr, z_eos]

    [o_pi, o_mu1, o_mu2, o_sigma1, o_sigma2, o_corr, o_eos] = get_mixture_coef(output)

    self.pi = o_pi
    self.mu1 = o_mu1
    self.mu2 = o_mu2
    self.sigma1 = o_sigma1
    self.sigma2 = o_sigma2
    self.corr = o_corr
    self.eos = o_eos

    lossfunc = get_lossfunc(o_pi, o_mu1, o_mu2, o_sigma1, o_sigma2, o_corr, o_eos, x1_data, x2_data, eos_data)
    self.cost = lossfunc / (args.batch_size * args.seq_length)

    # Non-trainable learning-rate variable, initialised to 0.0.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
Ejemplo n.º 20
0
    def __init__(self, args):
        """Build the graph for a multi-layer recurrent sequence model.

        Args:
            args: configuration object. This method reads ``rnn_size``,
                ``n_steps``, ``batch_size``, ``input_dim``, ``num_layers``,
                ``cell_type`` ("srnn", "lstm", "lstmp" or "cw") and
                ``rnn_type`` ("rnn" or "seq2seq").

        Raises:
            ValueError: if ``args.cell_type`` or ``args.rnn_type`` is not one
                of the supported values.
        """
        self.size = args.rnn_size
        self.n_steps = args.n_steps
        self.batch_size = args.batch_size
        self.input_dim = args.input_dim
        self.num_layers = args.num_layers

        # Only the "lstmp" cell below consumes this initializer.
        initializer = tf.random_uniform_initializer(-0.8, 0.8)

        # Time-major input sequence, provided at run time.
        self.seq_input = tf.placeholder(
            tf.float32, [self.n_steps, self.batch_size, self.input_dim])
        # Time step to stop at (handed to rnn.rnn as sequence_length).
        self.early_stop = tf.placeholder(tf.int32)

        # The RNN helpers need a list with one [batch_size, input_dim] tensor
        # per time step; tf.split keeps the split axis, so reshape it away.
        self.inputs = [
            tf.reshape(i, (self.batch_size, self.input_dim))
            for i in tf.split(0, self.n_steps, self.seq_input)
        ]
        self.result = tf.placeholder(tf.float32, [None, self.input_dim])

        if args.cell_type == "srnn":
            cell = BasicRNNCell(self.size)
        elif args.cell_type == "lstm":
            cell = BasicLSTMCell(self.size, forget_bias=1.0)
        elif args.cell_type == "lstmp":
            cell = LSTMCell(self.size, self.input_dim, initializer=initializer)
        elif args.cell_type == "cw":
            cell = CWRNNCell(self.size, [1, 4, 16, 64])
        else:
            # Previously an unknown type left `cell` unbound and failed later
            # with an unrelated NameError.
            raise ValueError(
                "cell type not supported: {}".format(args.cell_type))

        self.cell = cell = rnn_cell.MultiRNNCell([cell] * self.num_layers)

        # Start from a small random state rather than cell.zero_state(...).
        self.initial_state = tf.random_uniform(
            [self.batch_size, self.cell.state_size], -0.1, 0.1)

        # network type
        if args.rnn_type == "rnn":
            self.outputs, self.states = rnn.rnn(
                self.cell,
                self.inputs,
                initial_state=self.initial_state,
                sequence_length=self.early_stop)
        elif args.rnn_type == "seq2seq":
            # No feedback loop: the decoder is always fed the given inputs.
            self.outputs, self.states = seq2seq.rnn_decoder(
                self.inputs,
                self.initial_state,
                self.cell,
                loop_function=None)
        else:
            raise ValueError(
                "rnn type not supported: {}".format(args.rnn_type))
        self.final_state = self.states[-1]

        # Linear read-out producing one value per (example, time step).
        self.W_o = tf.Variable(tf.random_normal([self.size, 1], stddev=0.01))
        self.b_o = tf.Variable(tf.random_normal([1], stddev=0.01))

        # Single-argument print: same text under Python 2 and Python 3.
        print("type(outputs) {}".format(type(self.outputs)))
        self.output_cat = tf.reshape(tf.concat(1, self.outputs),
                                     [-1, self.size])
        self.output = tf.nn.xw_plus_b(self.output_cat, self.W_o, self.b_o)
        # NOTE(review): the read-out has a single column, so this reshape only
        # matches element counts when input_dim == 1 -- confirm with callers.
        self.output2 = tf.reshape(
            self.output, [self.batch_size, self.n_steps, self.input_dim])
        # Add Gaussian noise to the reshaped output.
        self.output2 = self.output2 + tf.random_normal(
            [self.batch_size, self.n_steps, self.input_dim], stddev=0.05)
        # Back to time-major layout: [n_steps, batch_size, input_dim].
        self.output2 = tf.transpose(self.output2, [1, 0, 2])
Ejemplo n.º 21
0
# Network parameters: output projection plus the embedding table, all
# living in the 'rnnlm' variable scope.
with tf.variable_scope('rnnlm'):
    softmax_w = tf.get_variable("softmax_w", [rnn_size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, rnn_size])
        embedded_input = tf.nn.embedding_lookup(embedding, input_data)
        # One tensor per time step, with the singleton step axis removed.
        inputs = []
        for step_slice in tf.split(1, seq_length, embedded_input):
            inputs.append(tf.squeeze(step_slice, [1]))


def loop(prev, _):
    """Feedback function for seq2seq: embed the argmax symbol of `prev`."""
    scores = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
    best_symbol = tf.stop_gradient(tf.argmax(scores, 1))
    return tf.nn.embedding_lookup(embedding, best_symbol)


# Run the decoder without feedback (loop_function=None) and flatten its
# per-step outputs into rows of rnn_size values.
outputs, last_state = seq2seq.rnn_decoder(inputs, initial_state, cell, loop_function=None, scope='rnnlm')
output = tf.reshape(tf.concat(1, outputs), [-1, rnn_size])
logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
# Softmax over the logits
probs = tf.nn.softmax(logits)
# Per-position loss against the flattened targets, every position weighted 1.
flat_targets = tf.reshape(targets, [-1])
unit_weights = tf.ones([batch_size * seq_length])
loss = seq2seq.sequence_loss_by_example([logits], [flat_targets], [unit_weights], vocab_size)
# Cost: total loss divided by batch_size and by seq_length.
cost = tf.reduce_sum(loss) / batch_size / seq_length
final_state = last_state
lr = tf.Variable(0.0, trainable=False)
tvars = tf.trainable_variables()
# Gradients of the cost, clipped by global norm.
raw_grads = tf.gradients(cost, tvars)
grads, _ = tf.clip_by_global_norm(raw_grads, grad_clip)
Ejemplo n.º 22
0
    def __init__(self, args, infer=False):
        """Build a (optionally bidirectional) RNN language-model graph.

        Args:
            args: configuration object. This method reads ``model`` ('rnn',
                'gru' or 'lstm'), ``activation`` ('tanh', 'sigmoid' or
                'relu'), ``bidirectional``, ``rnn_size``, ``num_layers``,
                ``vocab_size``, ``batch_size``, ``seq_length``, ``dropout``
                and ``grad_clip``. When ``infer`` is true its ``batch_size``
                and ``seq_length`` are overwritten with 1.
            infer: build the single-step graph; in the one-directional case
                the decoder then feeds its own argmax prediction back in.

        ``self.final_state`` is the decoder's last state in the
        one-directional case and a ``(forward, backward)`` pair of final
        states in the bidirectional case.

        Raises:
            Exception: if the model type or activation is not supported.
        """
        self.args = args
        if infer:
            args.batch_size = 1
            args.seq_length = 1

        if args.model == 'rnn':
            cell_fn = jzRNNCell
        elif args.model == 'gru':
            cell_fn = jzGRUCell
        elif args.model == 'lstm':
            cell_fn = jzLSTMCell
        else:
            raise Exception("model type not supported: {}".format(args.model))

        if args.activation == 'tanh':
            cell_af = tf.tanh
        elif args.activation == 'sigmoid':
            cell_af = tf.sigmoid
        elif args.activation == 'relu':
            cell_af = tf.nn.relu
        else:
            raise Exception("activation function not supported: {}".format(args.activation))

        self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
        self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])

        with tf.variable_scope('rnnlm'):
            # The bidirectional net concatenates forward and backward outputs,
            # so its projection takes twice as many input features.
            if not args.bidirectional:
                softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
            else:
                softmax_w = tf.get_variable("softmax_w", [args.rnn_size*2, args.vocab_size])
            softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
            with tf.device("/cpu:0"):
                embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
                inputs = tf.split(1, args.seq_length, tf.nn.embedding_lookup(embedding, self.input_data))
                # NOTE: args.dropout is passed as tf.nn.dropout's second
                # argument, i.e. it acts as the keep probability.
                inputs = [tf.nn.dropout(tf.squeeze(input_, [1]), args.dropout) for input_ in inputs]

        # one-directional RNN
        if not args.bidirectional:
            cell = cell_fn(args.rnn_size, activation=cell_af)
            self.cell = cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)
            self.initial_state = cell.zero_state(args.batch_size, tf.float32)

            def loop(prev, _):
                """Embed the argmax symbol predicted from the previous output."""
                prev = tf.matmul(prev, softmax_w) + softmax_b
                prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
                return tf.nn.embedding_lookup(embedding, prev_symbol)

            outputs, last_state = seq2seq.rnn_decoder(inputs, self.initial_state, cell, loop_function=loop if infer else None, scope='rnnlm')
            output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])

        # bi-directional RNN
        else:
            lstm_fw = cell_fn(args.rnn_size, activation=cell_af)
            lstm_bw = cell_fn(args.rnn_size, activation=cell_af)
            self.lstm_fw = lstm_fw = rnn_cell.MultiRNNCell([lstm_fw]*args.num_layers)
            self.lstm_bw = lstm_bw = rnn_cell.MultiRNNCell([lstm_bw]*args.num_layers)
            self.initial_state_fw = lstm_fw.zero_state(args.batch_size, tf.float32)
            self.initial_state_bw = lstm_bw.zero_state(args.batch_size, tf.float32)
            # sequence_length is left out on purpose: it expects one length
            # per example, and the scalar batch size passed here before was
            # not that. Every sequence is processed at its full seq_length,
            # matching the all-ones loss weights below.
            outputs, state_fw, state_bw = rnn.bidirectional_rnn(
                lstm_fw, lstm_bw, inputs,
                initial_state_fw=self.initial_state_fw,
                initial_state_bw=self.initial_state_bw)
            output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size*2])
            # Previously unbound in this branch, which made the final_state
            # assignment below raise.
            last_state = (state_fw, state_bw)

        self.logits = tf.matmul(tf.nn.dropout(output, args.dropout), softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)
        loss = seq2seq.sequence_loss_by_example([self.logits],
                [tf.reshape(self.targets, [-1])],
                [tf.ones([args.batch_size * args.seq_length])],
                args.vocab_size)
        self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
        self.final_state = last_state
        # Non-trainable learning-rate variable, initialised to 0.0.
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                args.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
Ejemplo n.º 23
0
    def __init__(self, args, predict=False):
        """Build a two-layer LSTM sequence model with hard-coded sizes.

        Args:
            args: configuration object; only ``args.vocabSize`` is read here.
            predict: when true the graph is built for batch size 1 and a
                single unrolled step, and the decoder feeds its own argmax
                prediction back in as the next input.
        """
        self.args = args

        # Various parameters for the LSTM.
        # Hardcoded here for now.
        numSteps = 50 # Steps to unroll for
        batchSize = 50
        rnnSize = 128
        numLayers = 2
        gradClip = 5
        # The former locals learningRate = 0.002 and decayRate = 0.97 were
        # never used in this method and have been dropped; the rate lives in
        # self.learningRate below.

        # Prediction works one symbol at a time. These overrides must come
        # AFTER the defaults above; they used to be set first and were then
        # silently overwritten.
        if predict:
            batchSize = 1
            numSteps = 1

        #Create LSTM layer and stack multiple layers.
        lstmCell = rnn_cell.BasicLSTMCell(rnnSize)
        lstmNet = rnn_cell.MultiRNNCell([lstmCell] * numLayers)

        #Define placeholders.
        self.inputData = tf.placeholder(tf.int32, [batchSize, numSteps])
        self.targetOutput = tf.placeholder(tf.int32, [batchSize, numSteps])
        self.initialState = lstmNet.zero_state(batchSize, tf.float32)

        # If rnn_decoder is told to loop, this function will return to it the output at time
        # 't' for feeding as the input at time 't+1'. During training, this is generally
        # not done because we want to feed the *correct* input at all times and not what
        # is output. During prediction/testing, we loop the output back to the input to
        # generate our sequence of notes.
        def feedBack(prev, _):
            """Embed the argmax symbol predicted from the previous output."""
            # softmax_w, softmax_b and embedding are bound further down; the
            # closure only resolves them when the decoder calls it.
            prev = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(embedding, prev_symbol)

        with tf.variable_scope('nn_lstm'):
            softmax_w = tf.get_variable("softmax_w", [rnnSize, args.vocabSize])
            softmax_b = tf.get_variable("softmax_b", [args.vocabSize])
            with tf.device("/cpu:0"):
                embedding = tf.get_variable("embedding", [args.vocabSize, rnnSize])
                inputs = tf.split(1, numSteps, tf.nn.embedding_lookup(embedding, self.inputData))
                inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

        #Call seq2seq rnn decoder.
        outputs, states = seq2seq.rnn_decoder(inputs, self.initialState, lstmNet, loop_function=feedBack if predict else None, scope='nn_lstm')
        output = tf.reshape(tf.concat(1, outputs), [-1, rnnSize])

        #Logit and probability
        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)

        # Calculate loss compared to targetOutput
        loss = seq2seq.sequence_loss_by_example([self.logits],
                [tf.reshape(self.targetOutput, [-1])],
                [tf.ones([batchSize * numSteps])],
                args.vocabSize)

        # Set the cost to minimize total loss.
        self.cost = tf.reduce_sum(loss)

        self.finalState = states[-1]
        # The learning rate is a non-trainable variable initialised to 0.0,
        # so the optimizer never changes it itself.
        self.learningRate = tf.Variable(0.0, trainable=False)

        # Define gradient and trainable variables for adjusting
        # during training/optimization.
        trainableVars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, trainableVars),
                gradClip)

        # We use the Adam optimizer.
        optimizer = tf.train.AdamOptimizer(self.learningRate)
        self.trainStep = optimizer.apply_gradients(zip(grads, trainableVars))
Ejemplo n.º 24
0
    def build_model(self, inputs, infer):
        """Build the encoder-decoder graph and return its cost tensor.

        Args:
            inputs: 4-tuple ``(x_in, lx_in, y_in, my_in)``. ``x_in`` is
                embedded and fed to the encoder, ``lx_in`` is passed to the
                encoder as ``sequence_length``, ``y_in`` supplies the decoder
                inputs and the loss targets, and ``my_in`` supplies the loss
                weights.
            infer: when ``False`` the decoder input is ``y_in`` shifted right
                by one step behind a column of zeros; otherwise ``y_in`` is
                embedded directly as a single decoding step.

        Returns:
            The summed sequence loss divided by ``self.batch_size``.

        Raises:
            Exception: if ``self.rnn_type`` is not 'rnn', 'gru' or 'lstm'.
        """
        x_in, lx_in, y_in, my_in = inputs

        if self.rnn_type == 'rnn':
            cell_fn = rnn_cell.BasicRNNCell
        elif self.rnn_type == 'gru':
            cell_fn = rnn_cell.GRUCell
        elif self.rnn_type == 'lstm':
            cell_fn = rnn_cell.BasicLSTMCell
        else:
            # The message used to format the undefined local `rnn_type`,
            # which raised NameError instead of this error.
            raise Exception('rnn type not supported: {}'.format(self.rnn_type))

        cell_enc = cell_fn(self.num_units)
        cell_dec = cell_fn(self.num_units)

        # A single embedding table is shared by encoder and decoder.
        embedding = tf.get_variable('embedding',
                                    [self.vocab_size, self.dim_emb])

        # encoding
        enc_in = tf.nn.embedding_lookup(embedding, x_in)
        enc_in = tf.split(1, self.seq_len, enc_in)
        enc_in = [tf.squeeze(input_, [1]) for input_ in enc_in]
        # enc_in is seq_len * [batch_size, embedding_size]
        print("enc_in size:", len(enc_in))
        print("enc_in[0] shape:", enc_in[0].get_shape())

        _, initial_state = rnn.rnn(cell_enc,
                                   enc_in,
                                   sequence_length=lx_in,
                                   dtype='float32',
                                   scope='encoder')
        # Multiplying by 1.0 only serves to give the tensor a stable name.
        self.initial_state = tf.mul(1.0, initial_state, name='initial_state')

        # decoding
        if infer == False:
            # Shift the targets right by one step, with id 0 as first input.
            dec_in = tf.nn.embedding_lookup(
                embedding,
                tf.concat(1, [
                    tf.zeros([self.batch_size, 1], dtype='int32'),
                    y_in[:, :self.seq_len - 1]
                ]))
            dec_in = tf.split(1, self.seq_len, dec_in)
            dec_in = [tf.squeeze(input_, [1]) for input_ in dec_in]
        else:
            dec_in = tf.nn.embedding_lookup(embedding, y_in)
            dec_in = tf.split(1, 1, dec_in)
            dec_in = [tf.squeeze(input_, [1]) for input_ in dec_in]

        # dec_in is seq_len * [batch_size, embedding_size], e.g. 50 * [32, 300]
        print("dec_in size:", len(dec_in))
        print("dec_in[0] shape:", dec_in[0].get_shape())

        # output is seq_len * [batch_size, num_units]
        output, last_state = seq2seq.rnn_decoder(dec_in,
                                                 self.initial_state,
                                                 cell_dec,
                                                 scope='decoder')
        print("output[0] shape:", output[0].get_shape())
        print("last_state shape:", last_state.get_shape())

        # output shape [batch_size*seq_len, num_units], e.g. [32*50, 512]
        output = tf.reshape(tf.concat(1, output), [-1, self.num_units])
        self.last_state = tf.mul(1.0, last_state, name='last_state')

        # get loss
        softmax_w = tf.get_variable('softmax_w',
                                    [self.num_units, self.vocab_size])
        softmax_b = tf.get_variable('softmax_b', [self.vocab_size])

        logits = tf.matmul(output, softmax_w) + softmax_b
        noname_probs = tf.nn.softmax(logits)
        self.probs = tf.mul(1.0, noname_probs, name='probs')
        self.log_probs = tf.log(self.probs, name='log_probs')

        loss = tf.nn.seq2seq.sequence_loss_by_example(
            [logits], [tf.reshape(y_in, [-1])], [tf.reshape(my_in, [-1])])

        # Single-argument print: same text under Python 2 and Python 3.
        print("loss shape: {}".format(loss.get_shape()))
        self.cost = cost = tf.reduce_sum(loss) / tf.to_float(self.batch_size)
        self.loss = cost

        return cost
Ejemplo n.º 25
0
    def create_model(self):
        """Build the RNN language-model graph together with its training op.

        Creates the input/target placeholders, a stack of LSTM cells, the
        embedding lookup, the softmax projection, the averaged per-token loss
        and an Adam update with globally clipped gradients.  Everything a
        caller needs is stored on ``self``: ``input_data``, ``target_data``,
        ``keep_prob``, ``lr``, ``initial_state``, ``embedding``, ``logits``,
        ``probs``, ``cost``, ``final_state`` and ``train_op``.
        """
        batch_size = self.batch_size
        seq_length = self.seq_length

        self.input_data = tf.placeholder(
            tf.int32, [batch_size, seq_length], name="input_data")
        self.target_data = tf.placeholder(
            tf.int32, [batch_size, seq_length], name="target_data")

        # Hyper-parameters are kept in the graph as non-trainable variables.
        self.keep_prob = tf.Variable(0.3, trainable=False, name="keep_prob")
        self.lr = tf.Variable(0.0, trainable=False, name="lr")

        # Projection from RNN outputs to vocabulary scores.
        proj_w = tf.get_variable("softmax_weights", [self.rnn_size, self.vocab_size])
        proj_b = tf.get_variable("softmax_biases", [self.vocab_size])

        # One LSTM cell object, repeated num_layers times in the stack.
        base_cell = rnn_cell.BasicLSTMCell(self.rnn_size)
        stacked_cell = rnn_cell.MultiRNNCell([base_cell] * self.num_layers)
        self.initial_state = stacked_cell.zero_state(batch_size, tf.float32)

        with tf.device("/cpu:0"):
            # Embedding matrix for the whole vocabulary, then the vectors for
            # the ids fed through input_data.
            self.embedding = tf.get_variable("embeddings", [self.vocab_size, self.rnn_size])
            embedded = tf.nn.embedding_lookup(self.embedding, self.input_data)

        # NOTE(review): keep_prob is a tf.Variable, so `< 1` is evaluated on a
        # graph object rather than a Python number -- confirm this condition
        # behaves as intended on the TensorFlow version in use.
        if self.is_training and self.keep_prob < 1:
            embedded = tf.nn.dropout(embedded, self.keep_prob)

        def feed_previous(prev, _):
            # Turn the previous output into the next input: project to
            # vocabulary scores, pick the argmax (no gradient) and embed it.
            scores = tf.nn.xw_plus_b(prev, proj_w, proj_b)
            best_symbol = tf.stop_gradient(tf.argmax(scores, 1))
            return tf.nn.embedding_lookup(self.embedding, best_symbol)

        # Split along axis 1 into seq_length pieces and drop that axis from
        # each piece so the decoder receives a list of 2-D tensors.
        step_inputs = []
        for piece in tf.split(1, seq_length, embedded):
            step_inputs.append(tf.squeeze(piece, [1]))

        # Previous-output feedback is only used when self.infer is truthy.
        if self.infer:
            loop_fn = feed_previous
        else:
            loop_fn = None
        step_outputs, states = seq2seq.rnn_decoder(
            step_inputs, self.initial_state, stacked_cell,
            loop_function=loop_fn, scope="rnnlm")

        flat_output = tf.reshape(tf.concat(1, step_outputs), [-1, self.rnn_size])

        self.logits = tf.nn.xw_plus_b(flat_output, proj_w, proj_b)
        self.probs = tf.nn.softmax(self.logits, name="probability")

        # Every position gets weight 1; the summed loss is divided by the
        # number of positions in a batch.
        flat_targets = tf.reshape(self.target_data, [-1])
        uniform_weights = tf.ones([batch_size * seq_length])
        token_loss = seq2seq.sequence_loss_by_example(
            [self.logits], [flat_targets], [uniform_weights], self.vocab_size)
        self.cost = tf.reduce_sum(token_loss) / (batch_size * seq_length)

        self.final_state = states[-1]

        trainable = tf.trainable_variables()
        raw_grads = tf.gradients(self.cost, trainable)
        clipped_grads, _ = tf.clip_by_global_norm(raw_grads, self.grad_clip)

        # NOTE(review): the optimizer uses a fixed 0.01 step size; self.lr is
        # created above but is not passed to it here.
        self.train_op = tf.train.AdamOptimizer(0.01).apply_gradients(
            zip(clipped_grads, trainable))
Ejemplo n.º 26
0
    def __init__(self, args, infer=False):
        """Build an RNN whose output parameterizes a Gaussian mixture.

        For each time step the network emits ``num_mixture`` mixture weights
        plus a mean and a log-scale per mixture component for every one of
        the ``self.dim`` (= 1) data dimensions; the cost is the negative log
        of the mixture density of the targets, averaged over
        ``batch_size * seq_length * dim``.

        Args:
            args: configuration object.  This constructor reads ``model``,
                ``rnn_size``, ``num_layers``, ``keep_prob``, ``batch_size``,
                ``seq_length``, ``num_mixture`` and ``grad_clip``.
            infer: when true, ``args.batch_size`` and ``args.seq_length`` are
                overwritten with 1 on the passed object.  The dropout wrapper
                is only added when ``infer == False``.

        Raises:
            Exception: if ``args.model`` is not 'rnn', 'gru' or 'lstm'.
        """
        self.dim = 1
        self.args = args
        if infer:
            args.batch_size = 1
            args.seq_length = 1

        # Map the model name to the cell class name; the class itself is only
        # looked up for the selected entry.
        cell_class_names = {
            'rnn': 'BasicRNNCell',
            'gru': 'GRUCell',
            'lstm': 'BasicLSTMCell',
        }
        if args.model not in cell_class_names:
            raise Exception("model type not supported: {}".format(args.model))
        cell_fn = getattr(rnn_cell, cell_class_names[args.model])

        single_cell = cell_fn(args.rnn_size)
        cell = rnn_cell.MultiRNNCell([single_cell] * args.num_layers)

        # Training mode only: wrap the whole stack with output dropout.
        if infer == False and args.keep_prob < 1:
            cell = rnn_cell.DropoutWrapper(cell, output_keep_prob=args.keep_prob)

        self.cell = cell

        sequence_shape = [None, args.seq_length, self.dim]
        self.input_data = tf.placeholder(dtype=tf.float32, shape=sequence_shape)
        self.target_data = tf.placeholder(dtype=tf.float32, shape=sequence_shape)
        self.initial_state = cell.zero_state(batch_size=args.batch_size,
                                             dtype=tf.float32)

        self.num_mixture = args.num_mixture
        # Output layout: [mixture weights, then (mu, sig) columns per dim].
        NOUT = self.num_mixture * (1 + 2 * self.dim)

        with tf.variable_scope('rnnlm'):
            output_w = tf.get_variable("output_w", [args.rnn_size, NOUT])
            output_b = tf.get_variable("output_b", [NOUT])

        self.w = output_w

        # One 2-D tensor per time step for the decoder.
        step_inputs = []
        for piece in tf.split(1, args.seq_length, self.input_data):
            step_inputs.append(tf.squeeze(piece, [1]))

        outputs, states = seq2seq.rnn_decoder(step_inputs,
                                              self.initial_state,
                                              cell,
                                              loop_function=None,
                                              scope='rnnlm')
        flat_rnn_out = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
        output = tf.nn.xw_plus_b(flat_rnn_out, output_w, output_b)
        self.final_state = states

        # Flatten the targets so rows line up with the rows of `output`.
        x_data = tf.reshape(self.target_data, [-1, self.dim])

        def gaussian_density(x, mu, sig):
            # Normal pdf, built in the same op order as a single expression.
            neg_sq_dist = -tf.square(x - mu)
            two_var = 2 * tf.square(sig)
            unnormalized = tf.exp(neg_sq_dist / two_var)
            normalizer = sig * tf.sqrt(2 * np.pi)
            return unnormalized / normalizer

        def mixture_nll(z_pi, z_mu, z_sig, targets):
            # Negative log of the weighted sum of component densities,
            # floored at 1e-20 before the log, summed over all rows.
            densities = gaussian_density(targets, z_mu, z_sig)
            mixed = tf.reduce_sum(densities * z_pi, 1, keep_dims=True)
            neg_log = -tf.log(tf.maximum(mixed, 1e-20))
            return tf.reduce_sum(neg_log)

        # Softmax over the first num_mixture columns, with the row maximum
        # subtracted before exponentiation.
        pi_logits = output[:, 0:self.num_mixture]
        row_max = tf.reduce_max(pi_logits, 1, keep_dims=True)
        pi_exp = tf.exp(tf.sub(pi_logits, row_max))
        pi_scale = tf.inv(tf.reduce_sum(pi_exp, 1, keep_dims=True))
        self.pi = pi_scale * pi_exp

        output_each_dim = tf.split(1, self.dim, output[:, self.num_mixture:])

        mus = []
        sigs = []
        cost = 0

        for i, dim_params in enumerate(output_each_dim):
            [o_mu, raw_sig] = tf.split(1, 2, dim_params)
            # exp keeps the scale strictly positive.
            o_sig = tf.exp(raw_sig)

            mus.append(o_mu)
            sigs.append(o_sig)

            dim_loss = mixture_nll(self.pi, o_mu, o_sig, x_data[:, i:i + 1])
            cost += dim_loss / (args.batch_size * args.seq_length *
                                self.dim)

        self.cost = cost
        self.mu = tf.concat(1, mus)
        self.sig = tf.concat(1, sigs)

        self.loss_summary = tf.scalar_summary("loss", self.cost)
        self.summary = tf.merge_all_summaries()

        # Learning rate lives in the graph; it starts at 0.0 here.
        self.lr = tf.Variable(0.0, trainable=False)
        trainable = tf.trainable_variables()
        raw_grads = tf.gradients(self.cost, trainable)
        clipped_grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
        self.train_op = tf.train.AdamOptimizer(self.lr).apply_gradients(
            zip(clipped_grads, trainable))
Ejemplo n.º 27
0
    def __init__(self, args, infer=False):
        """Build an RNN classifier over token sequences.

        The per-step RNN outputs are averaged, projected to ``vocab_size``
        scores and trained against one integer target per sequence.  The
        constructor also defines an accuracy op and scalar summaries for
        cost and accuracy.

        Args:
            args: configuration object.  This constructor reads ``model``,
                ``rnn_size``, ``num_layers``, ``batch_size``, ``seq_length``,
                ``vocab_size`` and ``grad_clip``.
            infer: when truthy, a loop function is handed to the decoder;
                that function currently returns None (see the TODO below).

        Raises:
            Exception: if ``args.model`` is not 'rnn', 'gru' or 'lstm'.
        """
        self.args = args

        # Map the model name to the cell class name; the class itself is only
        # looked up for the selected entry.
        cell_class_names = {
            'rnn': 'BasicRNNCell',
            'gru': 'GRUCell',
            'lstm': 'BasicLSTMCell',
        }
        if args.model not in cell_class_names:
            raise Exception("model type not supported: {}".format(args.model))
        cell_fn = getattr(rnn_cell, cell_class_names[args.model])

        single_cell = cell_fn(args.rnn_size)
        cell = rnn_cell.MultiRNNCell([single_cell] * args.num_layers)
        self.cell = cell

        self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
        # One target id per sequence, not per time step.
        self.targets = tf.placeholder(tf.int32, [args.batch_size])
        self.initial_state = cell.zero_state(args.batch_size, tf.float32)

        with tf.variable_scope('rnnlm'):
            softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
            softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
            with tf.device("/cpu:0"):
                embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
                embedded = tf.nn.embedding_lookup(embedding, self.input_data)
                # seq_length tensors, each with the split axis squeezed away.
                inputs = []
                for piece in tf.split(1, args.seq_length, embedded):
                    inputs.append(tf.squeeze(piece, [1]))

        def loop(prev, _):
            # TODO: feeding the previous prediction back is not implemented;
            # this deliberately returns None.  The intended body projects
            # `prev` with softmax_w/softmax_b, takes the argmax with gradients
            # stopped, and returns the embedding of that symbol.
            return None

        if infer:
            loop_fn = loop
        else:
            loop_fn = None
        # `outputs` is a list with one tensor per time step.
        outputs, states = seq2seq.rnn_decoder(inputs, self.initial_state, cell,
                                              loop_function=loop_fn, scope='rnnlm')

        def pool_outputs(use_lastone=True):
            """Return either the last step's output or the mean over steps."""
            if use_lastone:
                return outputs[-1]
            summed = tf.add_n(outputs)
            return tf.div(summed, len(outputs))

        # The mean over all steps is what feeds the classifier.
        pooled = pool_outputs(use_lastone=False)

        self.logits = tf.nn.xw_plus_b(pooled, softmax_w, softmax_b)
        self.probs = tf.nn.softmax(self.logits)

        flat_targets = tf.reshape(self.targets, [-1])
        unit_weights = tf.ones([args.batch_size])
        loss = seq2seq.sequence_loss_by_example(
            [self.logits], [flat_targets], [unit_weights], args.vocab_size)
        self.cost = tf.reduce_sum(loss) / args.batch_size
        tf.scalar_summary('cost', self.cost)

        # Accuracy: fraction of sequences whose argmax equals the target.
        predicted = tf.cast(tf.argmax(self.logits, 1), tf.int32)
        hits = tf.equal(predicted, tf.reshape(self.targets, [-1]))
        self.accuracy = tf.reduce_mean(tf.cast(hits, tf.float32))
        tf.scalar_summary('accuracy', self.accuracy)

        self.final_state = states
        # Learning rate lives in the graph; it starts at 0.0 here.
        self.lr = tf.Variable(0.0, trainable=False)
        trainable = tf.trainable_variables()
        raw_grads = tf.gradients(self.cost, trainable)
        clipped_grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
        self.train_op = tf.train.AdamOptimizer(self.lr).apply_gradients(
            zip(clipped_grads, trainable))
Ejemplo n.º 28
0
def main():
    """Train an RNN language model and save its normalized word embeddings.

    Parses command-line options, loads batches through ``TextLoader2``,
    builds a single-cell RNN/GRU/LSTM language model in its own graph,
    trains it with Adagrad (fixed step size 0.1) and, after every epoch,
    writes the L2-normalized embedding matrix with
    ``np.save('rnnlm_word_embeddings', ...)``.

    Raises:
        Exception: if ``--model`` is not one of 'rnn', 'gru' or 'lstm'.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir',
                        type=str,
                        default='../data/xinhua',
                        help='data directory containing input.txt')
    parser.add_argument('--batch_size',
                        type=int,
                        default=120,
                        help='minibatch size')
    parser.add_argument('--seq_length',
                        type=int,
                        default=5,
                        help='RNN sequence length')
    # hidden_num is passed to the cell constructor as its size, so the help
    # text describes units rather than layers.
    parser.add_argument('--hidden_num',
                        type=int,
                        default=256,
                        help='number of hidden units in the RNN cell')
    parser.add_argument('--word_dim',
                        type=int,
                        default=256,
                        help='dimensionality of the word embeddings')
    parser.add_argument('--num_epochs',
                        type=int,
                        default=50,
                        help='number of epochs')
    parser.add_argument('--model',
                        type=str,
                        default='lstm',
                        help='rnn, gru, or lstm')
    parser.add_argument('--grad_clip',
                        type=float,
                        default=10.,
                        help='clip gradients at this value')

    args = parser.parse_args()  # parsed option namespace

    # Prepare the training data.
    data_loader = TextLoader2(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    # Model definition.
    graph = tf.Graph()
    with graph.as_default():

        if args.model == 'rnn':
            cell_fn = rnn_cell.BasicRNNCell
        elif args.model == 'gru':
            cell_fn = rnn_cell.GRUCell
        elif args.model == 'lstm':
            cell_fn = rnn_cell.BasicLSTMCell
        else:
            raise Exception("model type not supported: {}".format(args.model))

        cell = cell_fn(args.hidden_num)

        # Input placeholders.
        input_data = tf.placeholder(tf.int32,
                                    [args.batch_size, args.seq_length])
        targets = tf.placeholder(tf.int64, [args.batch_size, args.seq_length])

        initial_state = cell.zero_state(args.batch_size, tf.float32)
        # Model parameters.
        with tf.variable_scope('rnnlm' + 'embedding'):
            embeddings = tf.Variable(
                tf.random_uniform([args.vocab_size, args.word_dim], -1.0, 1.0))
            # From here on `embeddings` names the row-normalized tensor, not
            # the raw variable.
            embeddings = tf.nn.l2_normalize(embeddings, 1)

        with tf.variable_scope('rnnlm' + 'weight'):
            softmax_w = tf.get_variable("softmax_w",
                                        [args.hidden_num, args.vocab_size])
            softmax_b = tf.get_variable("softmax_b", [args.vocab_size])

        # One 2-D tensor per time step for the decoder.
        inputs = tf.split(1, args.seq_length,
                          tf.nn.embedding_lookup(embeddings, input_data))
        inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

        outputs, last_state = seq2seq.rnn_decoder(inputs, initial_state, cell)
        output = tf.reshape(tf.concat(1, outputs), [-1, args.hidden_num])
        logits = tf.matmul(output, softmax_w) + softmax_b
        # probs, final_state and lr are defined but not used further in this
        # function; they are kept so the graph stays unchanged.
        probs = tf.nn.softmax(logits)
        loss_rnn = seq2seq.sequence_loss_by_example(
            [logits], [tf.reshape(targets, [-1])],
            [tf.ones([args.batch_size * args.seq_length])], args.vocab_size)
        cost = tf.reduce_sum(loss_rnn) / args.batch_size / args.seq_length
        final_state = last_state
        lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                          args.grad_clip)
        optimizer = tf.train.AdagradOptimizer(0.1)
        train_op = optimizer.apply_gradients(zip(grads, tvars))

        # Word vectors to export: embeddings divided by their row norms.
        embeddings_norm = tf.sqrt(
            tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / embeddings_norm

    # Model training.
    with tf.Session(graph=graph) as sess:
        tf.initialize_all_variables().run()
        for e in range(args.num_epochs):
            data_loader.reset_batch_pointer()
            for b in range(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {input_data: x, targets: y}
                train_loss, _ = sess.run([cost, train_op], feed)
                end = time.time()
                print(
                    "{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                    .format(b, data_loader.num_batches, e, train_loss,
                            end - start))
            # Saved once per epoch, overwriting the previous file.
            np.save('rnnlm_word_embeddings', normalized_embeddings.eval())
Ejemplo n.º 29
0
    def __init__(
        self,
        vocab,
        tagset,
        alphabet,
        word_embedding_size,
        char_embedding_size,
        num_chars,
        num_steps,
        optimizer_desc,
        generate_lemmas,
        l2,
        dropout_prob_values,
        experiment_name,
        supply_form_characters_to_lemma,
        threads=0,
        seed=None,
        write_summaries=True,
        use_attention=True,
        scheduled_sampling=None,
    ):
        """
        Builds the tagger computation graph and initializes it in a TensorFlow
        session.

        Arguments:

            vocab: Vocabulary of word forms.

            tagset: Vocabulary of possible tags.

            alphabet: Vocabulary of possible characters.

            word_embedding_size (int): Size of the form-based word embedding.

            char_embedding_size (int): Size of character embeddings, i.e. a
                half of the size of the character-based words embeddings.

            num_chars: Maximum length of a word.

            num_steps: Maximum length of a sentence.

            optimizer_desc: Description of the optimizer. The string is
                appended to "tf.train." and evaluated with eval(), so it must
                be a trusted expression.

            generate_lemmas: Generate lemmas during tagging. When true, the
                character-level lemma decoder is added to the graph and its
                losses are added to the cost.

            l2: Coefficient multiplying the summed L2 loss of the matrices
                collected in the local `regularize` list.

            dropout_prob_values: Stored on the instance as
                `self.dropout_prob_values`; not otherwise used in this
                constructor.

            experiment_name: Suffix of the summary log directory name.

            supply_form_characters_to_lemma: When true, each lemma decoder
                input is the concatenation of a lemma character embedding and
                a form character embedding.

            threads: When greater than zero, used as both the inter-op and
                the intra-op parallelism thread count of the session.

            seed: TensorFlow seed. NOTE(review): not referenced anywhere in
                this constructor body.

            write_summaries: Write summaries using TensorFlow interface.

            use_attention: Use `seq2seq.attention_decoder` for the lemma
                decoder instead of `seq2seq.rnn_decoder`.

            scheduled_sampling: When truthy, the training decoder uses
                `sampling_loop` as its loop function and this value enters
                the sampling threshold formula.
        """

        self.num_steps = num_steps
        self.num_chars = num_chars

        self.word_embedding_size = word_embedding_size
        self.char_embedding_size = char_embedding_size
        # Sentence-level LSTM width: word embedding plus the forward and
        # backward character-LSTM states.
        self.lstm_size = word_embedding_size + 2 * char_embedding_size  ###

        self.vocab = vocab
        self.tagset = tagset
        self.alphabet = alphabet

        self.dropout_prob_values = dropout_prob_values

        # Initial LSTM states are fed from outside; a throwaway cell is built
        # only to read its state_size.
        self.forward_initial_state = tf.placeholder(
            tf.float32, [None, rnn_cell.BasicLSTMCell(self.lstm_size).state_size], name="forward_lstm_initial_state"
        )
        self.backward_initial_state = tf.placeholder(
            tf.float32, [None, rnn_cell.BasicLSTMCell(self.lstm_size).state_size], name="backward_lstm_initial_state"
        )
        self.sentence_lengths = tf.placeholder(tf.int64, [None], name="sentence_lengths")
        self.tags = tf.placeholder(tf.int32, [None, num_steps], name="ground_truth_tags")
        # Vector of keep probabilities: index 0 is applied to the inputs,
        # index 1 to the bidirectional outputs (see the tf.nn.dropout calls).
        self.dropout_prob = tf.placeholder(tf.float32, [None], name="dropout_keep_p")
        self.generate_lemmas = generate_lemmas

        global_step = tf.Variable(0, trainable=False)

        # input_list collects the per-word feature tensors; regularize
        # collects the matrices that receive the L2 penalty.
        input_list = []
        regularize = []

        # Word-level embeddings
        if word_embedding_size:
            self.words = tf.placeholder(tf.int32, [None, num_steps], name="words")
            word_embeddings = tf.Variable(tf.random_uniform([len(vocab), word_embedding_size], -1.0, 1.0))
            we_lookup = tf.nn.embedding_lookup(word_embeddings, self.words)

            input_list.append(we_lookup)

        # Character-level embeddings
        if char_embedding_size:
            self.chars = tf.placeholder(tf.int32, [None, num_steps, num_chars], name="chars")
            self.chars_lengths = tf.placeholder(tf.int64, [None, num_steps], name="chars_lengths")

            char_embeddings = tf.Variable(tf.random_uniform([len(alphabet), char_embedding_size], -1.0, 1.0))
            ce_lookup = tf.nn.embedding_lookup(char_embeddings, self.chars)

            # Flatten sentences so that every word becomes one row of
            # num_chars character embeddings, then split per character.
            reshaped_ce_lookup = tf.reshape(ce_lookup, [-1, num_chars, char_embedding_size], name="reshape-char_inputs")
            char_inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(1, num_chars, reshaped_ce_lookup)]

            char_inputs_lengths = tf.reshape(self.chars_lengths, [-1])

            with tf.variable_scope("char_forward"):
                char_lstm = rnn_cell.BasicLSTMCell(char_embedding_size)
                _, char_last_state = rnn.rnn(
                    cell=char_lstm, inputs=char_inputs, sequence_length=char_inputs_lengths, dtype=tf.float32
                )
                # Switch the scope to reuse mode so the LSTM weight matrix
                # created above can be fetched by name for regularization.
                tf.get_variable_scope().reuse_variables()
                regularize.append(tf.get_variable("RNN/BasicLSTMCell/Linear/Matrix"))

            with tf.variable_scope("char_backward"):
                char_lstm_rev = rnn_cell.BasicLSTMCell(char_embedding_size)
                _, char_last_state_rev = rnn.rnn(
                    cell=char_lstm_rev,
                    inputs=self._reverse_seq(char_inputs, char_inputs_lengths),
                    sequence_length=char_inputs_lengths,
                    dtype=tf.float32,
                )
                tf.get_variable_scope().reuse_variables()
                regularize.append(tf.get_variable("RNN/BasicLSTMCell/Linear/Matrix"))

            # Keep the second half of each final LSTM state.
            # presumably the state is laid out as [c, h] so this is h -- TODO confirm
            last_char_lstm_state = tf.split(1, 2, char_last_state)[1]
            last_char_lstm_state_rev = tf.split(1, 2, char_last_state_rev)[1]

            # Restore the [*, num_steps, char_embedding_size] sentence layout.
            last_char_states = tf.reshape(
                last_char_lstm_state, [-1, num_steps, char_embedding_size], name="reshape-charstates"
            )
            last_char_states_rev = tf.reshape(
                last_char_lstm_state_rev, [-1, num_steps, char_embedding_size], name="reshape-charstates_rev"
            )

            char_output = tf.concat(2, [last_char_states, last_char_states_rev])

            input_list.append(char_output)

        # All inputs correctly sliced
        input_list_dropped = [tf.nn.dropout(x, self.dropout_prob[0]) for x in input_list]
        inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(1, num_steps, tf.concat(2, input_list_dropped))]

        with tf.variable_scope("forward"):
            lstm = rnn_cell.BasicLSTMCell(self.lstm_size)
            outputs, last_state = rnn.rnn(
                cell=lstm,
                inputs=inputs,
                dtype=tf.float32,
                initial_state=self.forward_initial_state,
                sequence_length=self.sentence_lengths,
            )

            tf.get_variable_scope().reuse_variables()
            regularize.append(tf.get_variable("RNN/BasicLSTMCell/Linear/Matrix"))

        with tf.variable_scope("backward"):
            lstm_rev = rnn_cell.BasicLSTMCell(self.lstm_size)
            outputs_rev_rev, last_state_rev = rnn.rnn(
                cell=lstm_rev,
                inputs=self._reverse_seq(inputs, self.sentence_lengths),
                dtype=tf.float32,
                initial_state=self.backward_initial_state,
                sequence_length=self.sentence_lengths,
            )

            outputs_rev = self._reverse_seq(outputs_rev_rev, self.sentence_lengths)

            tf.get_variable_scope().reuse_variables()
            regularize.append(tf.get_variable("RNN/BasicLSTMCell/Linear/Matrix"))

        # outputs_forward = tf.reshape(tf.concat(1, outputs), [-1, self.lstm_size],
        #                    name="reshape-outputs_forward")

        # outputs_backward = tf.reshape(tf.concat(1, outputs_rev), [-1, self.lstm_size],
        #                    name="reshape-outputs_backward")

        # forward_w = tf.get_variable("forward_w", [self.lstm_size, self.lstm_size])
        # backward_w = tf.get_variable("backward_w", [self.lstm_size, self.lstm_size])
        # non_linearity_bias = tf.get_variable("non_linearity_b", [self.lstm_size])

        # NOTE(review): outputs_rev has already been passed through
        # _reverse_seq above; confirm that the additional reversed() here is
        # intended.
        outputs_bidi = [tf.concat(1, [o1, o2]) for o1, o2 in zip(outputs, reversed(outputs_rev))]

        # output = tf.tanh(tf.matmul(outputs_forward, forward_w) + tf.matmul(outputs_backward, backward_w) + non_linearity_bias)
        output = tf.reshape(tf.concat(1, outputs_bidi), [-1, 2 * self.lstm_size], name="reshape-outputs_bidi")
        output_dropped = tf.nn.dropout(output, self.dropout_prob[1])

        # We are computing only the logits, not the actual softmax -- while
        # computing the loss, it is done by the sequence_loss_by_example and
        # during the runtime classification, the argmax over logits is enough.

        softmax_w = tf.get_variable("softmax_w", [2 * self.lstm_size, len(tagset)])
        logits_flatten = tf.nn.xw_plus_b(output_dropped, softmax_w, tf.get_variable("softmax_b", [len(tagset)]))
        # tf.get_variable_scope().reuse_variables()
        regularize.append(softmax_w)

        self.logits = tf.reshape(logits_flatten, [-1, num_steps, len(tagset)], name="reshape-logits")
        estimated_tags_flat = tf.to_int32(tf.argmax(logits_flatten, dimension=1))
        self.last_state = last_state

        # output mask: compute loss only if it isn't a padded word (i.e. zero index)
        output_mask = tf.reshape(tf.to_float(tf.not_equal(self.tags, 0)), [-1])

        gt_tags_flat = tf.reshape(self.tags, [-1])
        tagging_loss = seq2seq.sequence_loss_by_example(
            logits=[logits_flatten], targets=[gt_tags_flat], weights=[output_mask]
        )

        # Accuracy over non-padded positions only.
        tagging_accuracy = tf.reduce_sum(
            tf.to_float(tf.equal(estimated_tags_flat, gt_tags_flat)) * output_mask
        ) / tf.reduce_sum(output_mask)
        # Each metric is registered twice, once in the "train" and once in
        # the "dev" summary collection.
        tf.scalar_summary("train_accuracy", tagging_accuracy, collections=["train"])
        tf.scalar_summary("dev_accuracy", tagging_accuracy, collections=["dev"])

        self.cost = tf.reduce_mean(tagging_loss)

        tf.scalar_summary("train_tagging_loss", tf.reduce_mean(tagging_loss), collections=["train"])
        tf.scalar_summary("dev_tagging_loss", tf.reduce_mean(tagging_loss), collections=["dev"])

        if generate_lemmas:
            with tf.variable_scope("decoder"):
                # num_chars + 2 positions per lemma.
                # presumably the two extra slots hold start/end symbols -- TODO confirm
                self.lemma_chars = tf.placeholder(tf.int32, [None, num_steps, num_chars + 2], name="lemma_chars")

                lemma_state_size = self.lstm_size

                lemma_w = tf.Variable(tf.random_uniform([lemma_state_size, len(alphabet)], 0.5), name="state_to_char_w")
                lemma_b = tf.Variable(tf.fill([len(alphabet)], -math.log(len(alphabet))), name="state_to_char_b")
                # Embedding width is halved when form characters are also
                # supplied, since two embeddings are concatenated per step.
                # NOTE(review): `/` yields a float under Python 3 --
                # presumably written for Python 2 integer division; confirm.
                lemma_char_embeddings = tf.Variable(
                    tf.random_uniform(
                        [len(alphabet), lemma_state_size / (2 if supply_form_characters_to_lemma else 1)], -0.5, 0.5
                    ),
                    name="char_embeddings",
                )

                # One id tensor per lemma character position.
                lemma_char_inputs = [
                    tf.squeeze(input_, [1])
                    for input_ in tf.split(
                        1,
                        num_chars + 2,
                        tf.reshape(self.lemma_chars, [-1, num_chars + 2], name="reshape-lemma_char_inputs"),
                    )
                ]

                if supply_form_characters_to_lemma:
                    # Form character ids per position, followed by one extra
                    # all-zero position appended at the end.
                    char_inputs_zeros = [
                        tf.squeeze(chars, [1])
                        for chars in tf.split(
                            1, num_chars, tf.reshape(self.chars, [-1, num_chars], name="reshape-char_inputs_zeros")
                        )
                    ]
                    char_inputs_zeros.append(char_inputs_zeros[0] * 0)

                    def loop(prev_state, i):
                        # it takes the previous hidden state, finds the character and formats it
                        # as input for the next time step ... used in the decoder in the "real decoding scenario"
                        out_activation = tf.matmul(prev_state, lemma_w) + lemma_b
                        prev_char_index = tf.argmax(out_activation, 1)
                        return tf.concat(
                            1,
                            [
                                tf.nn.embedding_lookup(lemma_char_embeddings, prev_char_index),
                                tf.nn.embedding_lookup(lemma_char_embeddings, char_inputs_zeros[i]),
                            ],
                        )

                    # Ground-truth decoder inputs: lemma character embedding
                    # concatenated with the form character embedding.
                    embedded_lemma_characters = []
                    for lemma_chars, form_chars in zip(lemma_char_inputs[:-1], char_inputs_zeros):
                        embedded_lemma_characters.append(
                            tf.concat(
                                1,
                                [
                                    tf.nn.embedding_lookup(lemma_char_embeddings, lemma_chars),
                                    tf.nn.embedding_lookup(lemma_char_embeddings, form_chars),
                                ],
                            )
                        )
                else:

                    def loop(prev_state, _):
                        # it takes the previous hidden state, finds the character and formats it
                        # as input for the next time step ... used in the decoder in the "real decoding scenario"
                        out_activation = tf.matmul(prev_state, lemma_w) + lemma_b
                        prev_char_index = tf.argmax(out_activation, 1)
                        return tf.nn.embedding_lookup(lemma_char_embeddings, prev_char_index)

                    embedded_lemma_characters = []
                    for lemma_chars in lemma_char_inputs[:-1]:
                        embedded_lemma_characters.append(tf.nn.embedding_lookup(lemma_char_embeddings, lemma_chars))

                def sampling_loop(prev_state, i):
                    # Element-wise choice between the ground-truth input and
                    # the decoded input; the threshold shrinks as global_step
                    # grows.
                    threshold = scheduled_sampling / (scheduled_sampling + tf.exp(tf.to_float(global_step)))
                    condition = tf.less_equal(tf.random_uniform(tf.shape(embedded_lemma_characters[0])), threshold)
                    return tf.select(condition, embedded_lemma_characters[i], loop(prev_state, i))

                decoder_cell = rnn_cell.BasicLSTMCell(lemma_state_size)

                # Training decoder: ground-truth inputs unless scheduled
                # sampling is enabled.
                if scheduled_sampling:
                    lf = sampling_loop
                else:
                    lf = None

                if use_attention:
                    lemma_outputs_train, _ = seq2seq.attention_decoder(
                        embedded_lemma_characters, output_dropped, reshaped_ce_lookup, decoder_cell, loop_function=lf
                    )
                else:
                    lemma_outputs_train, _ = seq2seq.rnn_decoder(
                        embedded_lemma_characters, output_dropped, decoder_cell, loop_function=lf
                    )

                tf.get_variable_scope().reuse_variables()
                # regularize.append(tf.get_variable('attention_decoder/BasicLSTMCell/Linear/Matrix'))

                tf.get_variable_scope().reuse_variables()

                # Runtime decoder: built after the scope was switched to
                # reuse mode, and always fed its own previous prediction
                # through `loop`.
                if use_attention:
                    lemma_outputs_runtime, _ = seq2seq.attention_decoder(
                        embedded_lemma_characters, output_dropped, reshaped_ce_lookup, decoder_cell, loop_function=loop
                    )
                else:
                    lemma_outputs_runtime, _ = seq2seq.rnn_decoder(
                        embedded_lemma_characters, output_dropped, decoder_cell, loop_function=loop
                    )

                lemma_char_logits_train = [tf.matmul(o, lemma_w) + lemma_b for o in lemma_outputs_train]

                lemma_char_logits_runtime = [tf.matmul(o, lemma_w) + lemma_b for o in lemma_outputs_runtime]

                # Argmax character per position, reshaped back to
                # [*, num_steps, num_chars + 1].
                self.lemmas_decoded = tf.reshape(
                    tf.transpose(tf.argmax(tf.pack(lemma_char_logits_runtime), 2)), [-1, num_steps, num_chars + 1]
                )

                # Targets are the inputs shifted by one position; positions
                # with id 0 get zero weight.
                lemma_char_weights = []
                for lemma_chars in lemma_char_inputs[1:]:
                    lemma_char_weights.append(tf.to_float(tf.not_equal(lemma_chars, 0)))

                lemmatizer_loss = seq2seq.sequence_loss(
                    lemma_char_logits_train, lemma_char_inputs[1:], lemma_char_weights
                )

                lemmatizer_loss_runtime = seq2seq.sequence_loss(
                    lemma_char_logits_runtime, lemma_char_inputs[1:], lemma_char_weights
                )

                tf.scalar_summary(
                    "train_lemma_loss_with_gt_inputs", tf.reduce_mean(lemmatizer_loss), collections=["train"]
                )
                tf.scalar_summary("dev_lemma_loss_with_gt_inputs", tf.reduce_mean(lemmatizer_loss), collections=["dev"])

                tf.scalar_summary(
                    "train_lemma_loss_with_decoded_inputs",
                    tf.reduce_mean(lemmatizer_loss_runtime),
                    collections=["train"],
                )
                tf.scalar_summary(
                    "dev_lemma_loss_with_decoded_inputs", tf.reduce_mean(lemmatizer_loss_runtime), collections=["dev"]
                )

                # Both the ground-truth-input and the decoded-input losses
                # are optimized.
                self.cost += tf.reduce_mean(lemmatizer_loss) + tf.reduce_mean(lemmatizer_loss_runtime)

        # L2 penalty over every matrix collected in `regularize`.
        self.cost += l2 * sum([tf.nn.l2_loss(variable) for variable in regularize])

        tf.scalar_summary("train_optimization_cost", self.cost, collections=["train"])
        tf.scalar_summary("dev_optimization_cost", self.cost, collections=["dev"])

        # Not called directly in this constructor.
        # presumably provided for use inside optimizer_desc, which is eval'd below -- TODO confirm
        def decay(learning_rate, exponent, iteration_steps):
            return tf.train.exponential_decay(learning_rate, global_step, iteration_steps, exponent, staircase=True)

        # NOTE(review): eval() executes optimizer_desc as Python code with
        # this function's local names in scope; only pass trusted strings.
        optimizer = eval("tf.train." + optimizer_desc)
        self.train = optimizer.minimize(self.cost, global_step=global_step)

        if threads > 0:
            self.session = tf.Session(
                config=tf.ConfigProto(inter_op_parallelism_threads=threads, intra_op_parallelism_threads=threads)
            )
        else:
            self.session = tf.Session()
        self.session.run(tf.initialize_all_variables())

        if write_summaries:
            self.summary_train = tf.merge_summary(tf.get_collection("train"))
            self.summary_dev = tf.merge_summary(tf.get_collection("dev"))
            # Log directory: logs/<timestamp>_<experiment_name>.
            timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
            self.summary_writer = tf.train.SummaryWriter("logs/" + timestamp + "_" + experiment_name)

        self.steps = 0
Ejemplo n.º 30
0
def create(x, targets, batch_size=-1):
    """Chain the configured layer builders over ``x`` and collect result tensors.

    The network is described by the module-level ``layers`` list. Each entry's
    ``'type'`` key selects a builder from the local ``ops`` table, and every
    builder is invoked as ``builder(current_layer, layer_def, nextMethod)``.
    ``nextMethod`` is the continuation that dispatches to the following entry
    of ``layers`` and returns its argument unchanged once the list is
    exhausted (presumably each builder calls it on its own output -- confirm
    against the builders).

    NOTE(review): ``nextMethod`` advances the module-level ``layer_index`` and
    nothing in this function resets it, so a second call to ``create`` in the
    same process starts from wherever the previous call left the counter.

    Args:
        x: input tensor handed to the first layer; also the reference for the
            dominant (0.9) term of the cost.
        targets: tensor compared against the reconstruction in the minor
            (0.1) term of the cost.
        batch_size: forwarded to ``cell.zero_state`` for the LSTM that is
            built on top of the layer chain.

    Returns:
        dict with three entries:
            "cost": 0.1 * RMSE(targets, reconstruction)
                    + 0.9 * RMSE(x, reconstruction),
            "predict": the LSTM outputs reshaped to ``[-1, SIZE]``,
            "decoded": the layer-chain output flattened to one dimension.
    """
    ops = {
        'conv1d': conv1d,
        'conv1d_transpose': conv1d_transpose,
        'feed_forward_nn': feed_forward_nn,
        'autoencoder': autoencoder,
        'reshape': reshape,
        'lstm': lstm,
    }

    def nextMethod(current_layer):
        # Continuation passed to every builder: move to the next layer
        # definition, or stop when the last one has been consumed.
        global layer_index
        if len(layers) == layer_index + 1:
            return current_layer
        layer_index += 1
        layer_def = layers[layer_index]
        return ops[layer_def['type']](current_layer, layer_def, nextMethod)

    # The first layer is dispatched explicitly; the rest are reached through
    # the nextMethod continuation.
    decoded = ops[layers[0]['type']](x, layers[0], nextMethod)
    reconstructed_x = tf.reshape(decoded, [-1, SIZE, DEPTH])
    print("Completed reshaping")

    # HACK: an LSTM is bolted onto the decoded output here instead of being
    # configured through ``layers``; its width is fixed to SIZE.
    size = SIZE
    cell = rnn_cell.BasicLSTMCell(size)
    initial_state = cell.zero_state(batch_size, tf.float32)
    outputs, _ = seq2seq.rnn_decoder([decoded], initial_state, cell)
    extra_outputs = tf.concat(1, outputs)
    # Print the static shape (not the tensor object) so the message matches
    # its label and the "shape of output" line below.
    print("shape of extra", extra_outputs.get_shape())
    output = tf.reshape(extra_outputs, [-1, size])
    print("shape of output", output.get_shape())
    # A softmax / sequence-loss head used to be prototyped at this point; it
    # is disabled, so the cost below is purely reconstruction based.

    # Weighted sum of two root-mean-square errors of the reconstruction:
    # 10% against ``targets`` and 90% against the input ``x``.
    target_rmse = tf.sqrt(tf.reduce_mean(tf.square(targets - reconstructed_x)))
    input_rmse = tf.sqrt(tf.reduce_mean(tf.square(x - reconstructed_x)))

    results = {}
    results["cost"] = target_rmse * 0.1 + input_rmse * 0.9
    results['predict'] = output
    results['decoded'] = tf.reshape(decoded, [-1])
    return results
Ejemplo n.º 31
0
    def __init__(self, args, infer=False):
        """Assemble the recurrent language-model graph described by ``args``.

        Reads ``model``, ``rnn_size``, ``num_layers``, ``batch_size``,
        ``seq_length``, ``vocab_size`` and ``grad_clip`` from ``args``.

        Args:
            args: configuration object. When ``infer`` is true its
                ``batch_size`` and ``seq_length`` are overwritten with 1
                (the object is modified in place).
            infer: when true, each step's input is replaced by the embedding
                of the previous step's arg-max prediction.

        Raises:
            Exception: if ``args.model`` is not one of 'rnn', 'gru', 'lstm'.

        Attributes set on the instance: ``args``, ``cell``, ``input_data``,
        ``targets``, ``initial_state``, ``logits``, ``probs``, ``cost``,
        ``final_state``, ``lr`` and ``train_op``.
        """
        self.args = args
        if infer:
            # Sampling consumes one symbol at a time.
            args.batch_size = 1
            args.seq_length = 1

        # Resolve the requested cell class by name from rnn_cell.
        cell_class_names = {
            'rnn': 'BasicRNNCell',
            'gru': 'GRUCell',
            'lstm': 'BasicLSTMCell',
        }
        class_name = cell_class_names.get(args.model)
        if class_name is None:
            raise Exception("model type not supported: {}".format(args.model))
        cell_fn = getattr(rnn_cell, class_name)

        # Stack num_layers references to one cell into a multi-layer cell.
        single_cell = cell_fn(args.rnn_size)
        stacked_cell = rnn_cell.MultiRNNCell([single_cell] * args.num_layers)
        self.cell = stacked_cell

        # Integer symbol ids, laid out as [batch_size, seq_length].
        batch_shape = [args.batch_size, args.seq_length]
        self.input_data = tf.placeholder(tf.int32, batch_shape)
        self.targets = tf.placeholder(tf.int32, batch_shape)
        # All-zero starting state for the stacked cell.
        self.initial_state = stacked_cell.zero_state(args.batch_size, tf.float32)

        # Output projection and embedding live in the 'rnnlm' variable scope
        # (e.g. "rnnlm/softmax_w").
        with tf.variable_scope('rnnlm'):
            softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
            softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
            with tf.device("/cpu:0"):
                embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
                embedded = tf.nn.embedding_lookup(embedding, self.input_data)
                # One tensor per time step, with the length-1 time axis removed.
                per_step = tf.split(1, args.seq_length, embedded)
                inputs = [tf.squeeze(step, [1]) for step in per_step]

        def feed_previous(prev_output, _step):
            # Project the previous output to vocabulary scores, take the
            # arg-max symbol (no gradient flows through it) and embed it.
            scores = tf.matmul(prev_output, softmax_w) + softmax_b
            best_symbol = tf.stop_gradient(tf.argmax(scores, 1))
            return tf.nn.embedding_lookup(embedding, best_symbol)

        # Unroll the recurrent network over the per-step inputs.
        loop_function = feed_previous if infer else None
        outputs, last_state = seq2seq.rnn_decoder(
            inputs,
            self.initial_state,
            stacked_cell,
            loop_function=loop_function,
            scope='rnnlm',
        )
        joined = tf.concat(1, outputs)
        output = tf.reshape(joined, [-1, args.rnn_size])

        # Linear projection to vocabulary scores, then softmax probabilities.
        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)

        # Per-position loss with uniform weights over every target symbol.
        flat_targets = tf.reshape(self.targets, [-1])
        uniform_weights = tf.ones([args.batch_size * args.seq_length])
        loss = seq2seq.sequence_loss_by_example(
            [self.logits],
            [flat_targets],
            [uniform_weights],
            args.vocab_size,
        )

        # Summed loss divided by batch size and then by sequence length.
        self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length

        self.final_state = last_state

        # Non-trainable learning-rate variable, initialised to 0.0.
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        raw_grads = tf.gradients(self.cost, tvars)
        # Clip gradients by their global norm before applying them with Adam.
        grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))