def create_model(self):
    """Build the RNN language-model graph and attach its tensors to ``self``.

    Reads ``batch_size``, ``seq_length``, ``rnn_size``, ``vocab_size``,
    ``num_layers``, ``is_training``, ``infer`` and ``grad_clip`` from
    ``self``. Sets ``input_data``, ``target_data``, ``keep_prob``, ``lr``,
    ``initial_state``, ``embedding``, ``logits``, ``probs``, ``cost``,
    ``final_state`` and ``train_op``.
    """
    self.input_data = tf.placeholder(
        tf.int32, [self.batch_size, self.seq_length], name="input_data")
    self.target_data = tf.placeholder(
        tf.int32, [self.batch_size, self.seq_length], name="target_data")

    # Hyper-parameters are kept as non-trainable graph variables.
    self.keep_prob = tf.Variable(0.3, trainable=False, name='keep_prob')
    # NOTE(review): ``lr`` is created here but the optimizer below is built
    # with a fixed 0.01 learning rate -- confirm which one is intended.
    self.lr = tf.Variable(0.0, trainable=False, name="lr")

    softmax_weights = tf.get_variable(
        "softmax_weights", [self.rnn_size, self.vocab_size])
    softmax_biases = tf.get_variable("softmax_biases", [self.vocab_size])

    lstm_cell = rnn_cell.BasicLSTMCell(self.rnn_size)
    multilayer_cell = rnn_cell.MultiRNNCell([lstm_cell] * self.num_layers)
    self.initial_state = multilayer_cell.zero_state(self.batch_size, tf.float32)

    with tf.device("/cpu:0"):
        # Embedding matrix for the whole vocabulary.
        self.embedding = tf.get_variable(
            "embeddings", [self.vocab_size, self.rnn_size])
        # Vector representation of every input symbol.
        embeds = tf.nn.embedding_lookup(self.embedding, self.input_data)

    # ``keep_prob`` is a tf.Variable, so ``self.keep_prob < 1`` would be a
    # symbolic tensor, which has no usable Python truth value while the graph
    # is being built. Dropout is therefore gated on the training flag only;
    # the keep probability itself stays adjustable through the variable.
    if self.is_training:
        embeds = tf.nn.dropout(embeds, self.keep_prob)

    def loop(prev, _):
        # Project the previous output to vocabulary scores, pick the argmax
        # symbol (no gradient through the choice) and embed it as next input.
        prev = tf.nn.xw_plus_b(prev, softmax_weights, softmax_biases)
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(self.embedding, prev_symbol)

    # Convert the input to a list of seq_length tensors. After splitting,
    # each piece has shape (batch_size, 1, rnn_size); squeeze the middle
    # axis to get (batch_size, rnn_size).
    inputs = tf.split(1, self.seq_length, embeds)
    inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    outputs, states = seq2seq.rnn_decoder(
        inputs, self.initial_state, multilayer_cell,
        loop_function=loop if self.infer else None, scope='rnnlm')
    output = tf.reshape(tf.concat(1, outputs), [-1, self.rnn_size])

    self.logits = tf.nn.xw_plus_b(output, softmax_weights, softmax_biases)
    self.probs = tf.nn.softmax(self.logits, name="probability")

    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.target_data, [-1])],
        [tf.ones([self.batch_size * self.seq_length])],
        self.vocab_size)
    self.cost = tf.reduce_sum(loss) / (self.batch_size * self.seq_length)
    # NOTE(review): indexing assumes rnn_decoder returns a per-step list of
    # states (older TensorFlow behaviour) -- confirm for the version in use.
    self.final_state = states[-1]

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, tvars), self.grad_clip)
    optimizer = tf.train.AdamOptimizer(0.01)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def seq2seq_f(cell, encoder_inputs, decoder_inputs, loop_output):
    '''
    The seq2seq neural network structure.

    Args:
        cell: the RNNCell object handed to both the encoder and the decoder
        encoder_inputs: a list of Tensors to feed the encoder
        decoder_inputs: a list of Tensors to feed the decoder
        loop_output: True to build each next decoder input from the
            previous decoder output; False to use decoder_inputs as given

    Returns:
        outputs: a list of Tensors generated by the decoder
        states: the states value returned alongside the outputs by the
            decoder call that was used
    '''
    if not loop_output:
        # Use the given decoder inputs unchanged.
        outputs, states = seq2seq.basic_rnn_seq2seq(
            encoder_inputs, decoder_inputs, cell)
        return outputs, states

    def loop_func(prev, i):
        # Simplest construction: the previous output is the next input.
        return prev

    # Call rnn() directly so the decoder can be given a loop function.
    # The returned encoder states cover every step, not just the last one.
    _, enc_states = rnn.rnn(cell, encoder_inputs, dtype=tf.float32)
    last_enc_state = enc_states[-1]
    outputs, states = seq2seq.rnn_decoder(
        decoder_inputs, last_enc_state, cell, loop_func)
    return outputs, states
def __init__(self, args, infer=False):
    """Build an RNN language model graph and its training op.

    Args:
        args: configuration object; ``model``, ``rnn_size``, ``num_layers``,
            ``batch_size``, ``seq_length``, ``vocab_size`` and ``grad_clip``
            are read from it.
        infer: when true, ``args.batch_size`` and ``args.seq_length`` are
            overwritten with 1 and ``loop`` is passed to the decoder as its
            loop function.

    Raises:
        Exception: if ``args.model`` names an unsupported cell type.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    # Pick the cell class plus any constructor keywords it needs.
    cell_kwargs = {}
    if args.model == 'rnn':
        cell_cls = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_cls = rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_cls = rnn_cell.BasicLSTMCell
    elif args.model == 'gridlstm':
        cell_cls = grid_rnn.Grid2LSTMCell
        cell_kwargs.update({'use_peepholes': True, 'forget_bias': 1.0})
    elif args.model == 'gridgru':
        cell_cls = grid_rnn.Grid2GRUCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    single_layer = cell_cls(args.rnn_size, **cell_kwargs)
    stacked = rnn_cell.MultiRNNCell([single_layer] * args.num_layers)
    self.cell = stacked

    io_shape = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.int32, io_shape)
    self.targets = tf.placeholder(tf.int32, io_shape)
    self.initial_state = stacked.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable(
            "softmax_w", [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable(
                "embedding", [args.vocab_size, args.rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            # One tensor per time step, with the length-1 step axis removed.
            inputs = [tf.squeeze(step, [1])
                      for step in tf.split(1, args.seq_length, embedded)]

    def loop(prev, _):
        # Score the previous output, take the argmax symbol without
        # propagating gradients through it, and embed it as the next input.
        scores = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
        best_symbol = tf.stop_gradient(tf.argmax(scores, 1))
        return tf.nn.embedding_lookup(embedding, best_symbol)

    decoder_loop = loop if infer else None
    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, stacked,
        loop_function=decoder_loop, scope='rnnlm')
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])

    self.logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], args.vocab_size)
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    # Learning rate lives in a non-trainable variable, initialised to 0.0.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def embedding_rnn_decoder(decoder_inputs, initial_state, cell, num_symbols,
                          output_projection=None, feed_previous=False,
                          scope=None, embedding=None):
    """RNN decoder with embedding and a pure-decoding option.

    Args:
      decoder_inputs: a list of 1D batch-sized int32-Tensors (decoder inputs).
      initial_state: 2D Tensor [batch_size x cell.state_size].
      cell: rnn_cell.RNNCell defining the cell function.
      num_symbols: integer, how many symbols come into the embedding.
      output_projection: None or a pair (W, B) of output projection weights
        and biases; W has shape [cell.output_size x num_symbols] and B has
        shape [num_symbols]. If provided and feed_previous=True, each fed
        previous output is first multiplied by W and has B added.
      feed_previous: Boolean; if True, the loop function
          next = embedding_lookup(embedding, argmax(previous_output))
        is handed to the decoder, which in effect implements a greedy
        decoder (see http://arxiv.org/pdf/1506.03099v2.pdf for its use
        during training). If False, decoder_inputs are used as given.
      scope: VariableScope for the created subgraph; defaults to
        "embedding_rnn_decoder".
      embedding: optional embedding matrix to look symbols up in. When None,
        a variable named "embedding" with shape
        [num_symbols, cell.input_size] is created on /cpu:0 inside the scope.

    Returns:
      The value returned by seq2seq.rnn_decoder for the embedded inputs:
      outputs: A list of the same length as decoder_inputs of 2D Tensors
        with shape [batch_size x cell.output_size].
      states: The state of each decoder cell in each time-step.

    Raises:
      ValueError: when output_projection has the wrong shape.
    """
    if output_projection is not None:
        # Validate the projection shapes up front.
        proj_weights = tf.convert_to_tensor(
            output_projection[0], dtype=tf.float32)
        proj_weights.get_shape().assert_is_compatible_with(
            [cell.output_size, num_symbols])
        proj_biases = tf.convert_to_tensor(
            output_projection[1], dtype=tf.float32)
        proj_biases.get_shape().assert_is_compatible_with([num_symbols])

    with tf.variable_scope(scope or "embedding_rnn_decoder"):
        if embedding is None:
            with tf.device("/cpu:0"):
                embedding = tf.get_variable(
                    "embedding", [num_symbols, cell.input_size])

        def extract_argmax_and_embed(prev, _):
            """Loop_function that extracts the symbol from prev and embeds it."""
            if output_projection is not None:
                prev = tf.nn.xw_plus_b(
                    prev, output_projection[0], output_projection[1])
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(embedding, prev_symbol)

        loop_function = extract_argmax_and_embed if feed_previous else None

        emb_inp = [tf.nn.embedding_lookup(embedding, symbols)
                   for symbols in decoder_inputs]
        return seq2seq.rnn_decoder(
            emb_inp, initial_state, cell, loop_function=loop_function)
def _init_seq2seq(self, encoder_inputs, decoder_inputs, cell, feed_previous):
    """Encode ``encoder_inputs`` and decode from the resulting state.

    Args:
        encoder_inputs: inputs passed to ``rnn.rnn`` for the encoder.
        decoder_inputs: inputs passed to ``seq2seq.rnn_decoder``.
        cell: RNN cell used for both the encoder and the decoder call.
        feed_previous: when true, the decoder is given a loop function that
            builds each next input from the previous output.

    Returns:
        Whatever ``seq2seq.rnn_decoder`` returns.
    """
    def inference_loop_function(prev, _):
        # Project with the softmax parameters, then emit 1.0 where a score
        # equals its row maximum and 0.0 elsewhere.
        scores = tf.nn.xw_plus_b(prev, self.w_softmax, self.b_softmax)
        row_max = tf.reduce_max(scores, reduction_indices=[1], keep_dims=True)
        return tf.to_float(tf.equal(scores, row_max))

    if feed_previous:
        loop_function = inference_loop_function
    else:
        loop_function = None

    with variable_scope.variable_scope('seq2seq'):
        _, final_enc_state = rnn.rnn(
            cell, encoder_inputs, dtype=dtypes.float32)
        return seq2seq.rnn_decoder(
            decoder_inputs, final_enc_state, cell,
            loop_function=loop_function)
def __init__(self, rnn_size, num_layers, vocab_size, grad_clip,
             batch_size=1, seq_length=1):
    """Build a stacked-LSTM language model graph and its training op.

    Args:
        rnn_size: units per LSTM layer; also the embedding width.
        num_layers: number of stacked LSTM layers.
        vocab_size: number of distinct symbols.
        grad_clip: global-norm threshold used to clip gradients.
        batch_size: first dimension of the input/target placeholders.
        seq_length: second dimension of the input/target placeholders.
    """
    lstm = rnn_cell.BasicLSTMCell(rnn_size)
    stacked = rnn_cell.MultiRNNCell([lstm] * num_layers)
    self.cell = stacked

    self.input_data = tf.placeholder(tf.int32, [batch_size, seq_length])
    self.targets = tf.placeholder(tf.int32, [batch_size, seq_length])
    self.initial_state = stacked.zero_state(batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable('softmax_w', [rnn_size, vocab_size])
        softmax_b = tf.get_variable('softmax_b', [vocab_size])
        with tf.device('/cpu:0'):
            embedding = tf.get_variable('embedding', [vocab_size, rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            pieces = tf.split(1, seq_length, embedded)
            inputs = [tf.squeeze(piece, [1]) for piece in pieces]

    def loop(prev, _):
        # Feed the embedded argmax of the previous step back in as input.
        scores = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
        best_symbol = tf.stop_gradient(tf.argmax(scores, 1))
        return tf.nn.embedding_lookup(embedding, best_symbol)

    # The loop function is used only for the single-symbol configuration
    # (batch_size == 1 and seq_length == 1).
    single_step = batch_size == 1 and seq_length == 1
    loop_fn = loop if single_step else None
    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, stacked,
        loop_function=loop_fn, scope='rnnlm')
    output = tf.reshape(tf.concat(1, outputs), [-1, rnn_size])

    self.logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([batch_size * seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], vocab_size)
    self.cost = tf.reduce_sum(loss) / batch_size / seq_length
    self.final_state = last_state

    # Learning rate is a non-trainable variable initialised to 0.0.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(raw_grads, grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, args, sampling=False):
    """Build a stacked-LSTM language model graph and its training op.

    Args:
        args: configuration object; ``rnn_size``, ``num_layers``,
            ``batch_size``, ``seq_length``, ``vocab_size`` and ``grad_clip``
            are read from it.
        sampling: when true, ``args.batch_size`` and ``args.seq_length`` are
            overwritten with 1 and ``loop`` is passed to the decoder as its
            loop function.
    """
    self.args = args
    if sampling:
        args.batch_size = 1
        args.seq_length = 1

    lstm = rnn_cell.BasicLSTMCell(args.rnn_size)
    self.cell = rnn_cell.MultiRNNCell([lstm] * args.num_layers)

    placeholder_shape = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.int32, placeholder_shape)
    self.targets = tf.placeholder(tf.int32, placeholder_shape)
    self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable(
            "softmax_w", [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable(
                "embedding", [args.vocab_size, args.rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            inputs = [tf.squeeze(chunk, [1])
                      for chunk in tf.split(1, args.seq_length, embedded)]

    def loop(prev, _):
        # Embed the argmax of the previous step's projected output.
        scores = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
        chosen = tf.stop_gradient(tf.argmax(scores, 1))
        return tf.nn.embedding_lookup(embedding, chosen)

    if sampling:
        loop_function = loop
    else:
        loop_function = None
    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, self.cell,
        loop_function=loop_function, scope='rnnlm')
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])

    self.logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], args.vocab_size)
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    # Learning rate is a non-trainable variable initialised to 0.0.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, tvars), args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(clipped, tvars))
def generator(input_data, args, reuse=False):
    '''
    Produce a probability sequence from the provided input sequence.

    args:
        input_data: integer symbol ids looked up in the embedding matrix and
            split into args.seq_length pieces along axis 1
        args: configuration object; model, rnn_size, num_layers, batch_size,
            seq_length and vocab_size are read from it
        reuse: forwarded to the 'generator' variable scope
    returns:
        probs: [args.batch_size, args.seq_length, args.vocab_size]
    raises:
        Exception: if args.model is not 'rnn', 'gru' or 'lstm'
    '''
    # The scope takes only its name and the reuse flag; ``args`` is not a
    # scope argument.
    with tf.variable_scope('generator', reuse=reuse):
        # A single elif chain, so that 'rnn' and 'gru' do not fall through to
        # the final else and get rejected as unsupported.
        if args.model == 'rnn':
            cell = rnn_cell.BasicRNNCell(args.rnn_size)
        elif args.model == 'gru':
            cell = rnn_cell.GRUCell(args.rnn_size)
        elif args.model == 'lstm':
            cell = rnn_cell.BasicLSTMCell(args.rnn_size)
        else:
            raise Exception('model type not supported: {}'.format(args.model))

        cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)
        initial_state = cell.zero_state(args.batch_size, tf.float32)

        with tf.variable_scope('rnn'):
            softmax_w = tf.get_variable(
                'softmax_w', [args.rnn_size, args.vocab_size])
            softmax_b = tf.get_variable('softmax_b', [args.vocab_size])
            with tf.device('/cpu:0'):
                embedding = tf.get_variable(
                    'embedding', [args.vocab_size, args.rnn_size])
                inputs = tf.split(
                    1, args.seq_length,
                    tf.nn.embedding_lookup(embedding, input_data))
                inputs = [tf.squeeze(i, [1]) for i in inputs]

        def loop(prev, _):
            # Embed the argmax of the previous step's projected output; no
            # gradient flows through the symbol choice.
            prev = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(embedding, prev_symbol)

        # NOTE(review): ``is_training`` is neither a parameter nor a local of
        # this function; it must resolve from an enclosing/global scope at
        # call time -- confirm where it is defined.
        outputs, last_state = seq2seq.rnn_decoder(
            inputs, initial_state, cell,
            loop_function=None if is_training else loop, scope='rnn')

        # Dim: [args.batch_size * args.seq_length, args.rnn_size]
        output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
        # Dim: [args.batch_size * args.seq_length, args.vocab_size]
        logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
        probs = tf.nn.softmax(logits)
        # Dim: [args.batch_size, args.seq_length, args.vocab_size]
        probs = tf.reshape(
            probs, [args.batch_size, args.seq_length, args.vocab_size])
    return probs
def model():
    """Unroll an LSTM over ``glimpses`` steps, starting from a random location.

    The first input is the glimpse at a uniformly random location in
    [-1, 1); the remaining ``glimpses - 1`` entries are placeholders (0) and
    ``get_next_input`` is passed to the decoder as its loop function.

    Returns:
        The list of outputs returned by ``seq2seq.rnn_decoder``.
    """
    start_loc = tf.random_uniform((batch_size, 2), minval=-1, maxval=1)
    first_glimpse = get_glimpse(start_loc)

    lstm_cell = rnn_cell.LSTMCell(cell_size, g_size, num_proj=cell_out_size)
    zero_state = lstm_cell.zero_state(batch_size, tf.float32)

    # Only the first entry is a real tensor; the rest are filler values.
    decoder_inputs = [first_glimpse] + [0] * (glimpses - 1)
    outputs, _ = seq2seq.rnn_decoder(
        decoder_inputs, zero_state, lstm_cell, loop_function=get_next_input)

    # Invoked once more on the final output; its return value is discarded.
    get_next_input(outputs[-1], 0)
    return outputs
def testRNNDecoder(self):
    """Check the number and shapes of rnn_decoder outputs and states."""
    with self.test_session() as sess:
        with tf.variable_scope(
                "root", initializer=tf.constant_initializer(0.5)):
            # Encoder: two steps of 2x2 inputs through a GRU of size 2.
            enc_inp = [tf.constant(0.5, shape=[2, 2]) for _ in xrange(2)]
            _, enc_states = rnn.rnn(
                rnn_cell.GRUCell(2), enc_inp, dtype=tf.float32)

            # Decoder: three steps, GRU output projected to width 4.
            dec_inp = [tf.constant(0.4, shape=[2, 2]) for _ in xrange(3)]
            projected_gru = rnn_cell.OutputProjectionWrapper(
                rnn_cell.GRUCell(2), 4)
            dec, mem = seq2seq.rnn_decoder(
                dec_inp, enc_states[-1], projected_gru)

            sess.run([tf.initialize_all_variables()])

            dec_values = sess.run(dec)
            self.assertEqual(len(dec_values), 3)
            self.assertEqual(dec_values[0].shape, (2, 4))

            mem_values = sess.run(mem)
            self.assertEqual(len(mem_values), 4)
            self.assertEqual(mem_values[0].shape, (2, 2))
def __init__(self, config, is_training):
    """Build a stacked-LSTM sequence model with a single sigmoid output.

    Args:
        config: configuration object; ``batch_size``, ``num_steps``,
            ``hidden_size``, ``num_layers``, ``keep_prob`` and
            ``max_grad_norm`` are read from it.
        is_training: when true, output dropout is added (if
            ``config.keep_prob < 1``) and ``lr`` / ``train_op`` are created.
    """
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    hidden = config.hidden_size

    base_cell = rnn_cell.BasicLSTMCell(hidden, forget_bias=0.0)
    cell = rnn_cell.MultiRNNCell([base_cell] * config.num_layers)
    if is_training and config.keep_prob < 1:
        cell = rnn_cell.DropoutWrapper(
            cell, output_keep_prob=config.keep_prob)
    self.cell = cell

    self.input_data = tf.placeholder(
        dtype=tf.float32, shape=[None, num_steps, 1])
    self.target_data = tf.placeholder(
        dtype=tf.float32, shape=[None, num_steps, 1])
    self.initial_state = cell.zero_state(
        batch_size=config.batch_size, dtype=tf.float32)

    # One [batch, 1] tensor per time step.
    per_step = tf.split(1, num_steps, self.input_data)
    inputs = [tf.squeeze(step, [1]) for step in per_step]

    with tf.variable_scope('rnnvm'):
        output_w = tf.get_variable("output_w", [hidden, 1])
        output_b = tf.get_variable("output_b", [1])

    outputs, states = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell, scope='rnnvm')
    stacked_outputs = tf.reshape(tf.concat(1, outputs), [-1, hidden])
    logits = tf.nn.xw_plus_b(stacked_outputs, output_w, output_b)

    flat_targets = tf.reshape(
        self.target_data, shape=[num_steps * batch_size, 1])
    entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits, flat_targets)
    self.cost = cost = tf.reduce_mean(entropy)
    self.final_state = states[-1]

    if is_training:
        # Training-only ops: learning-rate variable, clipped gradients, Adam.
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(cost, tvars), config.max_grad_norm)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def inference(self, input_data):
    """
    Build out the graph enough to make predictions.

    input_data - a batch of sequences to predict. Tensor of size
        [batch_size, input_channels, sequence_length]
    :return: the decoder outputs and states, which are also stored on
        ``self`` together with ``final_state`` and ``final_output``
    """
    # Slice input_data along axis 2 into one tensor per step, then drop the
    # resulting size-1 axis so each item is 2-D.
    slices = tf.split(2, self.sequence_length, input_data)
    decoder_inputs = [tf.squeeze(piece, squeeze_dims=[2]) for piece in slices]

    # No loop function is used; variables live under the 'inference' scope.
    self.outputs, self.states = seq2seq.rnn_decoder(
        decoder_inputs, self.initial_state, self.cell, None,
        scope='inference')

    # TODO: clean up organization
    self.final_state = self.states[-1]
    self.final_output = self.outputs[-1]
    return self.outputs, self.states
############ with tf.variable_scope('rnn_generator'): softmax_w = tf.get_variable('softmax_w', [args.rnn_size, args.vocab_size]) softmax_b = tf.get_variable('softmax_b', [args.vocab_size]) with tf.device('/cpu:0'): embedding = tf.get_variable('embedding', [args.vocab_size, args.rnn_size]) inputs_gen = tf.split(1, args.seq_length, tf.nn.embedding_lookup(embedding, input_data)) inputs_gen = [tf.squeeze(i, [1]) for i in inputs_gen] def loop(prev, _): prev = tf.nn.xw_plus_b(prev, softmax_w, softmax_b) prev_symbol = tf.stop_gradient(tf.argmax(prev, 1)) return tf.nn.embedding_lookup(embedding, prev_symbol) outputs_gen, last_state = seq2seq.rnn_decoder(inputs_gen, initial_state_gen, cell_gen, loop_function=None if is_training else loop, scope='rnn_generator') # Dim: [args.batch_size * args.seq_length, args.rnn_size] output_gen = tf.reshape(tf.concat(1, outputs_gen), [-1, args.rnn_size]) # Dim: [args.batch_size * args.seq_length, args.vocab_size] logits_gen = tf.nn.xw_plus_b(output_gen, softmax_w, softmax_b) gen_probs = tf.nn.softmax(logits_gen) gen_probs = tf.reshape(gen_probs, [args.batch_size, args.seq_length, args.vocab_size]) ################ # Discriminator ################ # Pass a tensor of *probabilities* over the characters to the Discriminator with tf.variable_scope('rnn_discriminator'): softmax_w = tf.get_variable('softmax_w', [args.rnn_size, 2], trainable = False)
return(X,y)  # NOTE(review): tail of a function whose header is outside the visible code

# Graph inputs: one [batch_size, 1] placeholder per time step, a
# [batch_size] target and a scalar dropout keep probability.
with tf.name_scope("Placeholders") as scope:
    inputs = [tf.placeholder(tf.float32,shape=[batch_size,1]) for _ in range(seq_len)]
    target = tf.placeholder(tf.float32, shape=[batch_size])
    keep_prob = tf.placeholder("float")

# Stacked LSTM with dropout applied to the output of the whole stack.
with tf.name_scope("Cell") as scope:
    cell = rnn_cell.BasicLSTMCell(hidden_size)
    cell = rnn_cell.MultiRNNCell([cell] * num_layers)
    cell = rnn_cell.DropoutWrapper(cell,output_keep_prob=keep_prob)
    initial_state = cell.zero_state(batch_size, tf.float32)

# Unroll the cell over the inputs; only the last step's output is used.
with tf.name_scope("RNN") as scope:
    outputs, states = seq2seq.rnn_decoder(inputs, initial_state, cell)
    final = outputs[-1]

# Linear read-out from the final RNN output.
with tf.name_scope("Output") as scope:
    W_o = tf.Variable(tf.random_normal([hidden_size,input_size], stddev=0.01))
    b_o = tf.Variable(tf.random_normal([input_size], stddev=0.01))
    prediction = tf.matmul(final, W_o) + b_o

# Squared error per example; `loss` is its sum over the batch.
with tf.name_scope("Optimization") as scope:
    cost = tf.pow(tf.sub(tf.reshape(prediction, [-1]), target),2)
    train_op = tf.train.RMSPropOptimizer(0.005, 0.2).minimize(cost)
    loss = tf.reduce_sum(cost)

#Validation Data
X_val,y_val = generate_data(5,seq_len,batch_size)
# Split along axis 1 into seq_len arrays, matching the `inputs` placeholders.
X_val = np.split(np.squeeze(X_val),seq_len,axis=1)
# NOTE(review): excerpt from a larger script; `x`, `cell`, `batch_size`,
# `seq_size`, `hidden_size` and `input_size` are defined outside the visible
# code.
y = tf.placeholder(tf.int32, [None, seq_size])
initial_state = cell.zero_state(batch_size, tf.float32)

#with tf.variable_scope('rnn'):
# Output projection from hidden state to input_size scores.
w = tf.get_variable('softmax_w', [hidden_size, input_size])
b = tf.get_variable('softmax_b', [input_size])

#with tf.device('/cpu:0'):
# [input_size x hidden_size]
embed = tf.get_variable('embed', [input_size, hidden_size])
# [batch_size x seq_size x hidden_size]
input_set = tf.nn.embedding_lookup(embed, x)
# [batch_size x 1 x hidden_size] x seq_size
inputs = tf.split(1, seq_size, input_set)
# [batch_size x hidden_size] x seq_size
inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

def loop(prev, _):
    # Project the previous output, take the argmax symbol (no gradient
    # through the choice) and return its embedding as the next input.
    prev = tf.nn.xw_plus_b(prev, w, b)
    prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
    # [batch_size x hidden_size]
    return tf.nn.embedding_lookup(embed, prev_symbol)

# With infer fixed to False the decoder below never receives `loop`.
infer = False
# outputs : [batch_size x hidden_size]
# states : [batch_size x state_size]
outputs, states = seq2seq.rnn_decoder(inputs, initial_state, cell, loop_function=loop if infer else None, scope='rnn')
# Concatenate the per-step outputs and flatten to rows of width hidden_size.
output = tf.reshape(tf.concat(1, outputs), [-1, hidden_size])
def __init__(self, args):
    """Build an RNN over a real-valued input sequence with a linear read-out.

    Args:
        args: configuration object; ``rnn_size``, ``n_steps``,
            ``batch_size``, ``input_dim``, ``num_layers``, ``cell_type``
            ("srnn", "lstm", "lstmp" or "cw") and ``rnn_type`` ("rnn" or
            "seq2seq") are read from it.

    Raises:
        ValueError: if ``args.cell_type`` or ``args.rnn_type`` is not one of
            the supported values.
    """
    self.size = args.rnn_size
    self.n_steps = args.n_steps
    self.batch_size = args.batch_size
    self.input_dim = args.input_dim
    self.num_layers = args.num_layers

    initializer = tf.random_uniform_initializer(-0.8, 0.8)

    # Sequence provided at runtime, laid out [n_steps, batch_size, input_dim].
    self.seq_input = tf.placeholder(
        tf.float32, [self.n_steps, self.batch_size, self.input_dim])
    # Timestep to stop at (fed to rnn.rnn as sequence_length).
    self.early_stop = tf.placeholder(tf.int32)
    # The RNN helpers need a list with one item per timestep. tf.split keeps
    # the split axis, so every slice is reshaped to [batch_size, input_dim].
    self.inputs = [
        tf.reshape(i, (self.batch_size, self.input_dim))
        for i in tf.split(0, self.n_steps, self.seq_input)
    ]
    self.result = tf.placeholder(tf.float32, [None, self.input_dim])

    if args.cell_type == "srnn":
        cell = BasicRNNCell(self.size)
    elif args.cell_type == "lstm":
        cell = BasicLSTMCell(self.size, forget_bias=1.0)
    elif args.cell_type == "lstmp":
        cell = LSTMCell(self.size, self.input_dim, initializer=initializer)
    elif args.cell_type == "cw":
        cell = CWRNNCell(self.size, [1, 4, 16, 64])
    else:
        # Fail with a clear message instead of an unbound-name error below.
        raise ValueError(
            "cell type not supported: {}".format(args.cell_type))

    self.cell = cell = rnn_cell.MultiRNNCell([cell] * self.num_layers)

    # Random (not zero) initial state.
    self.initial_state = tf.random_uniform(
        [self.batch_size, self.cell.state_size], -0.1, 0.1)

    # Network type.
    if args.rnn_type == "rnn":
        self.outputs, self.states = rnn.rnn(
            self.cell, self.inputs,
            initial_state=self.initial_state,
            sequence_length=self.early_stop)
    elif args.rnn_type == "seq2seq":
        # No loop function: decoder inputs are used as given.
        self.outputs, self.states = seq2seq.rnn_decoder(
            self.inputs, self.initial_state, self.cell, loop_function=None)
    else:
        # Fail here rather than on the missing self.states attribute below.
        raise ValueError(
            "rnn type not supported: {}".format(args.rnn_type))

    self.final_state = self.states[-1]

    # Linear read-out from each step's output to a single value.
    self.W_o = tf.Variable(tf.random_normal([self.size, 1], stddev=0.01))
    self.b_o = tf.Variable(tf.random_normal([1], stddev=0.01))

    # Single pre-formatted argument so the line prints identically under the
    # Python 2 print statement and the Python 3 print function.
    print("type(outputs) %s" % (type(self.outputs),))

    self.output_cat = tf.reshape(tf.concat(1, self.outputs), [-1, self.size])
    self.output = tf.nn.xw_plus_b(self.output_cat, self.W_o, self.b_o)

    # NOTE(review): the read-out has width 1, so this reshape presumably
    # requires input_dim == 1 -- confirm.
    self.output2 = tf.reshape(
        self.output, [self.batch_size, self.n_steps, self.input_dim])
    # Add Gaussian noise to the reshaped output.
    self.output2 = self.output2 + tf.random_normal(
        [self.batch_size, self.n_steps, self.input_dim], stddev=0.05)
    # Then transpose to [n_steps, batch_size, input_dim].
    self.output2 = tf.transpose(self.output2, [1, 0, 2])
def __init__(self, args):
    """Build an RNN token classifier with accuracy, loss and update ops.

    Args:
        args: configuration object; ``model`` ('rnn' or 'lstm'),
            ``rnn_size``, ``num_layers``, ``batch_size``, ``seq_length``,
            ``vocab_size``, ``emb_vocab``, ``num_classes``, ``grad_clip``,
            ``learning_rate`` and ``l2_limit`` are read from it. When
            ``emb_vocab`` is not None it is iterated as a mapping whose
            values unpack to ``(index, embedding_vector)``.

    For any other ``args.model`` the constructor prints "Invalid cell" and
    calls ``sys.exit()``.
    """
    # define cell
    if args.model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.model == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    else:
        print("Invalid cell")
        sys.exit()

    cell = cell_fn(args.rnn_size)
    cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)

    # define inputs and targets, initialize state
    self.inputs = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    # prepare word embedding, reshape inputs
    with tf.name_scope("embedding"):
        with tf.device("/cpu:0"):
            if args.emb_vocab is None:
                E = tf.get_variable("E", [args.vocab_size, args.rnn_size])
            else:
                # Embedding width is taken from the first entry's vector.
                first_entry = next(iter(args.emb_vocab.values()))
                emb_dim = len(first_entry[1])
                # Rows without a supplied vector keep random values.
                emb_mat = np.random.rand(args.vocab_size, emb_dim)
                for idx, emb_vec in args.emb_vocab.values():
                    emb_mat[idx] = emb_vec
                E = tf.Variable(
                    tf.convert_to_tensor(emb_mat, dtype=tf.float32), name="E")
            inputs = tf.split(
                1, args.seq_length, tf.nn.embedding_lookup(E, self.inputs))
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    # feed inputs into rnn
    with tf.name_scope("rnn"):
        outputs, states = seq2seq.rnn_decoder(
            inputs, self.initial_state, cell,
            loop_function=None, scope='rnnlm')
        self.output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])

    # Add dropout
    with tf.name_scope("dropout"):
        self.h_drop = tf.nn.dropout(self.output, self.dropout_keep_prob)

    # output layer
    with tf.name_scope("output"):
        W = tf.Variable(
            tf.truncated_normal([args.rnn_size, args.num_classes], stddev=0.1),
            name="W")
        b = tf.Variable(tf.constant(0.1, shape=[args.num_classes]), name="b")
        self.logits = tf.nn.xw_plus_b(self.h_drop, W, b)
        self.probs = tf.nn.softmax(self.logits)
        self.predictions = tf.cast(tf.argmax(self.logits, 1), tf.int32)

    # accuracy
    with tf.name_scope("accuracy"):
        # calculate token-level accuracy
        self.reshaped_targets = tf.reshape(self.targets, [-1])
        correct_predictions = tf.equal(self.predictions, self.reshaped_targets)
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))

        # calculate sentence-level accuracy: a sentence counts as correct
        # only when all seq_length of its tokens are correct.
        # batch_size * seq_length
        self.predictions_sentence = tf.reshape(
            self.predictions, [-1, args.seq_length])
        # batch_size X seq_length
        correct_predictions_sentence_tokens = tf.equal(
            self.predictions_sentence, self.targets)
        # Multiplying by a column of ones sums correct tokens per sentence.
        multiply_mat = tf.constant(1, shape=[args.seq_length, 1])
        # batch_size X 1
        sentence_accuracy_mat = tf.matmul(
            tf.cast(correct_predictions_sentence_tokens, tf.int32),
            multiply_mat)
        # batch_size X 1
        correct_predictions_sentence = tf.equal(
            sentence_accuracy_mat,
            tf.constant(args.seq_length, shape=[args.batch_size, 1]))
        self.accuracy_sentence = tf.reduce_mean(
            tf.cast(correct_predictions_sentence, "float"))

    # calculate loss
    with tf.name_scope("loss"):
        self.loss = seq2seq.sequence_loss_by_example(
            [self.logits],  # TODO: should I use a list of 2D tensors ?
            [self.reshaped_targets],  # TODO: correct ???
            [tf.ones([args.batch_size * args.seq_length])],
            args.num_classes)
        self.cost = tf.reduce_sum(self.loss) / args.batch_size / args.seq_length

    # train and update
    with tf.name_scope("update"):
        tvars = tf.trainable_variables()
        self.grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost, tvars), args.grad_clip)  # TODO: correct ???
        optimizer = tf.train.AdamOptimizer(args.learning_rate)
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        self.train_op = optimizer.apply_gradients(
            zip(self.grads, tvars), global_step=self.global_step)

    # l2 norm clipping: one assign op per trainable variable whose name
    # starts with 'output/W'.
    self.weight_clipping_op = []
    trainable_vars = tf.trainable_variables()
    for var in trainable_vars:
        if var.name.startswith('output/W'):
            updated_var = tf.clip_by_norm(var, args.l2_limit)
            self.weight_clipping_op.append(tf.assign(var, updated_var))
def __init__(self, args, infer=False):
    """Build an RNN whose output parameterises a 1-D Gaussian mixture.

    Args:
        args: configuration object; ``model``, ``rnn_size``, ``num_layers``,
            ``keep_prob``, ``batch_size``, ``seq_length``, ``num_mixture``
            and ``grad_clip`` are read from it.
        infer: when true, ``args.batch_size`` and ``args.seq_length`` are
            overwritten with 1; output dropout is only added when
            ``infer == False`` and ``args.keep_prob < 1``.

    Raises:
        Exception: if ``args.model`` names an unsupported cell type.
    """
    self.dim = 1
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    if args.model == 'rnn':
        cell_cls = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_cls = rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_cls = rnn_cell.BasicLSTMCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    cell = rnn_cell.MultiRNNCell([cell_cls(args.rnn_size)] * args.num_layers)

    # Training mode only: dropout on the output of the stacked cell.
    use_dropout = (infer == False and args.keep_prob < 1)
    if use_dropout:
        cell = rnn_cell.DropoutWrapper(cell, output_keep_prob=args.keep_prob)
    self.cell = cell

    data_shape = [None, args.seq_length, self.dim]
    self.input_data = tf.placeholder(dtype=tf.float32, shape=data_shape)
    self.target_data = tf.placeholder(dtype=tf.float32, shape=data_shape)
    self.initial_state = cell.zero_state(
        batch_size=args.batch_size, dtype=tf.float32)

    self.num_mixture = args.num_mixture
    # Per mixture component: one weight plus (mu, sig) for each dimension.
    NOUT = self.num_mixture * (1 + 2 * self.dim)

    with tf.variable_scope('rnnlm'):
        output_w = tf.get_variable("output_w", [args.rnn_size, NOUT])
        output_b = tf.get_variable("output_b", [NOUT])
    self.w = output_w

    steps = tf.split(1, args.seq_length, self.input_data)
    inputs = [tf.squeeze(step, [1]) for step in steps]

    outputs, states = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell, loop_function=None, scope='rnnlm')
    rnn_out = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    output = tf.nn.xw_plus_b(rnn_out, output_w, output_b)
    self.final_state = states

    # Flatten targets so their rows line up with the rows of `output`.
    x_data = tf.reshape(self.target_data, [-1, self.dim])

    def tf_normal(x, mu, sig):
        # Gaussian density of x under N(mu, sig^2).
        exponent = -tf.square(x - mu) / (2 * tf.square(sig))
        return tf.exp(exponent) / (sig * tf.sqrt(2 * np.pi))

    def get_lossfunc(z_pi, z_mu, z_sig, x_data):
        # Summed negative log of the mixture likelihood, floored at 1e-20.
        densities = tf_normal(x_data, z_mu, z_sig)
        mixture = tf.reduce_sum(densities * z_pi, 1, keep_dims=True)
        neg_log = -tf.log(tf.maximum(mixture, 1e-20))
        return tf.reduce_sum(neg_log)

    # Softmax over the mixture weights, shifted by the row max before exp.
    self.pi = output[:, 0:self.num_mixture]
    max_pi = tf.reduce_max(self.pi, 1, keep_dims=True)
    self.pi = tf.exp(tf.sub(self.pi, max_pi))
    normalize_pi = tf.inv(tf.reduce_sum(self.pi, 1, keep_dims=True))
    self.pi = normalize_pi * self.pi

    output_each_dim = tf.split(1, self.dim, output[:, self.num_mixture:])

    self.mu = []
    self.sig = []
    self.cost = 0
    normaliser = args.batch_size * args.seq_length * self.dim
    for d in range(self.dim):
        [o_mu, o_sig] = tf.split(1, 2, output_each_dim[d])
        # Exponentiate so that sigma is positive.
        o_sig = tf.exp(o_sig)
        self.mu.append(o_mu)
        self.sig.append(o_sig)
        lossfunc = get_lossfunc(self.pi, o_mu, o_sig, x_data[:, d:d + 1])
        self.cost += lossfunc / normaliser

    self.mu = tf.concat(1, self.mu)
    self.sig = tf.concat(1, self.sig)

    self.loss_summary = tf.scalar_summary("loss", self.cost)
    self.summary = tf.merge_all_summaries()

    # Learning rate is a non-trainable variable initialised to 0.0.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, args, infer=False):
    """Build the mixture-density RNN graph, its loss and its training op.

    Every time step consumes a 3-vector.  Per step the network emits one
    end-of-stroke value plus the parameters of ``args.num_mixture``
    bivariate Gaussians (eq. 18-26 of http://arxiv.org/abs/1308.0850).

    Args:
        args: Hyper-parameter object.  Reads ``model``, ``rnn_size``,
            ``num_layers``, ``keep_prob``, ``batch_size``, ``seq_length``,
            ``num_mixture`` and ``grad_clip``.  When ``infer`` is true,
            ``args.batch_size`` and ``args.seq_length`` are overwritten
            with 1 on the caller's object.
        infer: If true, build a single-step graph and skip the dropout
            wrapper.

    Raises:
        Exception: If ``args.model`` is not 'rnn', 'gru' or 'lstm'.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    if args.model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    cell = cell_fn(args.rnn_size)
    cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)

    if (infer == False and args.keep_prob < 1):  # training mode
        cell = rnn_cell.DropoutWrapper(cell, output_keep_prob=args.keep_prob)

    self.cell = cell

    self.input_data = tf.placeholder(dtype=tf.float32, shape=[None, args.seq_length, 3])
    self.target_data = tf.placeholder(dtype=tf.float32, shape=[None, args.seq_length, 3])
    self.initial_state = cell.zero_state(batch_size=args.batch_size, dtype=tf.float32)

    self.num_mixture = args.num_mixture
    NOUT = 1 + self.num_mixture * 6  # end_of_stroke + prob + 2*(mu + sig) + corr

    with tf.variable_scope('rnnlm'):
        output_w = tf.get_variable("output_w", [args.rnn_size, NOUT])
        output_b = tf.get_variable("output_b", [NOUT])

    # rnn_decoder wants a list with one [batch, 3] tensor per time step.
    inputs = tf.split(1, args.seq_length, self.input_data)
    inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell, loop_function=None, scope='rnnlm')
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    output = tf.nn.xw_plus_b(output, output_w, output_b)
    self.final_state = last_state

    # Reshape target data so that it is compatible with the prediction shape.
    flat_target_data = tf.reshape(self.target_data, [-1, 3])
    [x1_data, x2_data, eos_data] = tf.split(1, 3, flat_target_data)

    def tf_2d_normal(x1, x2, mu1, mu2, s1, s2, rho):
        """Bivariate normal density, eq. 24 and 25 of arXiv:1308.0850."""
        norm1 = tf.sub(x1, mu1)
        norm2 = tf.sub(x2, mu2)
        s1s2 = tf.mul(s1, s2)
        z = (tf.square(tf.div(norm1, s1)) + tf.square(tf.div(norm2, s2))
             - 2 * tf.div(tf.mul(rho, tf.mul(norm1, norm2)), s1s2))
        negRho = 1 - tf.square(rho)
        result = tf.exp(tf.div(-z, 2 * negRho))
        denom = 2 * np.pi * tf.mul(s1s2, tf.sqrt(negRho))
        result = tf.div(result, denom)
        return result

    def get_lossfunc(z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr, z_eos,
                     x1_data, x2_data, eos_data):
        """Summed negative log-likelihood, eq. 26 of arXiv:1308.0850."""
        # Floor applied before every log: at the beginning of training some
        # likelihoods are exactly zero, and a saturated sigmoid can make the
        # end-of-stroke probability exactly zero as well.
        epsilon = 1e-20
        result0 = tf_2d_normal(x1_data, x2_data, z_mu1, z_mu2,
                               z_sigma1, z_sigma2, z_corr)
        result1 = tf.mul(result0, z_pi)
        result1 = tf.reduce_sum(result1, 1, keep_dims=True)
        result1 = -tf.log(tf.maximum(result1, epsilon))

        result2 = tf.mul(z_eos, eos_data) + tf.mul(1 - z_eos, 1 - eos_data)
        result2 = -tf.log(tf.maximum(result2, epsilon))

        result = result1 + result2
        return tf.reduce_sum(result)

    def get_mixture_coef(output):
        """Slice the raw output into MDN parameters, eq. 18-23 of arXiv:1308.0850."""
        z = output
        z_eos = z[:, 0:1]
        z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr = tf.split(1, 6, z[:, 1:])

        # End of stroke signal.
        z_eos = tf.sigmoid(z_eos)  # should be negated, but doesn't matter.

        # Softmax over the mixture weights; the row max is subtracted first
        # so that exp() cannot overflow.
        max_pi = tf.reduce_max(z_pi, 1, keep_dims=True)
        z_pi = tf.sub(z_pi, max_pi)
        z_pi = tf.exp(z_pi)
        normalize_pi = tf.inv(tf.reduce_sum(z_pi, 1, keep_dims=True))
        z_pi = tf.mul(normalize_pi, z_pi)

        # Exponentiate the sigmas and squash corr into (-1, 1).
        z_sigma1 = tf.exp(z_sigma1)
        z_sigma2 = tf.exp(z_sigma2)
        z_corr = tf.tanh(z_corr)

        return [z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr, z_eos]

    [o_pi, o_mu1, o_mu2, o_sigma1, o_sigma2, o_corr, o_eos] = get_mixture_coef(output)

    self.pi = o_pi
    self.mu1 = o_mu1
    self.mu2 = o_mu2
    self.sigma1 = o_sigma1
    self.sigma2 = o_sigma2
    self.corr = o_corr
    self.eos = o_eos

    lossfunc = get_lossfunc(o_pi, o_mu1, o_mu2, o_sigma1, o_sigma2, o_corr, o_eos,
                            x1_data, x2_data, eos_data)
    self.cost = lossfunc / (args.batch_size * args.seq_length)

    # The learning rate starts at 0.0; nothing in this method assigns it.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, args):
    """Build an RNN that maps a time-major input sequence to one value per step.

    Args:
        args: Hyper-parameter object.  Reads ``rnn_size``, ``n_steps``,
            ``batch_size``, ``input_dim``, ``num_layers``, ``cell_type``
            ('srnn', 'lstm', 'lstmp' or 'cw') and ``rnn_type``
            ('rnn' or 'seq2seq').

    Raises:
        ValueError: If ``args.cell_type`` or ``args.rnn_type`` is not one
            of the supported values.
    """
    self.size = args.rnn_size
    self.n_steps = args.n_steps
    self.batch_size = args.batch_size
    self.input_dim = args.input_dim
    self.num_layers = args.num_layers

    initializer = tf.random_uniform_initializer(-0.8, 0.8)

    # Sequence provided at runtime, laid out as [n_steps, batch_size, input_dim].
    self.seq_input = tf.placeholder(
        tf.float32, [self.n_steps, self.batch_size, self.input_dim])
    # Time step at which to stop; used as sequence_length by rnn.rnn below.
    self.early_stop = tf.placeholder(tf.int32)

    # The RNN helpers need a list with one item per time step.  tf.split
    # keeps the split dimension, hence the reshape to [batch_size, input_dim].
    self.inputs = [
        tf.reshape(i, (self.batch_size, self.input_dim))
        for i in tf.split(0, self.n_steps, self.seq_input)
    ]

    self.result = tf.placeholder(tf.float32, [None, self.input_dim])

    if args.cell_type == "srnn":
        cell = BasicRNNCell(self.size)
    elif args.cell_type == "lstm":
        cell = BasicLSTMCell(self.size, forget_bias=1.0)
    elif args.cell_type == "lstmp":
        cell = LSTMCell(self.size, self.input_dim, initializer=initializer)
    elif args.cell_type == "cw":
        cell = CWRNNCell(self.size, [1, 4, 16, 64])
    else:
        # Previously this fell through and failed later with a NameError.
        raise ValueError("cell type not supported: {}".format(args.cell_type))

    self.cell = cell = rnn_cell.MultiRNNCell([cell] * self.num_layers)

    # Random (not zero) initial state.
    self.initial_state = tf.random_uniform(
        [self.batch_size, self.cell.state_size], -0.1, 0.1)

    # Network type.
    if args.rnn_type == "rnn":
        self.outputs, self.states = rnn.rnn(
            self.cell,
            self.inputs,
            initial_state=self.initial_state,
            sequence_length=self.early_stop)
    elif args.rnn_type == "seq2seq":
        # No feedback loop: every step reads its real input.
        self.outputs, self.states = seq2seq.rnn_decoder(
            self.inputs, self.initial_state, self.cell, loop_function=None)
    else:
        # Previously self.outputs / self.states were silently left unset.
        raise ValueError("rnn type not supported: {}".format(args.rnn_type))

    self.final_state = self.states[-1]

    # Linear read-out of width 1.
    self.W_o = tf.Variable(tf.random_normal([self.size, 1], stddev=0.01))
    self.b_o = tf.Variable(tf.random_normal([1], stddev=0.01))

    # One formatted string prints the same text under Python 2 and 3.
    print("type(outputs) {}".format(type(self.outputs)))

    self.output_cat = tf.reshape(tf.concat(1, self.outputs), [-1, self.size])
    self.output = tf.nn.xw_plus_b(self.output_cat, self.W_o, self.b_o)

    # NOTE(review): the read-out has width 1, so this reshape presumably only
    # works when input_dim == 1 - confirm against the callers.
    self.output2 = tf.reshape(
        self.output, [self.batch_size, self.n_steps, self.input_dim])
    # Gaussian noise (stddev 0.05) is added to the predictions.
    self.output2 = self.output2 + tf.random_normal(
        [self.batch_size, self.n_steps, self.input_dim], stddev=0.05)
    # Back to the time-major layout of seq_input.
    self.output2 = tf.transpose(self.output2, [1, 0, 2])
# Set Network with tf.variable_scope('rnnlm'): softmax_w = tf.get_variable("softmax_w", [rnn_size, vocab_size]) softmax_b = tf.get_variable("softmax_b", [vocab_size]) with tf.device("/cpu:0"): embedding = tf.get_variable("embedding", [vocab_size, rnn_size]) inputs = tf.split(1, seq_length, tf.nn.embedding_lookup(embedding, input_data)) inputs = [tf.squeeze(input_, [1]) for input_ in inputs] # Loop function for seq2seq def loop(prev, _): prev = tf.nn.xw_plus_b(prev, softmax_w, softmax_b) prev_symbol = tf.stop_gradient(tf.argmax(prev, 1)) return tf.nn.embedding_lookup(embedding, prev_symbol) # Output of RNN outputs, last_state = seq2seq.rnn_decoder(inputs, initial_state, cell, loop_function=None, scope='rnnlm') output = tf.reshape(tf.concat(1, outputs), [-1, rnn_size]) logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b) # Next word probability probs = tf.nn.softmax(logits) # Define LOSS loss = seq2seq.sequence_loss_by_example([logits], # Input [tf.reshape(targets, [-1])], # Target [tf.ones([batch_size * seq_length])], # Weight vocab_size) # Define Optimizer cost = tf.reduce_sum(loss) / batch_size / seq_length final_state = last_state lr = tf.Variable(0.0, trainable=False) tvars = tf.trainable_variables() grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), grad_clip)
def __init__(self, args, infer=False):
    """Build a language model with an optional bidirectional RNN.

    Args:
        args: Hyper-parameter object.  Reads ``model`` ('rnn', 'gru' or
            'lstm'), ``activation`` ('tanh', 'sigmoid' or 'relu'),
            ``batch_size``, ``seq_length``, ``rnn_size``, ``num_layers``,
            ``vocab_size``, ``bidirectional``, ``dropout`` (passed to
            ``tf.nn.dropout`` as the keep probability) and ``grad_clip``.
            When ``infer`` is true, ``args.batch_size`` and
            ``args.seq_length`` are overwritten with 1 on the caller's
            object.
        infer: If true, build a single-step graph; in the one-directional
            case the decoder is also given the arg-max feedback loop.

    Raises:
        Exception: If ``args.model`` or ``args.activation`` is unsupported.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    if args.model == 'rnn':
        cell_fn = jzRNNCell
    elif args.model == 'gru':
        cell_fn = jzGRUCell
    elif args.model == 'lstm':
        cell_fn = jzLSTMCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    if args.activation == 'tanh':
        cell_af = tf.tanh
    elif args.activation == 'sigmoid':
        cell_af = tf.sigmoid
    elif args.activation == 'relu':
        cell_af = tf.nn.relu
    else:
        raise Exception("activation function not supported: {}".format(args.activation))

    self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])

    with tf.variable_scope('rnnlm'):
        # The bidirectional output concatenates both directions, so the
        # projection needs twice as many input rows.
        if not args.bidirectional:
            softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
        else:
            softmax_w = tf.get_variable("softmax_w", [args.rnn_size * 2, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
            inputs = tf.split(1, args.seq_length,
                              tf.nn.embedding_lookup(embedding, self.input_data))
            # NOTE(review): dropout is applied regardless of ``infer``;
            # callers presumably pass dropout=1.0 when sampling - confirm.
            inputs = [tf.nn.dropout(tf.squeeze(input_, [1]), args.dropout)
                      for input_ in inputs]

    if not args.bidirectional:
        # One-directional RNN.
        cell = cell_fn(args.rnn_size, activation=cell_af)
        self.cell = cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)
        self.initial_state = cell.zero_state(args.batch_size, tf.float32)

        def loop(prev, _):
            # Feed the embedding of the most likely symbol back as next input.
            prev = tf.matmul(prev, softmax_w) + softmax_b
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(embedding, prev_symbol)

        outputs, last_state = seq2seq.rnn_decoder(
            inputs, self.initial_state, cell,
            loop_function=loop if infer else None, scope='rnnlm')
        output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    else:
        # Bi-directional RNN.
        lstm_fw = cell_fn(args.rnn_size, activation=cell_af)
        lstm_bw = cell_fn(args.rnn_size, activation=cell_af)
        self.lstm_fw = lstm_fw = rnn_cell.MultiRNNCell([lstm_fw] * args.num_layers)
        self.lstm_bw = lstm_bw = rnn_cell.MultiRNNCell([lstm_bw] * args.num_layers)
        self.initial_state_fw = lstm_fw.zero_state(args.batch_size, tf.float32)
        self.initial_state_bw = lstm_bw.zero_state(args.batch_size, tf.float32)
        # sequence_length is deliberately omitted: it must be a vector of
        # per-example lengths (the batch size was passed before), and every
        # sequence here spans all args.seq_length steps anyway.
        outputs, state_fw, state_bw = rnn.bidirectional_rnn(
            lstm_fw, lstm_bw, inputs,
            initial_state_fw=self.initial_state_fw,
            initial_state_bw=self.initial_state_bw)
        # Mirror the initial_state_fw / initial_state_bw attributes.
        self.final_state_fw = state_fw
        self.final_state_bw = state_bw
        # ``last_state`` used to be undefined on this branch, which made
        # the final_state assignment below raise a NameError.
        last_state = (state_fw, state_bw)
        output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size * 2])

    self.logits = tf.matmul(tf.nn.dropout(output, args.dropout), softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])],
        args.vocab_size)
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    # A (forward, backward) pair in bidirectional mode.
    self.final_state = last_state

    # The learning rate starts at 0.0; nothing in this method assigns it.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, args, predict=False):
    """Build a two-layer LSTM sequence model with loss and training step.

    Args:
        args: Settings object; only ``args.vocabSize`` is read here.
        predict: If true, build a batch-size-1, single-step graph and give
            the decoder the ``feedBack`` loop function.

    Most hyper-parameters are hard-coded below for now.
    """
    self.args = args

    # Various parameters for the LSTM.  Hardcoded here for now.
    numSteps = 50  # Steps to unroll for
    batchSize = 50
    rnnSize = 128
    numLayers = 2
    gradClip = 5

    # The override must come AFTER the defaults.  It used to come first and
    # was immediately overwritten, so predict mode still built a 50x50 graph.
    if predict:
        batchSize = 1
        numSteps = 1

    # Create LSTM layer and stack multiple layers.
    lstmCell = rnn_cell.BasicLSTMCell(rnnSize)
    lstmNet = rnn_cell.MultiRNNCell([lstmCell] * numLayers)

    # Define placeholders.
    self.inputData = tf.placeholder(tf.int32, [batchSize, numSteps])
    self.targetOutput = tf.placeholder(tf.int32, [batchSize, numSteps])
    self.initialState = lstmNet.zero_state(batchSize, tf.float32)

    # If rnn_decoder is told to loop, this function will return to it the
    # output at time 't' for feeding as the input at time 't+1'.  During
    # training, this is generally not done because we want to feed the
    # *correct* input at all times and not what is output.  During
    # prediction/testing, we loop the output back to the input to generate
    # our sequence of notes.
    def feedBack(prev, _):
        # softmax_w / softmax_b / embedding are bound further down; they
        # exist by the time rnn_decoder calls this function.
        prev = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    with tf.variable_scope('nn_lstm'):
        softmax_w = tf.get_variable("softmax_w", [rnnSize, args.vocabSize])
        softmax_b = tf.get_variable("softmax_b", [args.vocabSize])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [args.vocabSize, rnnSize])
            inputs = tf.split(1, numSteps,
                              tf.nn.embedding_lookup(embedding, self.inputData))
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    # Call seq2seq rnn decoder.
    outputs, states = seq2seq.rnn_decoder(
        inputs, self.initialState, lstmNet,
        loop_function=feedBack if predict else None, scope='nn_lstm')
    output = tf.reshape(tf.concat(1, outputs), [-1, rnnSize])

    # Logit and probability.
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    # Calculate loss compared to targetOutput.
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targetOutput, [-1])],
        [tf.ones([batchSize * numSteps])],
        args.vocabSize)

    # Set the cost to minimize total loss (a sum, not a mean).
    self.cost = tf.reduce_sum(loss)

    self.finalState = states[-1]
    # Non-trainable learning-rate variable, initialised to 0.0; nothing in
    # this method assigns it.  The removed unused locals held 0.002
    # (learning rate) and 0.97 (decay rate).
    self.learningRate = tf.Variable(0.0, trainable=False)

    # Define gradient and trainable variables for adjusting during
    # training/optimization.
    trainableVars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, trainableVars), gradClip)

    # We use the Adam optimizer.
    optimizer = tf.train.AdamOptimizer(self.learningRate)
    self.trainStep = optimizer.apply_gradients(zip(grads, trainableVars))
def build_model(self, inputs, infer):
    """Build the encoder/decoder graph and return its scalar cost.

    No optimizer or train op is created here; only the cost is returned.

    Args:
        inputs: 4-tuple ``(x_in, lx_in, y_in, my_in)``.  ``x_in`` is
            embedded and fed to the encoder, ``lx_in`` is passed to the
            encoder as ``sequence_length``, ``y_in`` supplies the decoder
            inputs and the loss targets, and ``my_in`` is flattened and
            used as the per-position loss weights.
        infer: If false, the decoder input is ``y_in`` shifted right by
            one step behind a column of zeros.  Otherwise ``y_in`` is
            embedded directly and treated as a single decoder step.

    Returns:
        The summed sequence loss divided by ``self.batch_size``.

    Raises:
        Exception: If ``self.rnn_type`` is not 'rnn', 'gru' or 'lstm'.
    """
    x_in, lx_in, y_in, my_in = inputs

    if self.rnn_type == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif self.rnn_type == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif self.rnn_type == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    else:
        # Was the undefined bare name ``rnn_type``, which raised a
        # NameError instead of this message.
        raise Exception('rnn type not supported: {}'.format(self.rnn_type))

    cell_enc = cell_fn(self.num_units)
    cell_dec = cell_fn(self.num_units)

    # A single embedding table is shared by encoder and decoder.
    embedding = tf.get_variable('embedding', [self.vocab_size, self.dim_emb])

    # Encoding: enc_in becomes seq_len * [batch_size, embedding_size].
    enc_in = tf.nn.embedding_lookup(embedding, x_in)
    enc_in = tf.split(1, self.seq_len, enc_in)
    enc_in = [tf.squeeze(input_, [1]) for input_ in enc_in]
    print("enc_in size:", len(enc_in))
    print("enc_in[0] shape:", enc_in[0].get_shape())

    _, initial_state = rnn.rnn(cell_enc, enc_in, sequence_length=lx_in,
                               dtype='float32', scope='encoder')
    # Multiplying by 1.0 only serves to give the tensor a stable name.
    self.initial_state = tf.mul(1.0, initial_state, name='initial_state')

    # Decoding.
    if infer == False:
        # Shift the targets right by one step behind a column of zeros.
        dec_in = tf.nn.embedding_lookup(
            embedding,
            tf.concat(1, [
                tf.zeros([self.batch_size, 1], dtype='int32'),
                y_in[:, :self.seq_len - 1]
            ]))
        dec_in = tf.split(1, self.seq_len, dec_in)
        dec_in = [tf.squeeze(input_, [1]) for input_ in dec_in]
    else:
        dec_in = tf.nn.embedding_lookup(embedding, y_in)
        dec_in = tf.split(1, 1, dec_in)
        dec_in = [tf.squeeze(input_, [1]) for input_ in dec_in]

    # dec_in is a list of [batch_size, embedding_size] tensors.
    print("dec_in size:", len(dec_in))
    print("dec_in[0] shape:", dec_in[0].get_shape())

    # output is a list of [batch_size, num_units] tensors.
    output, last_state = seq2seq.rnn_decoder(
        dec_in, self.initial_state, cell_dec, scope='decoder')
    print("output[0] shape:", output[0].get_shape())
    print("last_state shape:", last_state.get_shape())

    # Flatten to rows of width num_units.
    output = tf.reshape(tf.concat(1, output), [-1, self.num_units])
    self.last_state = tf.mul(1.0, last_state, name='last_state')

    # Loss.
    softmax_w = tf.get_variable('softmax_w', [self.num_units, self.vocab_size])
    softmax_b = tf.get_variable('softmax_b', [self.vocab_size])
    logits = tf.matmul(output, softmax_w) + softmax_b
    noname_probs = tf.nn.softmax(logits)
    self.probs = tf.mul(1.0, noname_probs, name='probs')
    self.log_probs = tf.log(self.probs, name='log_probs')

    loss = tf.nn.seq2seq.sequence_loss_by_example(
        [logits], [tf.reshape(y_in, [-1])], [tf.reshape(my_in, [-1])])
    # Formatted string instead of the Python-2-only print statement; the
    # printed text is unchanged.
    print("loss shape: {}".format(loss.get_shape()))

    self.cost = cost = tf.reduce_sum(loss) / tf.to_float(self.batch_size)
    self.loss = cost

    return cost
def create_model(self):
    """Build the LSTM language-model graph, its loss and its training op.

    Reads ``batch_size``, ``seq_length``, ``rnn_size``, ``vocab_size``,
    ``num_layers``, ``is_training``, ``infer`` and ``grad_clip`` from
    ``self``.  Sets ``input_data``, ``target_data``, ``keep_prob``, ``lr``,
    ``initial_state``, ``embedding``, ``logits``, ``probs``, ``cost``,
    ``final_state`` and ``train_op`` on ``self``.
    """
    self.input_data = tf.placeholder(
        tf.int32, [self.batch_size, self.seq_length], name="input_data")
    self.target_data = tf.placeholder(
        tf.int32, [self.batch_size, self.seq_length], name="target_data")

    # Hyper-parameters.  The Python float is kept next to the variable so
    # the dropout decision below can be made at graph-construction time.
    # Comparing the tf.Variable itself yields a tensor, which is not a
    # usable Python truth value.
    keep_prob_value = 0.3
    self.keep_prob = tf.Variable(keep_prob_value, trainable=False, name="keep_prob")
    self.lr = tf.Variable(0.0, trainable=False, name="lr")

    softmax_weights = tf.get_variable(
        "softmax_weights", [self.rnn_size, self.vocab_size])
    softmax_biases = tf.get_variable("softmax_biases", [self.vocab_size])

    lstm_cell = rnn_cell.BasicLSTMCell(self.rnn_size)
    multilayer_cell = rnn_cell.MultiRNNCell([lstm_cell] * self.num_layers)
    self.initial_state = multilayer_cell.zero_state(self.batch_size, tf.float32)

    with tf.device("/cpu:0"):
        # Embedding matrix for the whole vocabulary.
        self.embedding = tf.get_variable(
            "embeddings", [self.vocab_size, self.rnn_size])
        # Vector representation of each input word.
        embeds = tf.nn.embedding_lookup(self.embedding, self.input_data)

    if self.is_training and keep_prob_value < 1:
        embeds = tf.nn.dropout(embeds, self.keep_prob)

    def loop(prev, _):
        # Feed the embedding of the most likely word back as the next input.
        prev = tf.nn.xw_plus_b(prev, softmax_weights, softmax_biases)
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(self.embedding, prev_symbol)

    # Convert the input into a list of seq_length tensors.  After the split
    # each piece is (batch_size, 1, rnn_size); squeeze it to
    # (batch_size, rnn_size).
    inputs = tf.split(1, self.seq_length, embeds)
    inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    output, states = seq2seq.rnn_decoder(
        inputs,
        self.initial_state,
        multilayer_cell,
        loop_function=loop if self.infer else None,
        scope="rnnlm",
    )

    output = tf.reshape(tf.concat(1, output), [-1, self.rnn_size])
    self.logits = tf.nn.xw_plus_b(output, softmax_weights, softmax_biases)
    self.probs = tf.nn.softmax(self.logits, name="probability")

    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.target_data, [-1])],
        [tf.ones([self.batch_size * self.seq_length])],
        self.vocab_size,
    )
    self.cost = tf.reduce_sum(loss) / (self.batch_size * self.seq_length)
    self.final_state = states[-1]

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, tvars), self.grad_clip)
    # NOTE(review): the optimizer uses a fixed 0.01 and ignores self.lr.
    # Left unchanged because self.lr starts at 0.0 and callers may rely on
    # the fixed rate.
    optimizer = tf.train.AdamOptimizer(0.01)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, args, infer=False):
    """Assemble a per-dimension Gaussian-mixture RNN with loss and train op.

    The network emits ``args.num_mixture`` mixture weights, followed by a
    mean and a log-standard-deviation per mixture component for each of
    the ``self.dim`` (fixed to 1) data dimensions.

    Args:
        args: Hyper-parameter object.  Reads ``model``, ``rnn_size``,
            ``num_layers``, ``keep_prob``, ``batch_size``, ``seq_length``,
            ``num_mixture`` and ``grad_clip``.  When ``infer`` is true,
            ``args.batch_size`` and ``args.seq_length`` are overwritten
            with 1 on the caller's object.
        infer: If true, build a single-step graph and skip the dropout
            wrapper.

    Raises:
        Exception: If ``args.model`` is not 'rnn', 'gru' or 'lstm'.
    """
    self.dim = 1
    self.args = args

    if infer:
        args.batch_size = 1
        args.seq_length = 1

    # Resolve the cell constructor from the model name.
    supported_cells = (
        ('rnn', rnn_cell.BasicRNNCell),
        ('gru', rnn_cell.GRUCell),
        ('lstm', rnn_cell.BasicLSTMCell),
    )
    for model_name, candidate in supported_cells:
        if args.model == model_name:
            cell_fn = candidate
            break
    else:
        raise Exception("model type not supported: {}".format(args.model))

    stacked = rnn_cell.MultiRNNCell([cell_fn(args.rnn_size)] * args.num_layers)

    training = (infer == False)
    if training and args.keep_prob < 1:
        stacked = rnn_cell.DropoutWrapper(stacked, output_keep_prob=args.keep_prob)
    self.cell = stacked

    step_shape = [None, args.seq_length, self.dim]
    self.input_data = tf.placeholder(dtype=tf.float32, shape=step_shape)
    self.target_data = tf.placeholder(dtype=tf.float32, shape=step_shape)
    self.initial_state = stacked.zero_state(
        batch_size=args.batch_size, dtype=tf.float32)

    self.num_mixture = args.num_mixture
    # Output layout: [mixture weights, dim1 mu, dim1 sig, dim2 mu, ...].
    n_out = self.num_mixture * (1 + 2 * self.dim)

    with tf.variable_scope('rnnlm'):
        output_w = tf.get_variable("output_w", [args.rnn_size, n_out])
        output_b = tf.get_variable("output_b", [n_out])
    self.w = output_w

    # One tensor per time step for rnn_decoder.
    step_inputs = [
        tf.squeeze(piece, [1])
        for piece in tf.split(1, args.seq_length, self.input_data)
    ]
    outputs, states = seq2seq.rnn_decoder(
        step_inputs, self.initial_state, stacked,
        loop_function=None, scope='rnnlm')
    flat_outputs = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    params = tf.nn.xw_plus_b(flat_outputs, output_w, output_b)
    self.final_state = states

    # Flatten the targets so they line up row-for-row with ``params``.
    x_data = tf.reshape(self.target_data, [-1, self.dim])

    def gaussian_pdf(x, mu, sig):
        """Univariate normal density evaluated element-wise."""
        exponent = -tf.square(x - mu) / (2 * tf.square(sig))
        return tf.exp(exponent) / (sig * tf.sqrt(2 * np.pi))

    def mixture_nll(z_pi, z_mu, z_sig, x):
        """Summed negative log-likelihood of ``x`` under the mixture."""
        likelihood = tf.reduce_sum(
            gaussian_pdf(x, z_mu, z_sig) * z_pi, 1, keep_dims=True)
        # Floor the likelihood before taking the log.
        return tf.reduce_sum(-tf.log(tf.maximum(likelihood, 1e-20)))

    # Softmax over the mixture weights; the row max is subtracted first.
    pi_logits = params[:, 0:self.num_mixture]
    shifted = tf.sub(pi_logits, tf.reduce_max(pi_logits, 1, keep_dims=True))
    unnormalized = tf.exp(shifted)
    self.pi = tf.inv(tf.reduce_sum(unnormalized, 1, keep_dims=True)) * unnormalized

    per_dim = tf.split(1, self.dim, params[:, self.num_mixture:])
    norm_factor = args.batch_size * args.seq_length * self.dim

    mus = []
    sigmas = []
    total_cost = 0
    for i, dim_params in enumerate(per_dim):
        mu_i, log_sigma_i = tf.split(1, 2, dim_params)
        sigma_i = tf.exp(log_sigma_i)
        mus.append(mu_i)
        sigmas.append(sigma_i)
        nll_i = mixture_nll(self.pi, mu_i, sigma_i, x_data[:, i:i + 1])
        total_cost = total_cost + nll_i / norm_factor

    self.cost = total_cost
    self.mu = tf.concat(1, mus)
    self.sig = tf.concat(1, sigmas)

    self.loss_summary = tf.scalar_summary("loss", self.cost)
    self.summary = tf.merge_all_summaries()

    # The learning rate starts at 0.0; nothing in this method assigns it.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, tvars)
    clipped, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
    self.train_op = tf.train.AdamOptimizer(self.lr).apply_gradients(
        zip(clipped, tvars))
def __init__(self, args, infer=False):
    """Build an RNN sequence classifier with loss, accuracy and train op.

    The per-step RNN outputs are averaged over time and projected to
    ``args.vocab_size`` logits, giving one prediction per sequence.

    Args:
        args: Hyper-parameter object.  Reads ``model`` ('rnn', 'gru' or
            'lstm'), ``rnn_size``, ``num_layers``, ``batch_size``,
            ``seq_length``, ``vocab_size`` and ``grad_clip``.
        infer: Accepted for interface compatibility and currently unused.
            It used to select a stub loop function that returned ``None``,
            which fed ``None`` into the cell from the second step on.
            TODO: no feedback loop is defined for inference yet.

    Raises:
        Exception: If ``args.model`` is not 'rnn', 'gru' or 'lstm'.
    """
    self.args = args

    if args.model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    cell = cell_fn(args.rnn_size)
    self.cell = cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)

    self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    # One label per sequence (not per time step).
    self.targets = tf.placeholder(tf.int32, [args.batch_size])
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
            inputs = tf.split(1, args.seq_length,
                              tf.nn.embedding_lookup(embedding, self.input_data))
            # len(inputs) == args.seq_length,
            # shape(inputs[0]) == (args.batch_size, args.rnn_size)
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    # Every step reads its real input; see the ``infer`` note above.
    # len(outputs) == args.seq_length,
    # shape(outputs[0]) == (args.batch_size, args.rnn_size)
    outputs, states = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell, loop_function=None, scope='rnnlm')

    def handle_outputs(use_lastone=True):
        """Reduce the per-step outputs to a single [batch_size, rnn_size] tensor.

        Returns the last step's output when ``use_lastone`` is true,
        otherwise the mean over all steps.
        """
        if use_lastone:
            return outputs[-1]
        output = tf.add_n(outputs)
        output = tf.div(output, len(outputs))
        return output

    output = handle_outputs(use_lastone=False)

    # shape(logits) == (batch_size, vocab_size)
    self.logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
    self.probs = tf.nn.softmax(self.logits)
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size])],
        args.vocab_size)
    self.cost = tf.reduce_sum(loss) / args.batch_size
    _ = tf.scalar_summary('cost', self.cost)

    # Evaluate accuracy: fraction of sequences whose arg-max logit matches
    # the target.
    correct_pred = tf.equal(tf.cast(tf.argmax(self.logits, 1), tf.int32),
                            tf.reshape(self.targets, [-1]))
    self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    _ = tf.scalar_summary('accuracy', self.accuracy)

    self.final_state = states
    # The learning rate starts at 0.0; nothing in this method assigns it.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def main():
    """Train an RNN language model and save its word embeddings.

    Parses the command-line options, builds the graph, trains for
    ``--num_epochs`` epochs with Adagrad (learning rate 0.1) and writes the
    L2-normalised embedding matrix via ``np.save('rnnlm_word_embeddings', ...)``.

    Raises:
        Exception: If ``--model`` is not 'rnn', 'gru' or 'lstm'.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default='../data/xinhua',
                        help='data directory containing input.txt')
    parser.add_argument('--batch_size', type=int, default=120,
                        help='minibatch size')
    parser.add_argument('--seq_length', type=int, default=5,
                        help='RNN sequence length')
    # hidden_num is the size passed to the cell constructor and the input
    # width of the softmax layer, not a layer count.
    parser.add_argument('--hidden_num', type=int, default=256,
                        help='number of hidden units in the RNN cell')
    parser.add_argument('--word_dim', type=int, default=256,
                        help='dimensionality of the word embeddings')
    parser.add_argument('--num_epochs', type=int, default=50,
                        help='number of epochs')
    parser.add_argument('--model', type=str, default='lstm',
                        help='rnn, gru, or lstm')
    parser.add_argument('--grad_clip', type=float, default=10.,
                        help='clip gradients at this value')
    args = parser.parse_args()  # collection of parameters

    # Prepare the training data.
    data_loader = TextLoader2(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    # Model definition.
    graph = tf.Graph()
    with graph.as_default():
        if args.model == 'rnn':
            cell_fn = rnn_cell.BasicRNNCell
        elif args.model == 'gru':
            cell_fn = rnn_cell.GRUCell
        elif args.model == 'lstm':
            cell_fn = rnn_cell.BasicLSTMCell
        else:
            raise Exception("model type not supported: {}".format(args.model))

        cell = cell_fn(args.hidden_num)

        # Input variables.
        input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
        targets = tf.placeholder(tf.int64, [args.batch_size, args.seq_length])
        initial_state = cell.zero_state(args.batch_size, tf.float32)

        # Model parameters.
        with tf.variable_scope('rnnlm' + 'embedding'):
            embeddings = tf.Variable(
                tf.random_uniform([args.vocab_size, args.word_dim], -1.0, 1.0))
            # From here on ``embeddings`` is the row-normalised tensor.
            embeddings = tf.nn.l2_normalize(embeddings, 1)

        with tf.variable_scope('rnnlm' + 'weight'):
            softmax_w = tf.get_variable("softmax_w", [args.hidden_num, args.vocab_size])
            softmax_b = tf.get_variable("softmax_b", [args.vocab_size])

        inputs = tf.split(1, args.seq_length,
                          tf.nn.embedding_lookup(embeddings, input_data))
        inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

        outputs, last_state = seq2seq.rnn_decoder(inputs, initial_state, cell)
        output = tf.reshape(tf.concat(1, outputs), [-1, args.hidden_num])
        logits = tf.matmul(output, softmax_w) + softmax_b
        loss_rnn = seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(targets, [-1])],
            [tf.ones([args.batch_size * args.seq_length])],
            args.vocab_size)
        cost = tf.reduce_sum(loss_rnn) / args.batch_size / args.seq_length

        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), args.grad_clip)
        optimizer = tf.train.AdagradOptimizer(0.1)
        train_op = optimizer.apply_gradients(zip(grads, tvars))

        # Word vectors to export.
        embeddings_norm = tf.sqrt(
            tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / embeddings_norm

    # Model training.
    with tf.Session(graph=graph) as sess:
        tf.initialize_all_variables().run()
        for e in range(args.num_epochs):
            data_loader.reset_batch_pointer()
            for b in range(data_loader.num_batches):
                start = time.time()
                x, y = data_loader.next_batch()
                feed = {input_data: x, targets: y}
                train_loss, _ = sess.run([cost, train_op], feed)
                end = time.time()
                print(
                    "{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                    .format(b, data_loader.num_batches, e, train_loss, end - start))
        # eval() needs the default session, so this stays inside the block.
        np.save('rnnlm_word_embeddings', normalized_embeddings.eval())
def __init__(
    self,
    vocab,
    tagset,
    alphabet,
    word_embedding_size,
    char_embedding_size,
    num_chars,
    num_steps,
    optimizer_desc,
    generate_lemmas,
    l2,
    dropout_prob_values,
    experiment_name,
    supply_form_characters_to_lemma,
    threads=0,
    seed=None,
    write_summaries=True,
    use_attention=True,
    scheduled_sampling=None,
):
    """
    Builds the tagger computation graph and initializes it in a TensorFlow session.

    NOTE(review): the layout of this method was reconstructed from collapsed
    source; the nesting of statements should be confirmed against the
    original file.

    Arguments:
        vocab: Vocabulary of word forms.
        tagset: Vocabulary of possible tags.
        alphabet: Vocabulary of possible characters.
        word_embedding_size (int): Size of the form-based word embedding.
        char_embedding_size (int): Size of character embeddings, i.e. a half
            of the size of the character-based words embeddings.
        num_chars: Maximum length of a word.
        num_steps: Maximum length of a sentence.
        optimizer_desc: Description of the optimizer. The string is appended
            to "tf.train." and evaluated with eval(); it may refer to the
            local helper `decay`.
        generate_lemmas: Generate lemmas during tagging (adds the character
            decoder and its losses to the graph).
        l2: Weight of the L2 penalty over the collected weight matrices.
        dropout_prob_values: Stored on the instance; not used further in
            this constructor.
        experiment_name: Suffix of the summary log directory name.
        supply_form_characters_to_lemma: If true, the embedding of the form
            character is concatenated to every lemma decoder input.
        threads: If greater than zero, used for both the inter-op and the
            intra-op parallelism of the session.
        seed: TensorFlow seed (not referenced in this constructor).
        write_summaries: Write summaries using TensorFlow interface.
        use_attention: Use seq2seq.attention_decoder instead of
            seq2seq.rnn_decoder for the lemma decoder.
        scheduled_sampling: If truthy, the training decoder mixes
            ground-truth and predicted inputs (see `sampling_loop`).
    """
    self.num_steps = num_steps
    self.num_chars = num_chars
    self.word_embedding_size = word_embedding_size
    self.char_embedding_size = char_embedding_size
    # Width of the per-word input: the word embedding concatenated with the
    # forward and backward character-LSTM states.
    self.lstm_size = word_embedding_size + 2 * char_embedding_size
    self.vocab = vocab
    self.tagset = tagset
    self.alphabet = alphabet
    self.dropout_prob_values = dropout_prob_values

    # Initial states of the sentence-level LSTMs are fed from outside.
    self.forward_initial_state = tf.placeholder(
        tf.float32, [None, rnn_cell.BasicLSTMCell(self.lstm_size).state_size], name="forward_lstm_initial_state"
    )
    self.backward_initial_state = tf.placeholder(
        tf.float32, [None, rnn_cell.BasicLSTMCell(self.lstm_size).state_size], name="backward_lstm_initial_state"
    )
    self.sentence_lengths = tf.placeholder(tf.int64, [None], name="sentence_lengths")
    self.tags = tf.placeholder(tf.int32, [None, num_steps], name="ground_truth_tags")
    # Element 0 is passed to tf.nn.dropout on the inputs, element 1 on the
    # bidirectional outputs.
    self.dropout_prob = tf.placeholder(tf.float32, [None], name="dropout_keep_p")
    self.generate_lemmas = generate_lemmas
    global_step = tf.Variable(0, trainable=False)
    input_list = []
    # Weight matrices that receive the L2 penalty.
    regularize = []

    # Word-level embeddings
    if word_embedding_size:
        self.words = tf.placeholder(tf.int32, [None, num_steps], name="words")
        word_embeddings = tf.Variable(tf.random_uniform([len(vocab), word_embedding_size], -1.0, 1.0))
        we_lookup = tf.nn.embedding_lookup(word_embeddings, self.words)
        input_list.append(we_lookup)

    # Character-level embeddings
    if char_embedding_size:
        self.chars = tf.placeholder(tf.int32, [None, num_steps, num_chars], name="chars")
        self.chars_lengths = tf.placeholder(tf.int64, [None, num_steps], name="chars_lengths")
        char_embeddings = tf.Variable(tf.random_uniform([len(alphabet), char_embedding_size], -1.0, 1.0))
        ce_lookup = tf.nn.embedding_lookup(char_embeddings, self.chars)
        # Flatten sentences and words into one axis so every word is one
        # sequence of num_chars character embeddings.
        reshaped_ce_lookup = tf.reshape(ce_lookup, [-1, num_chars, char_embedding_size], name="reshape-char_inputs")
        char_inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(1, num_chars, reshaped_ce_lookup)]
        char_inputs_lengths = tf.reshape(self.chars_lengths, [-1])

        with tf.variable_scope("char_forward"):
            char_lstm = rnn_cell.BasicLSTMCell(char_embedding_size)
            _, char_last_state = rnn.rnn(
                cell=char_lstm, inputs=char_inputs, sequence_length=char_inputs_lengths, dtype=tf.float32
            )
            # Switch the scope to reuse mode so the already created LSTM
            # weight matrix can be fetched by name.
            tf.get_variable_scope().reuse_variables()
            regularize.append(tf.get_variable("RNN/BasicLSTMCell/Linear/Matrix"))

        with tf.variable_scope("char_backward"):
            char_lstm_rev = rnn_cell.BasicLSTMCell(char_embedding_size)
            _, char_last_state_rev = rnn.rnn(
                cell=char_lstm_rev,
                inputs=self._reverse_seq(char_inputs, char_inputs_lengths),
                sequence_length=char_inputs_lengths,
                dtype=tf.float32,
            )
            tf.get_variable_scope().reuse_variables()
            regularize.append(tf.get_variable("RNN/BasicLSTMCell/Linear/Matrix"))

        # Keep the second half of each final state; presumably this is the
        # hidden output part of the concatenated LSTM state -- TODO confirm.
        last_char_lstm_state = tf.split(1, 2, char_last_state)[1]
        last_char_lstm_state_rev = tf.split(1, 2, char_last_state_rev)[1]

        last_char_states = tf.reshape(
            last_char_lstm_state, [-1, num_steps, char_embedding_size], name="reshape-charstates"
        )
        last_char_states_rev = tf.reshape(
            last_char_lstm_state_rev, [-1, num_steps, char_embedding_size], name="reshape-charstates_rev"
        )

        char_output = tf.concat(2, [last_char_states, last_char_states_rev])
        input_list.append(char_output)

    # All inputs correctly sliced
    input_list_dropped = [tf.nn.dropout(x, self.dropout_prob[0]) for x in input_list]
    # One tensor per sentence position.
    inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(1, num_steps, tf.concat(2, input_list_dropped))]

    with tf.variable_scope("forward"):
        lstm = rnn_cell.BasicLSTMCell(self.lstm_size)
        outputs, last_state = rnn.rnn(
            cell=lstm,
            inputs=inputs,
            dtype=tf.float32,
            initial_state=self.forward_initial_state,
            sequence_length=self.sentence_lengths,
        )
        tf.get_variable_scope().reuse_variables()
        regularize.append(tf.get_variable("RNN/BasicLSTMCell/Linear/Matrix"))

    with tf.variable_scope("backward"):
        lstm_rev = rnn_cell.BasicLSTMCell(self.lstm_size)
        outputs_rev_rev, last_state_rev = rnn.rnn(
            cell=lstm_rev,
            inputs=self._reverse_seq(inputs, self.sentence_lengths),
            dtype=tf.float32,
            initial_state=self.backward_initial_state,
            sequence_length=self.sentence_lengths,
        )
        outputs_rev = self._reverse_seq(outputs_rev_rev, self.sentence_lengths)
        tf.get_variable_scope().reuse_variables()
        regularize.append(tf.get_variable("RNN/BasicLSTMCell/Linear/Matrix"))

    # Disabled alternative that combined both directions through a tanh layer:
    # outputs_forward = tf.reshape(tf.concat(1, outputs), [-1, self.lstm_size],
    #                              name="reshape-outputs_forward")
    # outputs_backward = tf.reshape(tf.concat(1, outputs_rev), [-1, self.lstm_size],
    #                               name="reshape-outputs_backward")
    # forward_w = tf.get_variable("forward_w", [self.lstm_size, self.lstm_size])
    # backward_w = tf.get_variable("backward_w", [self.lstm_size, self.lstm_size])
    # non_linearity_bias = tf.get_variable("non_linearity_b", [self.lstm_size])

    # NOTE(review): outputs_rev has already been passed through _reverse_seq
    # above, and the list is flipped once more here with reversed() --
    # confirm that this pairing of time steps is intended.
    outputs_bidi = [tf.concat(1, [o1, o2]) for o1, o2 in zip(outputs, reversed(outputs_rev))]

    # output = tf.tanh(tf.matmul(outputs_forward, forward_w) + tf.matmul(outputs_backward, backward_w) + non_linearity_bias)
    output = tf.reshape(tf.concat(1, outputs_bidi), [-1, 2 * self.lstm_size], name="reshape-outputs_bidi")
    output_dropped = tf.nn.dropout(output, self.dropout_prob[1])

    # We are computing only the logits, not the actual softmax -- while
    # computing the loss, it is done by the sequence_loss_by_example and
    # during the runtime classification, the argmax over logits is enough.
    softmax_w = tf.get_variable("softmax_w", [2 * self.lstm_size, len(tagset)])
    logits_flatten = tf.nn.xw_plus_b(output_dropped, softmax_w, tf.get_variable("softmax_b", [len(tagset)]))
    # tf.get_variable_scope().reuse_variables()
    regularize.append(softmax_w)

    self.logits = tf.reshape(logits_flatten, [-1, num_steps, len(tagset)], name="reshape-logits")
    estimated_tags_flat = tf.to_int32(tf.argmax(logits_flatten, dimension=1))
    self.last_state = last_state

    # Output mask: compute the loss only if it isn't a padded word (i.e. zero index).
    output_mask = tf.reshape(tf.to_float(tf.not_equal(self.tags, 0)), [-1])
    gt_tags_flat = tf.reshape(self.tags, [-1])
    tagging_loss = seq2seq.sequence_loss_by_example(
        logits=[logits_flatten], targets=[gt_tags_flat], weights=[output_mask]
    )

    # Accuracy over the non-masked positions only.
    tagging_accuracy = tf.reduce_sum(
        tf.to_float(tf.equal(estimated_tags_flat, gt_tags_flat)) * output_mask
    ) / tf.reduce_sum(output_mask)
    # Each value is registered twice so that the "train" and "dev"
    # collections can be merged and written separately.
    tf.scalar_summary("train_accuracy", tagging_accuracy, collections=["train"])
    tf.scalar_summary("dev_accuracy", tagging_accuracy, collections=["dev"])

    self.cost = tf.reduce_mean(tagging_loss)
    tf.scalar_summary("train_tagging_loss", tf.reduce_mean(tagging_loss), collections=["train"])
    tf.scalar_summary("dev_tagging_loss", tf.reduce_mean(tagging_loss), collections=["dev"])

    if generate_lemmas:
        with tf.variable_scope("decoder"):
            self.lemma_chars = tf.placeholder(tf.int32, [None, num_steps, num_chars + 2], name="lemma_chars")
            lemma_state_size = self.lstm_size
            lemma_w = tf.Variable(tf.random_uniform([lemma_state_size, len(alphabet)], 0.5), name="state_to_char_w")
            lemma_b = tf.Variable(tf.fill([len(alphabet)], -math.log(len(alphabet))), name="state_to_char_b")
            # When form characters are supplied too, two embeddings are
            # concatenated per step, so each one gets half of the width.
            # NOTE(review): `/` produces a float under Python 3 -- confirm
            # the interpreter version or that an integer size results.
            lemma_char_embeddings = tf.Variable(
                tf.random_uniform(
                    [len(alphabet), lemma_state_size / (2 if supply_form_characters_to_lemma else 1)], -0.5, 0.5
                ),
                name="char_embeddings",
            )
            # num_chars + 2 index tensors, one per lemma character position.
            lemma_char_inputs = [
                tf.squeeze(input_, [1])
                for input_ in tf.split(
                    1,
                    num_chars + 2,
                    tf.reshape(self.lemma_chars, [-1, num_chars + 2], name="reshape-lemma_char_inputs"),
                )
            ]

            if supply_form_characters_to_lemma:
                # NOTE(review): self.chars only exists when
                # char_embedding_size is non-zero -- confirm callers.
                char_inputs_zeros = [
                    tf.squeeze(chars, [1])
                    for chars in tf.split(
                        1, num_chars, tf.reshape(self.chars, [-1, num_chars], name="reshape-char_inputs_zeros")
                    )
                ]
                # Append one all-zero step so the list has num_chars + 1
                # entries, the same length as lemma_char_inputs[:-1].
                char_inputs_zeros.append(char_inputs_zeros[0] * 0)

                def loop(prev_state, i):
                    # it takes the previous hidden state, finds the character and formats it
                    # as input for the next time step ... used in the decoder in the "real decoding scenario"
                    out_activation = tf.matmul(prev_state, lemma_w) + lemma_b
                    prev_char_index = tf.argmax(out_activation, 1)
                    return tf.concat(
                        1,
                        [
                            tf.nn.embedding_lookup(lemma_char_embeddings, prev_char_index),
                            tf.nn.embedding_lookup(lemma_char_embeddings, char_inputs_zeros[i]),
                        ],
                    )

                # Ground-truth decoder inputs: lemma character embedding
                # concatenated with the form character embedding.
                embedded_lemma_characters = []
                for lemma_chars, form_chars in zip(lemma_char_inputs[:-1], char_inputs_zeros):
                    embedded_lemma_characters.append(
                        tf.concat(
                            1,
                            [
                                tf.nn.embedding_lookup(lemma_char_embeddings, lemma_chars),
                                tf.nn.embedding_lookup(lemma_char_embeddings, form_chars),
                            ],
                        )
                    )
            else:

                def loop(prev_state, _):
                    # it takes the previous hidden state, finds the character and formats it
                    # as input for the next time step ... used in the decoder in the "real decoding scenario"
                    out_activation = tf.matmul(prev_state, lemma_w) + lemma_b
                    prev_char_index = tf.argmax(out_activation, 1)
                    return tf.nn.embedding_lookup(lemma_char_embeddings, prev_char_index)

                embedded_lemma_characters = []
                for lemma_chars in lemma_char_inputs[:-1]:
                    embedded_lemma_characters.append(tf.nn.embedding_lookup(lemma_char_embeddings, lemma_chars))

            def sampling_loop(prev_state, i):
                # Element-wise choice between the ground-truth input and the
                # input produced by `loop`; the threshold for picking the
                # ground truth shrinks as global_step grows.
                threshold = scheduled_sampling / (scheduled_sampling + tf.exp(tf.to_float(global_step)))
                condition = tf.less_equal(tf.random_uniform(tf.shape(embedded_lemma_characters[0])), threshold)
                return tf.select(condition, embedded_lemma_characters[i], loop(prev_state, i))

            decoder_cell = rnn_cell.BasicLSTMCell(lemma_state_size)

            # Loop function of the training decoder: None means the
            # ground-truth inputs are always used.
            if scheduled_sampling:
                lf = sampling_loop
            else:
                lf = None

            # NOTE(review): the attention branch reads reshaped_ce_lookup,
            # which is only defined when char_embedding_size is non-zero.
            if use_attention:
                lemma_outputs_train, _ = seq2seq.attention_decoder(
                    embedded_lemma_characters, output_dropped, reshaped_ce_lookup, decoder_cell, loop_function=lf
                )
            else:
                lemma_outputs_train, _ = seq2seq.rnn_decoder(
                    embedded_lemma_characters, output_dropped, decoder_cell, loop_function=lf
                )

            tf.get_variable_scope().reuse_variables()
            # regularize.append(tf.get_variable('attention_decoder/BasicLSTMCell/Linear/Matrix'))

            # The runtime decoder below is built in the same scope in reuse
            # mode, so it shares the variables of the training decoder.
            tf.get_variable_scope().reuse_variables()

            if use_attention:
                lemma_outputs_runtime, _ = seq2seq.attention_decoder(
                    embedded_lemma_characters, output_dropped, reshaped_ce_lookup, decoder_cell, loop_function=loop
                )
            else:
                lemma_outputs_runtime, _ = seq2seq.rnn_decoder(
                    embedded_lemma_characters, output_dropped, decoder_cell, loop_function=loop
                )

            lemma_char_logits_train = [tf.matmul(o, lemma_w) + lemma_b for o in lemma_outputs_train]
            lemma_char_logits_runtime = [tf.matmul(o, lemma_w) + lemma_b for o in lemma_outputs_runtime]

            # Arg-max character per step, arranged as
            # [-1, num_steps, num_chars + 1].
            self.lemmas_decoded = tf.reshape(
                tf.transpose(tf.argmax(tf.pack(lemma_char_logits_runtime), 2)), [-1, num_steps, num_chars + 1]
            )

            # Targets are the inputs shifted by one; zero indices get zero weight.
            lemma_char_weights = []
            for lemma_chars in lemma_char_inputs[1:]:
                lemma_char_weights.append(tf.to_float(tf.not_equal(lemma_chars, 0)))

            lemmatizer_loss = seq2seq.sequence_loss(
                lemma_char_logits_train, lemma_char_inputs[1:], lemma_char_weights
            )
            lemmatizer_loss_runtime = seq2seq.sequence_loss(
                lemma_char_logits_runtime, lemma_char_inputs[1:], lemma_char_weights
            )

            tf.scalar_summary(
                "train_lemma_loss_with_gt_inputs", tf.reduce_mean(lemmatizer_loss), collections=["train"]
            )
            tf.scalar_summary("dev_lemma_loss_with_gt_inputs", tf.reduce_mean(lemmatizer_loss), collections=["dev"])

            tf.scalar_summary(
                "train_lemma_loss_with_decoded_inputs",
                tf.reduce_mean(lemmatizer_loss_runtime),
                collections=["train"],
            )
            tf.scalar_summary(
                "dev_lemma_loss_with_decoded_inputs", tf.reduce_mean(lemmatizer_loss_runtime), collections=["dev"]
            )

            # Both decoder losses are optimized together with the tagging loss.
            self.cost += tf.reduce_mean(lemmatizer_loss) + tf.reduce_mean(lemmatizer_loss_runtime)

    # L2 penalty over the collected weight matrices.
    self.cost += l2 * sum([tf.nn.l2_loss(variable) for variable in regularize])

    tf.scalar_summary("train_optimization_cost", self.cost, collections=["train"])
    tf.scalar_summary("dev_optimization_cost", self.cost, collections=["dev"])

    # Not called directly in this method; it is in scope for the eval()
    # below, so an optimizer description may call it.
    def decay(learning_rate, exponent, iteration_steps):
        return tf.train.exponential_decay(learning_rate, global_step, iteration_steps, exponent, staircase=True)

    # NOTE(review): eval() executes optimizer_desc as Python code -- only
    # pass trusted strings here.
    optimizer = eval("tf.train." + optimizer_desc)
    self.train = optimizer.minimize(self.cost, global_step=global_step)

    if threads > 0:
        self.session = tf.Session(
            config=tf.ConfigProto(inter_op_parallelism_threads=threads, intra_op_parallelism_threads=threads)
        )
    else:
        self.session = tf.Session()
    self.session.run(tf.initialize_all_variables())

    if write_summaries:
        self.summary_train = tf.merge_summary(tf.get_collection("train"))
        self.summary_dev = tf.merge_summary(tf.get_collection("dev"))
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
        self.summary_writer = tf.train.SummaryWriter("logs/" + timestamp + "_" + experiment_name)

    self.steps = 0
def create(x, targets, batch_size=-1):
    """Chain the configured layers, add an LSTM on top, and collect the results.

    The layer definitions come from the module-level ``layers`` list; the
    module-level ``layer_index`` counter is advanced as the chain is walked.

    Args:
        x: input tensor given to the first layer; also the reference for
            the reconstruction part of the cost.
        targets: tensor compared against the reconstruction in the cost.
        batch_size: batch size used for the LSTM zero state.

    Returns:
        dict with the keys ``"cost"``, ``'predict'`` and ``'decoded'``.
    """
    builders = {
        'conv1d': conv1d,
        'conv1d_transpose': conv1d_transpose,
        'feed_forward_nn': feed_forward_nn,
        'autoencoder': autoencoder,
        'reshape': reshape,
        'lstm': lstm,
    }

    def _descend(current_layer):
        # Callback given to every layer builder: builds the following layer,
        # or hands the tensor back unchanged once the last layer is reached.
        global layer_index
        if layer_index + 1 == len(layers):
            return current_layer
        layer_index += 1
        spec = layers[layer_index]
        build = builders[spec['type']]
        return build(current_layer, spec, _descend)

    def _rmse(left, right):
        # Root of the mean squared difference of two tensors.
        return tf.sqrt(tf.reduce_mean(tf.square(left - right)))

    first_spec = layers[0]
    decoded = builders[first_spec['type']](x, first_spec, _descend)
    reconstructed_x = tf.reshape(decoded, [-1, SIZE, DEPTH])
    print("Completed reshaping")

    ## hack build lstm
    # (a softmax / sequence-loss head used to be sketched in this section;
    # it is disabled and only the raw LSTM output is used)
    size = SIZE
    lstm_cell = rnn_cell.BasicLSTMCell(size)
    zero_state = lstm_cell.zero_state(batch_size, tf.float32)
    step_outputs, _ = seq2seq.rnn_decoder([decoded], zero_state, lstm_cell)
    extra_outputs = tf.concat(1, step_outputs)
    print("shape of extra", extra_outputs)
    output = tf.reshape(extra_outputs, [-1, size])
    print("shape of output", output.get_shape())
    num_decoder_symbols = 10  # unused; left over from the disabled loss head
    ## end hack

    # Weighted mix: 10% error against the targets, 90% against the input.
    target_term = _rmse(targets, reconstructed_x) * 0.1
    input_term = _rmse(x, reconstructed_x) * 0.9
    cost = target_term + input_term

    return {
        "cost": cost,
        'predict': output,
        'decoded': tf.reshape(decoded, [-1]),
    }
def __init__(self, args, infer=False):
    """Build the RNN language-model graph described by ``args``.

    Args:
        args: settings object; reads ``model``, ``rnn_size``,
            ``num_layers``, ``batch_size``, ``seq_length``, ``vocab_size``
            and ``grad_clip``.  When ``infer`` is true, ``batch_size`` and
            ``seq_length`` are overwritten with 1 on this object.
        infer: if true, the decoder feeds the embedding of its own arg-max
            prediction into the next step instead of the given inputs.

    Raises:
        Exception: if ``args.model`` is not 'rnn', 'gru' or 'lstm'.
    """
    self.args = args
    if infer:
        # Sampling mode works on a single symbol at a time.
        args.batch_size = 1
        args.seq_length = 1

    cell_types = {
        'rnn': rnn_cell.BasicRNNCell,
        'gru': rnn_cell.GRUCell,
        'lstm': rnn_cell.BasicLSTMCell,
    }
    if args.model not in cell_types:
        raise Exception("model type not supported: {}".format(args.model))

    single_cell = cell_types[args.model](args.rnn_size)
    stacked_cell = rnn_cell.MultiRNNCell([single_cell] * args.num_layers)
    self.cell = stacked_cell

    # Placeholders for the symbol ids fed in and the ids to predict.
    self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])

    # Zero-filled starting state of the stacked cell.
    self.initial_state = stacked_cell.zero_state(args.batch_size, tf.float32)

    # Variables live under the "rnnlm" scope, e.g. "rnnlm/softmax_w".
    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            # Dense representation of every vocabulary entry.
            embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            per_step = tf.split(1, args.seq_length, embedded)
            inputs = [tf.squeeze(step, [1]) for step in per_step]

    def feed_previous(prev, _):
        # Project the previous output to vocabulary scores and embed the
        # best-scoring symbol; no gradient flows through the arg-max.
        scores = tf.matmul(prev, softmax_w) + softmax_b
        best_symbol = tf.stop_gradient(tf.argmax(scores, 1))
        return tf.nn.embedding_lookup(embedding, best_symbol)

    # Recurrent network over the time steps.
    outputs, last_state = seq2seq.rnn_decoder(
        inputs,
        self.initial_state,
        stacked_cell,
        loop_function=feed_previous if infer else None,
        scope='rnnlm')
    flat_output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])

    # Output projection and its softmax.
    self.logits = tf.matmul(flat_output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    # Per-position loss with uniform weights.
    flat_targets = tf.reshape(self.targets, [-1])
    uniform_weights = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [uniform_weights], args.vocab_size)

    # Average loss per position.
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    # Learning rate is a non-trainable variable that starts at 0.0.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, tvars)
    clipped_grads = tf.clip_by_global_norm(raw_grads, args.grad_clip)[0]
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(clipped_grads, tvars))