def testSequenceLossByExample(self):
    """Check `sequence_loss_by_example` with and without timestep averaging.

    Every logits tensor is filled with a single constant value.  The expected
    losses are ln(5) ~= 1.609438 per timestep (averaged case) and
    3 * ln(5) ~= 4.828314 (summed case) for each of the two examples.
    """
    num_steps = 3
    batch_size = 2
    output_classes = 5
    with self.test_session() as sess:
        logits = []
        for step in xrange(num_steps):
            logits.append(
                tf.constant(step + 0.5, shape=[batch_size, output_classes]))
        targets = []
        for step in xrange(num_steps):
            targets.append(tf.constant(step, tf.int32, shape=[batch_size]))
        weights = []
        for _ in xrange(num_steps):
            weights.append(tf.constant(1.0, shape=[batch_size]))

        # Loss averaged across the timesteps.
        averaged = seq2seq.sequence_loss_by_example(
            logits, targets, weights, output_classes,
            average_across_timesteps=True)
        self.assertAllClose(sess.run(averaged),
                            np.asarray([1.609438, 1.609438]))

        # Loss summed across the timesteps.
        summed = seq2seq.sequence_loss_by_example(
            logits, targets, weights, output_classes,
            average_across_timesteps=False)
        self.assertAllClose(sess.run(summed),
                            np.asarray([4.828314, 4.828314]))
def create_model(self):
    """Build the RNN language-model graph and store its tensors on ``self``.

    Reads ``batch_size``, ``seq_length``, ``rnn_size``, ``vocab_size``,
    ``num_layers``, ``grad_clip``, ``is_training`` and ``infer`` from ``self``.

    Sets ``input_data``, ``target_data``, ``keep_prob``, ``lr``,
    ``initial_state``, ``embedding``, ``logits``, ``probs``, ``cost``,
    ``final_state`` and ``train_op``.
    """
    token_shape = [self.batch_size, self.seq_length]
    self.input_data = tf.placeholder(tf.int32, token_shape, name="input_data")
    self.target_data = tf.placeholder(tf.int32, token_shape,
                                      name="target_data")

    # Hyper-parameters.  The keep probability is also held as a plain Python
    # float: comparing the tf.Variable itself produces a Tensor, which must
    # not be used as a Python condition.
    keep_prob_value = 0.3
    self.keep_prob = tf.Variable(keep_prob_value, trainable=False,
                                 name='keep_prob')
    self.lr = tf.Variable(0.0, trainable=False, name="lr")

    softmax_weights = tf.get_variable("softmax_weights",
                                      [self.rnn_size, self.vocab_size])
    softmax_biases = tf.get_variable("softmax_biases", [self.vocab_size])

    # No DropoutWrapper is applied to the cell; dropout is applied to the
    # embedded inputs only (see below).
    lstm_cell = rnn_cell.BasicLSTMCell(self.rnn_size)
    multilayer_cell = rnn_cell.MultiRNNCell([lstm_cell] * self.num_layers)
    self.initial_state = multilayer_cell.zero_state(self.batch_size,
                                                    tf.float32)

    with tf.device("/cpu:0"):
        # Embedding matrix for the whole vocabulary.
        self.embedding = tf.get_variable("embeddings",
                                         [self.vocab_size, self.rnn_size])
        # Vector representation of every input token.
        embeds = tf.nn.embedding_lookup(self.embedding, self.input_data)

    if self.is_training and keep_prob_value < 1:
        embeds = tf.nn.dropout(embeds, self.keep_prob)

    def loop(prev, _):
        """Embed the arg-max symbol of the previous output as next input."""
        prev = tf.nn.xw_plus_b(prev, softmax_weights, softmax_biases)
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(self.embedding, prev_symbol)

    # Convert the input into a list of seq_length tensors.  After splitting,
    # each piece has shape (batch_size, 1, rnn_size); squeezing axis 1 turns
    # it into (batch_size, rnn_size).
    inputs = tf.split(1, self.seq_length, embeds)
    inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    output, states = seq2seq.rnn_decoder(
        inputs, self.initial_state, multilayer_cell,
        loop_function=loop if self.infer else None, scope='rnnlm')
    output = tf.reshape(tf.concat(1, output), [-1, self.rnn_size])

    self.logits = tf.nn.xw_plus_b(output, softmax_weights, softmax_biases)
    self.probs = tf.nn.softmax(self.logits, name="probability")

    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.target_data, [-1])],
        [tf.ones([self.batch_size * self.seq_length])],
        self.vocab_size)
    self.cost = tf.reduce_sum(loss) / (self.batch_size * self.seq_length)
    self.final_state = states[-1]

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      self.grad_clip)
    # NOTE(review): the learning rate is the literal 0.01; self.lr is created
    # above but is not passed to the optimizer -- confirm this is intended.
    optimizer = tf.train.AdamOptimizer(0.01)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, vocabularySize, config_param):
    """Build a multi-layer basic-RNN language model graph.

    Args:
        vocabularySize: number of rows of the embedding matrix and width of
            the output projection.
        config_param: object providing ``batch_size``, ``sequence_size``,
            ``embeddingSize``, ``hidden_size`` and ``num_layers``.
    """
    self.vocabularySize = vocabularySize
    self.config = config_param
    cfg = self.config
    token_shape = [cfg.batch_size, cfg.sequence_size]

    self._inputX = tf.placeholder(tf.int32, token_shape, "InputsX")
    self._inputTargetsY = tf.placeholder(tf.int32, token_shape,
                                         "InputTargetsY")

    # Convert the input ids into their embedded form.  The embedding
    # variable and its lookup are pinned to the CPU device.
    with tf.device("/cpu:0"):
        embedding = tf.get_variable(
            "embedding", [self.vocabularySize, cfg.embeddingSize])
        embeddingLookedUp = tf.nn.embedding_lookup(embedding, self._inputX)

    # One tensor per timestep, with the length-1 time axis squeezed away.
    per_step = tf.split(1, cfg.sequence_size, embeddingLookedUp)
    inputTensorsAsList = [tf.squeeze(step, [1]) for step in per_step]

    # Define the RNN.
    singleRNNCell = rnn_cell.BasicRNNCell(cfg.hidden_size)
    self.multilayerRNN = rnn_cell.MultiRNNCell(
        [singleRNNCell] * cfg.num_layers)
    self._initial_state = self.multilayerRNN.zero_state(cfg.batch_size,
                                                        tf.float32)

    # Define the logits.
    hidden_layer_output, states = rnn.rnn(
        self.multilayerRNN, inputTensorsAsList,
        initial_state=self._initial_state)
    flat_hidden = tf.reshape(tf.concat(1, hidden_layer_output),
                             [-1, cfg.hidden_size])
    softmax_w = tf.get_variable("softmax_w",
                                [cfg.hidden_size, self.vocabularySize])
    softmax_b = tf.get_variable("softmax_b", [self.vocabularySize])
    self._logits = tf.nn.xw_plus_b(flat_hidden, softmax_w, softmax_b)
    self._predictionSoftmax = tf.nn.softmax(self._logits)

    # Define the loss: summed over all positions, divided by the batch size.
    flat_targets = tf.reshape(self._inputTargetsY, [-1])
    unit_weights = tf.ones([cfg.batch_size * cfg.sequence_size])
    loss = seq2seq.sequence_loss_by_example(
        [self._logits], [flat_targets], [unit_weights], self.vocabularySize)
    self._cost = tf.div(tf.reduce_sum(loss), cfg.batch_size)
    self._final_state = states[-1]
def __init__(self, vocab_size, batch_size, sequece_length, embedding_size,
             num_classes):
    """Build an LSTM sequence-labelling graph with a weighted loss.

    Args:
        vocab_size: number of rows of the embedding matrix.
        batch_size: accepted but not read by this constructor; the batch
            dimension of every placeholder is left as ``None``.
        sequece_length: number of timesteps in every input row.
        embedding_size: width of each embedding vector.
        num_classes: number of output classes per timestep.

    Sets ``input_data``, ``weights``, ``output_data``, ``inputs``,
    ``output``, ``states``, ``h_drop``, ``scores``, ``loss``,
    ``predictions``, ``kk`` and ``accuracy``.
    """
    self.hyperParam = {
        "hidden_num": 20,
        "l2_lamda": 3,
        "dropout_keep_prob": 0.5,
    }
    hidden_num = self.hyperParam["hidden_num"]
    l2_loss = tf.constant(0.0)
    self.dropout_keep_prob = 0.5

    rnnCell = rnn_cell.BasicLSTMCell(hidden_num, forget_bias=1.0)

    self.input_data = tf.placeholder(
        tf.int32, shape=[None, sequece_length], name="input_data")
    self.weights = tf.placeholder(
        tf.int32, shape=[None, sequece_length], name="weights")
    self.output_data = tf.placeholder(
        tf.int32, [None, sequece_length], name="output_data")
    # Dynamic number of rows in the fed batch; used to normalise the loss.
    num_rows = tf.shape(self.output_data)[0]

    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding",
                                    [vocab_size, embedding_size])
        embedded = tf.nn.embedding_lookup(embedding, self.input_data)

    # One tensor per timestep, with the length-1 time axis squeezed away.
    self.inputs = [tf.squeeze(step, [1])
                   for step in tf.split(1, sequece_length, embedded)]
    self.output, self.states = rnn.rnn(rnnCell, self.inputs,
                                       dtype=tf.float32)

    # Add dropout.  NOTE(review): the keep probability is the fixed 0.5 from
    # hyperParam, so dropout is part of the graph unconditionally.
    with tf.name_scope("dropout"):
        keep = self.hyperParam["dropout_keep_prob"]
        self.h_drop = [tf.nn.dropout(step_out, keep)
                       for step_out in self.output]

    with tf.name_scope("result"):
        W = tf.Variable(
            tf.truncated_normal([hidden_num, num_classes], stddev=0.1),
            name="W")
        b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
        l2_loss += tf.nn.l2_loss(W)
        l2_loss += tf.nn.l2_loss(b)
        flat_hidden = tf.reshape(tf.concat(1, self.h_drop),
                                 [-1, hidden_num])
        logits = tf.matmul(flat_hidden, W) + b
        self.scores = logits

    with tf.name_scope("loss"):
        flat_targets = tf.reshape(self.output_data, [-1])
        flat_weights = tf.reshape(tf.cast(self.weights, "float32"), [-1])
        loss = seq2seq.sequence_loss_by_example(
            [self.scores], [flat_targets], [flat_weights], num_classes)
        # Weighted loss per batch row plus the L2 penalty on W and b.
        data_term = tf.reduce_sum(loss) / tf.cast(num_rows, "float32")
        penalty = self.hyperParam["l2_lamda"] * l2_loss
        self.loss = data_term + penalty

    with tf.name_scope("accurcy"):
        per_step_scores = tf.reshape(self.scores,
                                     [-1, sequece_length, num_classes])
        self.predictions = tf.argmax(per_step_scores, 2)

        # Weighted accuracy: dot product of the hit mask with the weights,
        # divided by the sum of the weights.
        targets64 = tf.cast(self.output_data, "int64")
        hits = tf.cast(tf.equal(self.predictions, targets64), "float32")
        aa = tf.expand_dims(tf.reshape(hits, [-1]), 0)
        bb = tf.expand_dims(
            tf.cast(tf.reshape(self.weights, [-1]), "float32"), 0)
        weighted_hits = tf.squeeze(tf.matmul(aa, bb, transpose_b=True))
        weight_total = tf.reduce_sum(tf.cast(self.weights, "float32"),
                                     [0, 1])
        self.kk = weighted_hits / weight_total

        # Unweighted accuracy over every position.
        targets64_again = tf.cast(self.output_data, "int64")
        hits_again = tf.cast(tf.equal(self.predictions, targets64_again),
                             "float32")
        self.accuracy = tf.reduce_mean(hits_again, name="accrucy")
def __init__(self, args, infer=False):
    """Build a character-level RNN language model graph.

    Args:
        args: object providing ``model``, ``rnn_size``, ``num_layers``,
            ``batch_size``, ``seq_length``, ``vocab_size`` and ``grad_clip``.
            When ``infer`` is true its ``batch_size`` and ``seq_length`` are
            overwritten with 1.
        infer: when true, each decoder step is fed the embedding of the
            previous step's arg-max symbol.

    Raises:
        Exception: if ``args.model`` is not one of the supported cell types.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    # Pick the cell constructor and any extra keyword arguments it needs.
    model = args.model
    cell_kwargs = {}
    if model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif model == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif model == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    elif model == 'gridlstm':
        cell_fn = grid_rnn.Grid2LSTMCell
        cell_kwargs.update({'use_peepholes': True, 'forget_bias': 1.0})
    elif model == 'gridgru':
        cell_fn = grid_rnn.Grid2GRUCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    single_cell = cell_fn(args.rnn_size, **cell_kwargs)
    stacked = rnn_cell.MultiRNNCell([single_cell] * args.num_layers)
    self.cell = stacked

    token_shape = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.int32, token_shape)
    self.targets = tf.placeholder(tf.int32, token_shape)
    self.initial_state = stacked.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w",
                                    [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding",
                                        [args.vocab_size, args.rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            inputs = tf.split(1, args.seq_length, embedded)
            inputs = [tf.squeeze(step, [1]) for step in inputs]

    def feed_previous(prev, _):
        """Embed the arg-max symbol of the previous output."""
        prev = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    loop_function = feed_previous if infer else None
    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, stacked,
        loop_function=loop_function, scope='rnnlm')
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])

    self.logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], args.vocab_size)
    # Mean loss per position.
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    # The learning rate starts at 0.0 and is not trainable.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                        args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(clipped, tvars))
def __init__(self, is_training, config):
    """Build a multi-layer LSTM graph over dense per-step feature vectors.

    Args:
        is_training: when true, dropout is applied (if ``config.keep_prob``
            is below 1) and the training op is created.
        config: object providing ``batch_size``, ``num_steps``,
            ``hidden_size``, ``feature_size``, ``keep_prob``, ``num_layers``
            and ``max_grad_norm``.
    """
    self.batch_size = batch_size = config.batch_size  # mini-batch size
    # Number of timesteps the RNN is unrolled for (see the loop below).
    self.num_steps = num_steps = config.num_steps
    size = config.hidden_size  # state size
    feature_size = config.feature_size

    # The inputs go straight into dropout and the LSTM cell, both of which
    # operate on floating-point tensors, so the placeholder is float32.
    self._input_data = tf.placeholder(
        tf.float32, [batch_size, num_steps, feature_size])
    # NOTE(review): the targets are flattened to batch_size * num_steps *
    # feature_size entries below, while the logits have batch_size *
    # num_steps rows -- confirm the intended target shape.
    self._targets = tf.placeholder(
        tf.int32, [batch_size, num_steps, feature_size])

    basic_cell = rnn_cell.BasicLSTMCell(size)
    if is_training and config.keep_prob < 1:  # use dropout
        basic_cell = rnn_cell.DropoutWrapper(
            basic_cell, output_keep_prob=config.keep_prob)
    cell = rnn_cell.MultiRNNCell([basic_cell] * config.num_layers)

    self._initial_state = cell.zero_state(batch_size, tf.float32)

    inputs = self._input_data
    if is_training and config.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    # Unroll the cell by hand, sharing variables after the first timestep.
    outputs = []
    states = []
    state = self._initial_state
    with tf.variable_scope("RNN"):
        for time_step in range(num_steps):
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            (cell_output, state) = cell(inputs[:, time_step, :], state)
            outputs.append(cell_output)
            states.append(state)

    output = tf.reshape(tf.concat(1, outputs), [-1, size])
    logits = tf.nn.xw_plus_b(
        output,
        tf.get_variable("softmax_w", [size, feature_size]),
        tf.get_variable("softmax_b", [feature_size]))
    loss = seq2seq.sequence_loss_by_example(
        [logits],
        [tf.reshape(self._targets, [-1])],
        [tf.ones([batch_size * num_steps])],
        feature_size)
    # Summed loss divided by the batch size.
    self._cost = cost = tf.reduce_sum(loss) / batch_size
    self._final_state = states[-1]

    if not is_training:
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)
    # `self.lr` is read here although `self._lr` is what was just assigned;
    # presumably an `lr` property is defined elsewhere on the class.
    optimizer = tf.train.GradientDescentOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))
def testSequenceLossByExample(self):
    """Verify per-example sequence loss in averaged and summed modes.

    Three timesteps, two examples, five classes; each logits tensor holds a
    single repeated value.  Expected losses are 1.609438 per example when
    averaged across timesteps and 4.828314 when summed.
    """
    with self.test_session() as sess:
        output_classes = 5
        steps = xrange(3)
        logits = [tf.constant(t + 0.5, shape=[2, output_classes])
                  for t in steps]
        targets = [tf.constant(t, tf.int32, shape=[2]) for t in steps]
        weights = [tf.constant(1.0, shape=[2]) for _ in steps]

        # (average_across_timesteps flag, expected loss per example)
        cases = (
            (True, [1.609438, 1.609438]),
            (False, [4.828314, 4.828314]),
        )
        for average, expected in cases:
            loss_op = seq2seq.sequence_loss_by_example(
                logits, targets, weights, output_classes,
                average_across_timesteps=average)
            res = sess.run(loss_op)
            self.assertAllClose(res, np.asarray(expected))
def __init__(self, rnn_size, num_layers, vocab_size, grad_clip,
             batch_size=1, seq_length=1):
    """Build a multi-layer LSTM language model graph.

    Args:
        rnn_size: LSTM state size; also the embedding width.
        num_layers: number of stacked LSTM layers.
        vocab_size: number of symbols in the vocabulary.
        grad_clip: global-norm threshold used to clip the gradients.
        batch_size: rows per batch.
        seq_length: timesteps per row.

    When both ``batch_size`` and ``seq_length`` are 1, each decoder step is
    fed the embedding of the previous step's arg-max symbol.
    """
    base_cell = rnn_cell.BasicLSTMCell(rnn_size)
    stacked = rnn_cell.MultiRNNCell([base_cell] * num_layers)
    self.cell = stacked

    token_shape = [batch_size, seq_length]
    self.input_data = tf.placeholder(tf.int32, token_shape)
    self.targets = tf.placeholder(tf.int32, token_shape)
    self.initial_state = stacked.zero_state(batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable('softmax_w', [rnn_size, vocab_size])
        softmax_b = tf.get_variable('softmax_b', [vocab_size])
        with tf.device('/cpu:0'):
            embedding = tf.get_variable('embedding',
                                        [vocab_size, rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            inputs = tf.split(1, seq_length, embedded)
            inputs = [tf.squeeze(step, [1]) for step in inputs]

    def feed_previous(prev, _):
        """Embed the arg-max symbol of the previous output."""
        prev = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    # A 1x1 input enables the feedback loop; otherwise the decoder reads the
    # provided inputs.
    single_step = batch_size == 1 and seq_length == 1
    if single_step:
        loop_fn = feed_previous
    else:
        loop_fn = None

    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, stacked,
        loop_function=loop_fn, scope='rnnlm')
    output = tf.reshape(tf.concat(1, outputs), [-1, rnn_size])

    self.logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([batch_size * seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], vocab_size)
    # Mean loss per position.
    self.cost = tf.reduce_sum(loss) / batch_size / seq_length
    self.final_state = last_state

    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                        grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(clipped, tvars))
def __init__(self, args, infer=False, loop=0):
    """Build an RNN language model graph, optionally reusing variables.

    Args:
        args: object providing ``model``, ``rnn_size``, ``num_layers``,
            ``batch_size``, ``seq_length``, ``vocab_size`` and ``grad_clip``.
            When ``infer`` is true its ``batch_size`` and ``seq_length`` are
            overwritten with 1.
        infer: when true, each decoder step is fed the embedding of the
            previous step's arg-max symbol.
        loop: when greater than 0, the variables of the 'rnnlm' scope are
            marked for reuse before they are requested.

    Raises:
        Exception: if ``args.model`` is not 'rnn', 'gru' or 'lstm'.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    model = args.model
    if model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif model == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif model == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    single_cell = cell_fn(args.rnn_size)
    stacked = rnn_cell.MultiRNNCell([single_cell] * args.num_layers)
    self.cell = stacked

    token_shape = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.int32, token_shape)
    self.targets = tf.placeholder(tf.int32, token_shape)
    self.initial_state = stacked.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm') as lm_scope:
        if loop > 0:
            # NOTE(review): the decoder below re-enters 'rnnlm' by name, so
            # this reuse flag may not extend to the cell variables -- confirm.
            lm_scope.reuse_variables()
        softmax_w = tf.get_variable("softmax_w",
                                    [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding",
                                        [args.vocab_size, args.rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            inputs = tf.split(1, args.seq_length, embedded)
            inputs = [tf.squeeze(step, [1]) for step in inputs]

    def feed_previous(prev, _):
        """Embed the arg-max symbol of the previous output."""
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    loop_function = feed_previous if infer else None
    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, stacked,
        loop_function=loop_function, scope='rnnlm')
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])

    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], args.vocab_size)
    # Mean loss per position.
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                        args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(clipped, tvars))
def __init__(self, is_training, config):
    """Build a multi-layer LSTM classifier over dense per-step inputs.

    Args:
        is_training: when true, output dropout is applied to the cell (if
            ``config.keep_prob`` is below 1) and the training op is created.
        config: object providing ``batch_size``, ``num_steps``,
            ``hidden_size``, ``output_size``, ``keep_prob``, ``num_layers``
            and ``max_grad_norm``.
    """
    batch_size = config.batch_size
    num_steps = config.num_steps
    self.batch_size = batch_size
    self.num_steps = num_steps
    size = config.hidden_size
    output_size = config.output_size

    # Inputs are float vectors of width `size`, one per timestep.
    self._input_data = tf.placeholder(tf.float32,
                                      [batch_size, num_steps, size])
    self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

    lstm_cell = rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
    use_dropout = is_training and config.keep_prob < 1
    if use_dropout:
        lstm_cell = rnn_cell.DropoutWrapper(
            lstm_cell, output_keep_prob=config.keep_prob)
    cell = rnn_cell.MultiRNNCell([lstm_cell] * config.num_layers)

    self._initial_state = cell.zero_state(batch_size, tf.float32)

    # Unroll the cell by hand; variables are shared after the first step.
    inputs = self._input_data
    outputs, states = [], []
    state = self._initial_state
    with tf.variable_scope("RNN"):
        for time_step in range(num_steps):
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            step_input = inputs[:, time_step, :]
            cell_output, state = cell(step_input, state)
            outputs.append(cell_output)
            states.append(state)

    output = tf.reshape(tf.concat(1, outputs), [-1, size])
    softmax_w = tf.get_variable("softmax_w", [size, output_size])
    softmax_b = tf.get_variable("softmax_b", [output_size])
    logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)

    flat_targets = tf.reshape(self._targets, [-1])
    unit_weights = tf.ones([batch_size * num_steps])
    loss = seq2seq.sequence_loss_by_example(
        [logits], [flat_targets], [unit_weights], output_size)
    # Summed loss divided by the batch size.
    cost = tf.reduce_sum(loss) / batch_size
    self._cost = cost
    self._final_state = states[-1]
    self._output = output
    self._logits = logits

    if not is_training:
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                        config.max_grad_norm)
    # `self.lr` is read here although `self._lr` is what was just assigned;
    # presumably an `lr` property is defined elsewhere on the class.
    optimizer = tf.train.GradientDescentOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(clipped, tvars))
def __init__(self, args, sampling=False):
    """Build a multi-layer LSTM language model graph.

    Args:
        args: object providing ``rnn_size``, ``num_layers``, ``batch_size``,
            ``seq_length``, ``vocab_size`` and ``grad_clip``.  When
            ``sampling`` is true its ``batch_size`` and ``seq_length`` are
            overwritten with 1.
        sampling: when true, each decoder step is fed the embedding of the
            previous step's arg-max symbol.
    """
    self.args = args
    if sampling:
        args.batch_size = 1
        args.seq_length = 1

    layer = rnn_cell.BasicLSTMCell(args.rnn_size)
    self.cell = rnn_cell.MultiRNNCell([layer] * args.num_layers)

    token_shape = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.int32, token_shape)
    self.targets = tf.placeholder(tf.int32, token_shape)
    self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w",
                                    [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding",
                                        [args.vocab_size, args.rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            inputs = tf.split(1, args.seq_length, embedded)
            inputs = [tf.squeeze(step, [1]) for step in inputs]

    def feed_previous(prev, _):
        """Embed the arg-max symbol of the previous output."""
        prev = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    if sampling:
        loop_function = feed_previous
    else:
        loop_function = None
    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, self.cell,
        loop_function=loop_function, scope='rnnlm')
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])

    self.logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], args.vocab_size)
    # Mean loss per position.
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                        args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(clipped, tvars))
def __init__(self, conf):
    """Build a bidirectional multi-layer LSTM tagging graph.

    Args:
        conf: object providing ``rnn_size``, ``num_layers``, ``keep_prob``,
            ``infer``, ``batch_size``, ``seq_length``, ``output_size``,
            ``grad_clip`` and ``nerloader.vocab_size``.

    Dropout (on the cell outputs and on the embedded inputs) is used only
    when ``conf.keep_prob`` is below 1.0 and ``conf.infer`` is false.
    """
    self.conf = conf
    rnn_size = self.conf.rnn_size
    batch_size = self.conf.batch_size
    seq_length = self.conf.seq_length

    cell_fw = BasicLSTMCell(rnn_size)
    cell_bw = BasicLSTMCell(rnn_size)
    if conf.keep_prob < 1.0 and not conf.infer:
        cell_fw = DropoutWrapper(cell_fw, output_keep_prob=conf.keep_prob)
        cell_bw = DropoutWrapper(cell_bw, output_keep_prob=conf.keep_prob)
    cell_fw = MultiRNNCell([cell_fw] * self.conf.num_layers)
    cell_bw = MultiRNNCell([cell_bw] * self.conf.num_layers)
    self.cell_fw = cell_fw
    self.cell_bw = cell_bw

    token_shape = [batch_size, seq_length]
    self.input_data = tf.placeholder(tf.int32, token_shape)
    self.targets = tf.placeholder(tf.int32, token_shape)
    self.initial_state_fw = cell_fw.zero_state(batch_size, tf.float32)
    self.initial_state_bw = cell_bw.zero_state(batch_size, tf.float32)

    with tf.variable_scope('rnn'):
        # The projection takes forward and backward outputs side by side,
        # hence the doubled input width.
        softmax_w = tf.get_variable("softmax_w",
                                    [rnn_size * 2, self.conf.output_size])
        softmax_b = tf.get_variable("softmax_b", [self.conf.output_size])
        embedding = tf.get_variable(
            "embedding", [self.conf.nerloader.vocab_size, rnn_size])
        _inputs = tf.nn.embedding_lookup(embedding, self.input_data)

    if conf.keep_prob < 1.0 and not conf.infer:
        _inputs = tf.nn.dropout(_inputs, conf.keep_prob)

    # One tensor per timestep, with the length-1 time axis squeezed away.
    inputs = [tf.squeeze(step, [1])
              for step in tf.split(1, conf.seq_length, _inputs)]

    outputs_bi = rnn.bidirectional_rnn(
        cell_fw, cell_bw, inputs,
        initial_state_fw=self.initial_state_fw,
        initial_state_bw=self.initial_state_bw,
        scope='rnn')
    output = tf.reshape(tf.concat(1, outputs_bi), [-1, rnn_size * 2])

    self.logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
    self.probs = tf.nn.softmax(self.logits)

    self.loss_weights = [tf.ones([batch_size * seq_length])]
    flat_targets = tf.reshape(self.targets, [-1])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], self.loss_weights)
    # Mean loss per position.
    self.cost = (tf.reduce_sum(loss) / self.conf.batch_size /
                 self.conf.seq_length)
    tf.scalar_summary("loss", self.cost)
    self.out = output

    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                        self.conf.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(clipped, tvars))
    self.merged_summary_op = tf.merge_all_summaries()
def __init__(self, vocabularySize, config_param):
    """Construct the graph of a stacked basic-RNN language model.

    Args:
        vocabularySize: size of the vocabulary; used for the embedding rows
            and for the width of the softmax projection.
        config_param: object providing ``batch_size``, ``sequence_size``,
            ``embeddingSize``, ``hidden_size`` and ``num_layers``.
    """
    self.vocabularySize = vocabularySize
    self.config = config_param

    batch_size = self.config.batch_size
    sequence_size = self.config.sequence_size
    hidden_size = self.config.hidden_size

    self._inputX = tf.placeholder(
        tf.int32, [batch_size, sequence_size], "InputsX")
    self._inputTargetsY = tf.placeholder(
        tf.int32, [batch_size, sequence_size], "InputTargetsY")

    # Convert the input ids into their embedded form.  The embedding
    # variable and its lookup are pinned to the CPU device.
    with tf.device("/cpu:0"):
        embedding = tf.get_variable(
            "embedding", [self.vocabularySize, self.config.embeddingSize])
        embeddingLookedUp = tf.nn.embedding_lookup(embedding, self._inputX)

    # Split along the time axis and drop the resulting length-1 dimension.
    inputTensorsAsList = []
    for piece in tf.split(1, sequence_size, embeddingLookedUp):
        inputTensorsAsList.append(tf.squeeze(piece, [1]))

    # Define the RNN.
    singleRNNCell = rnn_cell.BasicRNNCell(hidden_size)
    layers = [singleRNNCell] * self.config.num_layers
    self.multilayerRNN = rnn_cell.MultiRNNCell(layers)
    self._initial_state = self.multilayerRNN.zero_state(batch_size,
                                                        tf.float32)

    # Define the logits.
    hidden_layer_output, states = rnn.rnn(
        self.multilayerRNN, inputTensorsAsList,
        initial_state=self._initial_state)
    hidden_layer_output = tf.reshape(
        tf.concat(1, hidden_layer_output), [-1, hidden_size])
    projection_w = tf.get_variable("softmax_w",
                                   [hidden_size, self.vocabularySize])
    projection_b = tf.get_variable("softmax_b", [self.vocabularySize])
    self._logits = tf.nn.xw_plus_b(hidden_layer_output, projection_w,
                                   projection_b)
    self._predictionSoftmax = tf.nn.softmax(self._logits)

    # Define the loss: summed over all positions, divided by the batch size.
    loss = seq2seq.sequence_loss_by_example(
        [self._logits],
        [tf.reshape(self._inputTargetsY, [-1])],
        [tf.ones([batch_size * sequence_size])],
        self.vocabularySize)
    total_loss = tf.reduce_sum(loss)
    self._cost = tf.div(total_loss, batch_size)
    self._final_state = states[-1]
def __init__(self, args, infer=False):
    """Build an RNN sequence classifier graph (one label per input row).

    Args:
        args: object providing ``model``, ``rnn_size``, ``num_layers``,
            ``batch_size``, ``seq_length``, ``vocab_size`` and ``grad_clip``.
        infer: when true, a loop function is passed to the decoder.
            NOTE(review): that loop function currently returns None (marked
            TODO), so the ``infer=True`` path looks unfinished -- confirm.

    Raises:
        Exception: if ``args.model`` is not 'rnn', 'gru' or 'lstm'.
    """
    self.args = args

    model = args.model
    if model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif model == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif model == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    single_cell = cell_fn(args.rnn_size)
    stacked = rnn_cell.MultiRNNCell([single_cell] * args.num_layers)
    self.cell = stacked

    self.input_data = tf.placeholder(tf.int32,
                                     [args.batch_size, args.seq_length])
    # One target label per row, not per timestep.
    self.targets = tf.placeholder(tf.int32, [args.batch_size])
    self.initial_state = stacked.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w",
                                    [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding",
                                        [args.vocab_size, args.rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            # len(inputs) == args.seq_length,
            # shape(inputs[0]) == (args.batch_size, args.rnn_size)
            inputs = tf.split(1, args.seq_length, embedded)
            inputs = [tf.squeeze(step, [1]) for step in inputs]

    def feed_previous(prev, _):
        # TODO: unfinished.  The statements that used to follow this return
        # projected `prev` with softmax_w / softmax_b, took the arg-max
        # symbol under stop_gradient and looked up its embedding.
        return None

    # len(outputs) == args.seq_length,
    # shape(outputs[0]) == (args.batch_size, args.rnn_size)
    loop_function = feed_previous if infer else None
    outputs, states = seq2seq.rnn_decoder(
        inputs, self.initial_state, stacked,
        loop_function=loop_function, scope='rnnlm')

    def handle_outputs(use_lastone=True):
        """Reduce the per-step outputs to a [batch_size, rnn_size] tensor.

        Returns the last step's output when ``use_lastone`` is true,
        otherwise the element-wise mean over all steps.
        """
        if use_lastone:
            return outputs[-1]
        summed = tf.add_n(outputs)
        return tf.div(summed, len(outputs))

    # The classifier uses the mean of the step outputs.
    output = handle_outputs(use_lastone=False)

    # shape(logits) == (batch_size, vocab_size)
    self.logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([args.batch_size])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], args.vocab_size)
    self.cost = tf.reduce_sum(loss) / args.batch_size
    _ = tf.scalar_summary('cost', self.cost)

    # Evaluate accuracy: fraction of rows whose arg-max equals the target.
    predicted = tf.cast(tf.argmax(self.logits, 1), tf.int32)
    correct_pred = tf.equal(predicted, tf.reshape(self.targets, [-1]))
    self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    _ = tf.scalar_summary('accuracy', self.accuracy)

    self.final_state = states
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                        args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(clipped, tvars))
def __init__(self, vocab, tagset, alphabet, word_embedding_size,
             char_embedding_size, num_chars, num_steps, optimizer_desc,
             generate_lemmas, l2, dropout_prob_values, experiment_name,
             supply_form_characters_to_lemma, threads=0, seed=None,
             write_summaries=True, use_attention=True, scheduled_sampling=None):
    """
    Builds the tagger computation graph and initializes it in a TensorFlow session.

    Arguments:
        vocab: Vocabulary of word forms.
        tagset: Vocabulary of possible tags.
        alphabet: Vocabulary of possible characters.
        word_embedding_size (int): Size of the form-based word embedding.
        char_embedding_size (int): Size of character embeddings, i.e. a half of the size
            of the character-based words embeddings.
        num_chars: Maximum length of a word.
        num_steps: Maximum length of a sentence.
        optimizer_desc: Description of the optimizer; it is appended to
            'tf.train.' and evaluated as a Python expression.
        generate_lemmas: Generate lemmas during tagging.
        l2: Weight of the L2 penalty over the collected weight matrices.
        dropout_prob_values: Stored on the instance; not otherwise used here.
        experiment_name: Suffix of the summary log directory name.
        supply_form_characters_to_lemma: Feed embedded form characters to the
            lemma decoder alongside the lemma characters.
        threads: If positive, the inter-op and intra-op thread count of the session.
        seed: TensorFlow seed (not referenced anywhere in this constructor).
        write_summaries: Write summaries using TensorFlow interface.
        use_attention: Use the attention decoder instead of the plain RNN decoder.
        scheduled_sampling: If truthy, constant of the scheduled-sampling
            threshold used while training the lemma decoder.
    """
    self.num_steps = num_steps
    self.num_chars = num_chars

    self.word_embedding_size = word_embedding_size
    self.char_embedding_size = char_embedding_size
    # Word embedding concatenated with the forward and backward char-LSTM states.
    self.lstm_size = word_embedding_size + 2 * char_embedding_size

    self.vocab = vocab
    self.tagset = tagset
    self.alphabet = alphabet

    self.dropout_prob_values = dropout_prob_values

    # Initial LSTM states are fed from outside; their width is taken from
    # the state_size of a cell of the same size.
    self.forward_initial_state = tf.placeholder(
        tf.float32,
        [None, rnn_cell.BasicLSTMCell(self.lstm_size).state_size],
        name="forward_lstm_initial_state")
    self.backward_initial_state = tf.placeholder(
        tf.float32,
        [None, rnn_cell.BasicLSTMCell(self.lstm_size).state_size],
        name="backward_lstm_initial_state")
    self.sentence_lengths = tf.placeholder(tf.int64, [None],
                                           name="sentence_lengths")
    self.tags = tf.placeholder(tf.int32, [None, num_steps],
                               name="ground_truth_tags")
    # Vector of keep probabilities: index 0 is applied to the inputs,
    # index 1 to the bidirectional LSTM output.
    self.dropout_prob = tf.placeholder(tf.float32, [None],
                                       name="dropout_keep_p")
    self.generate_lemmas = generate_lemmas

    global_step = tf.Variable(0, trainable=False)

    input_list = []
    # Weight matrices that receive the L2 penalty.
    regularize = []

    # Word-level embeddings
    if word_embedding_size:
        self.words = tf.placeholder(tf.int32, [None, num_steps], name='words')
        word_embeddings = tf.Variable(
            tf.random_uniform([len(vocab), word_embedding_size], -1.0, 1.0))
        we_lookup = tf.nn.embedding_lookup(word_embeddings, self.words)

        input_list.append(we_lookup)

    # Character-level embeddings
    # NOTE(review): self.chars and reshaped_ce_lookup are only defined in
    # this branch but are read again by the lemma decoder below.
    if char_embedding_size:
        self.chars = tf.placeholder(tf.int32, [None, num_steps, num_chars],
                                    name='chars')
        self.chars_lengths = tf.placeholder(tf.int64, [None, num_steps],
                                            name='chars_lengths')

        char_embeddings = \
            tf.Variable(tf.random_uniform([len(alphabet), char_embedding_size], -1.0, 1.0))
        ce_lookup = tf.nn.embedding_lookup(char_embeddings, self.chars)

        # Flatten sentences so that every word becomes one character sequence.
        reshaped_ce_lookup = tf.reshape(
            ce_lookup, [-1, num_chars, char_embedding_size],
            name="reshape-char_inputs")
        char_inputs = [
            tf.squeeze(input_, [1])
            for input_ in tf.split(1, num_chars, reshaped_ce_lookup)
        ]

        char_inputs_lengths = tf.reshape(self.chars_lengths, [-1])

        with tf.variable_scope('char_forward'):
            char_lstm = rnn_cell.BasicLSTMCell(char_embedding_size)
            _, char_last_state = rnn.rnn(
                cell=char_lstm,
                inputs=char_inputs,
                sequence_length=char_inputs_lengths,
                dtype=tf.float32)
            # Switch to reuse so the already created LSTM matrix can be fetched.
            tf.get_variable_scope().reuse_variables()
            regularize.append(
                tf.get_variable('RNN/BasicLSTMCell/Linear/Matrix'))

        with tf.variable_scope('char_backward'):
            char_lstm_rev = rnn_cell.BasicLSTMCell(char_embedding_size)
            _, char_last_state_rev = rnn.rnn(
                cell=char_lstm_rev,
                inputs=self._reverse_seq(char_inputs, char_inputs_lengths),
                sequence_length=char_inputs_lengths,
                dtype=tf.float32)
            tf.get_variable_scope().reuse_variables()
            regularize.append(
                tf.get_variable('RNN/BasicLSTMCell/Linear/Matrix'))

        # Keep the second half of each final state.
        # presumably this is the hidden output of the concatenated LSTM
        # state -- TODO confirm against the rnn_cell version in use
        last_char_lstm_state = tf.split(1, 2, char_last_state)[1]
        last_char_lstm_state_rev = tf.split(1, 2, char_last_state_rev)[1]

        last_char_states = \
            tf.reshape(last_char_lstm_state, [-1, num_steps, char_embedding_size],
                       name="reshape-charstates")
        last_char_states_rev = tf.reshape(
            last_char_lstm_state_rev,
            [-1, num_steps, char_embedding_size],
            name="reshape-charstates_rev")

        char_output = tf.concat(2, [last_char_states, last_char_states_rev])

        input_list.append(char_output)

    # All inputs correctly sliced
    input_list_dropped = [
        tf.nn.dropout(x, self.dropout_prob[0]) for x in input_list
    ]
    inputs = [
        tf.squeeze(input_, [1])
        for input_ in tf.split(
            1, num_steps, tf.concat(2, input_list_dropped))
    ]

    with tf.variable_scope('forward'):
        lstm = rnn_cell.BasicLSTMCell(self.lstm_size)
        outputs, last_state = rnn.rnn(
            cell=lstm,
            inputs=inputs,
            dtype=tf.float32,
            initial_state=self.forward_initial_state,
            sequence_length=self.sentence_lengths)

        tf.get_variable_scope().reuse_variables()
        regularize.append(
            tf.get_variable('RNN/BasicLSTMCell/Linear/Matrix'))

    with tf.variable_scope('backward'):
        lstm_rev = rnn_cell.BasicLSTMCell(self.lstm_size)
        outputs_rev_rev, last_state_rev = rnn.rnn(
            cell=lstm_rev,
            inputs=self._reverse_seq(inputs, self.sentence_lengths),
            dtype=tf.float32,
            initial_state=self.backward_initial_state,
            sequence_length=self.sentence_lengths)

        # Outputs of the reversed run are reversed once more here.
        outputs_rev = self._reverse_seq(outputs_rev_rev,
                                        self.sentence_lengths)

        tf.get_variable_scope().reuse_variables()
        regularize.append(
            tf.get_variable('RNN/BasicLSTMCell/Linear/Matrix'))

    #outputs_forward = tf.reshape(tf.concat(1, outputs), [-1, self.lstm_size],
    #                             name="reshape-outputs_forward")

    #outputs_backward = tf.reshape(tf.concat(1, outputs_rev), [-1, self.lstm_size],
    #                              name="reshape-outputs_backward")

    #forward_w = tf.get_variable("forward_w", [self.lstm_size, self.lstm_size])
    #backward_w = tf.get_variable("backward_w", [self.lstm_size, self.lstm_size])
    #non_linearity_bias = tf.get_variable("non_linearity_b", [self.lstm_size])

    # NOTE(review): outputs_rev was already passed through _reverse_seq
    # above and is reversed again with reversed() here -- confirm that the
    # forward and backward steps are aligned as intended.
    outputs_bidi = [
        tf.concat(1, [o1, o2])
        for o1, o2 in zip(outputs, reversed(outputs_rev))
    ]

    #output = tf.tanh(tf.matmul(outputs_forward, forward_w) + tf.matmul(outputs_backward, backward_w) + non_linearity_bias)
    output = tf.reshape(tf.concat(1, outputs_bidi),
                        [-1, 2 * self.lstm_size],
                        name="reshape-outputs_bidi")
    output_dropped = tf.nn.dropout(output, self.dropout_prob[1])

    # We are computing only the logits, not the actual softmax -- while
    # computing the loss, it is done by the sequence_loss_by_example and
    # during the runtime classification, the argmax over logits is enough.

    softmax_w = tf.get_variable(
        "softmax_w", [2 * self.lstm_size, len(tagset)])
    logits_flatten = tf.nn.xw_plus_b(
        output_dropped, softmax_w,
        tf.get_variable("softmax_b", [len(tagset)]))
    #tf.get_variable_scope().reuse_variables()
    regularize.append(softmax_w)

    self.logits = tf.reshape(logits_flatten,
                             [-1, num_steps, len(tagset)],
                             name="reshape-logits")
    estimated_tags_flat = tf.to_int32(
        tf.argmax(logits_flatten, dimension=1))
    self.last_state = last_state

    # output mask: compute loss only if it isn't a padded word (i.e. zero index)
    output_mask = tf.reshape(tf.to_float(tf.not_equal(self.tags, 0)), [-1])

    gt_tags_flat = tf.reshape(self.tags, [-1])
    tagging_loss = seq2seq.sequence_loss_by_example(
        logits=[logits_flatten],
        targets=[gt_tags_flat],
        weights=[output_mask])

    # Accuracy over the non-padded positions only.
    tagging_accuracy = \
        tf.reduce_sum(tf.to_float(tf.equal(estimated_tags_flat, gt_tags_flat)) * output_mask) \
            / tf.reduce_sum(output_mask)
    # Each summary is registered in its own collection ("train" / "dev").
    tf.scalar_summary('train_accuracy', tagging_accuracy,
                      collections=["train"])
    tf.scalar_summary('dev_accuracy', tagging_accuracy,
                      collections=["dev"])

    self.cost = tf.reduce_mean(tagging_loss)

    tf.scalar_summary('train_tagging_loss', tf.reduce_mean(tagging_loss),
                      collections=["train"])
    tf.scalar_summary('dev_tagging_loss', tf.reduce_mean(tagging_loss),
                      collections=["dev"])

    if generate_lemmas:
        with tf.variable_scope('decoder'):
            self.lemma_chars = tf.placeholder(
                tf.int32, [None, num_steps, num_chars + 2],
                name='lemma_chars')

            lemma_state_size = self.lstm_size

            # Projection from decoder state to character logits.
            lemma_w = tf.Variable(tf.random_uniform(
                [lemma_state_size, len(alphabet)], 0.5),
                name="state_to_char_w")
            lemma_b = tf.Variable(tf.fill([len(alphabet)],
                                          -math.log(len(alphabet))),
                                  name="state_to_char_b")
            # Embedding width is halved when form characters are concatenated
            # to the lemma characters.
            # NOTE(review): "/" yields a float here under Python 3 or true
            # division -- confirm the interpreter this runs on.
            lemma_char_embeddings = tf.Variable(tf.random_uniform([
                len(alphabet), lemma_state_size /
                (2 if supply_form_characters_to_lemma else 1)
            ], -0.5, 0.5), name="char_embeddings")

            lemma_char_inputs = \
                [tf.squeeze(input_, [1]) for input_ in
                    tf.split(1, num_chars + 2, tf.reshape(self.lemma_chars, [-1, num_chars + 2],
                                                          name="reshape-lemma_char_inputs"))]

            if supply_form_characters_to_lemma:
                char_inputs_zeros = \
                    [tf.squeeze(chars, [1]) for chars in
                        tf.split(1, num_chars, tf.reshape(self.chars, [-1, num_chars],
                                                          name="reshape-char_inputs_zeros"))]
                # One extra all-zero step appended after the form characters.
                char_inputs_zeros.append(char_inputs_zeros[0] * 0)

                def loop(prev_state, i):
                    # it takes the previous hidden state, finds the character and formats it
                    # as input for the next time step ... used in the decoder in the "real decoding scenario"
                    out_activation = tf.matmul(prev_state, lemma_w) + lemma_b
                    prev_char_index = tf.argmax(out_activation, 1)
                    return tf.concat(1, [
                        tf.nn.embedding_lookup(lemma_char_embeddings,
                                               prev_char_index),
                        tf.nn.embedding_lookup(lemma_char_embeddings,
                                               char_inputs_zeros[i])
                    ])

                embedded_lemma_characters = []
                for lemma_chars, form_chars in zip(lemma_char_inputs[:-1],
                                                   char_inputs_zeros):
                    embedded_lemma_characters.append(
                        tf.concat(1, [
                            tf.nn.embedding_lookup(lemma_char_embeddings,
                                                   lemma_chars),
                            tf.nn.embedding_lookup(lemma_char_embeddings,
                                                   form_chars)
                        ]))
            else:
                def loop(prev_state, _):
                    # it takes the previous hidden state, finds the character and formats it
                    # as input for the next time step ... used in the decoder in the "real decoding scenario"
                    out_activation = tf.matmul(prev_state, lemma_w) + lemma_b
                    prev_char_index = tf.argmax(out_activation, 1)
                    return tf.nn.embedding_lookup(lemma_char_embeddings,
                                                  prev_char_index)

                embedded_lemma_characters = []
                for lemma_chars in lemma_char_inputs[:-1]:
                    embedded_lemma_characters.append(
                        tf.nn.embedding_lookup(lemma_char_embeddings,
                                               lemma_chars))

            def sampling_loop(prev_state, i):
                # Scheduled sampling: where a uniform random draw is at most
                # the threshold, the ground-truth input of step i is used,
                # otherwise the decoded one. The threshold shrinks as
                # exp(global_step) grows.
                threshold = scheduled_sampling / (
                    scheduled_sampling + tf.exp(tf.to_float(global_step)))
                condition = tf.less_equal(
                    tf.random_uniform(
                        tf.shape(embedded_lemma_characters[0])),
                    threshold)
                return tf.select(condition, embedded_lemma_characters[i],
                                 loop(prev_state, i))

            decoder_cell = rnn_cell.BasicLSTMCell(lemma_state_size)

            if scheduled_sampling:
                lf = sampling_loop
            else:
                lf = None

            # Training decoder: ground-truth (or scheduled-sampled) inputs.
            if use_attention:
                lemma_outputs_train, _ = seq2seq.attention_decoder(
                    embedded_lemma_characters, output_dropped,
                    reshaped_ce_lookup, decoder_cell, loop_function=lf)
            else:
                lemma_outputs_train, _ = seq2seq.rnn_decoder(
                    embedded_lemma_characters, output_dropped,
                    decoder_cell, loop_function=lf)

            tf.get_variable_scope().reuse_variables()
            #regularize.append(tf.get_variable('attention_decoder/BasicLSTMCell/Linear/Matrix'))

            # Runtime decoder is built after reuse is switched on, so it
            # shares the variables of the training decoder; it always feeds
            # back its own predictions through `loop`.
            tf.get_variable_scope().reuse_variables()

            if use_attention:
                lemma_outputs_runtime, _ = \
                    seq2seq.attention_decoder(embedded_lemma_characters,
                                              output_dropped,
                                              reshaped_ce_lookup,
                                              decoder_cell,
                                              loop_function=loop)
            else:
                lemma_outputs_runtime, _ = \
                    seq2seq.rnn_decoder(embedded_lemma_characters,
                                        output_dropped,
                                        decoder_cell,
                                        loop_function=loop)

            lemma_char_logits_train = \
                [tf.matmul(o, lemma_w) + lemma_b for o in lemma_outputs_train]

            lemma_char_logits_runtime = \
                [tf.matmul(o, lemma_w) + lemma_b for o in lemma_outputs_runtime]

            self.lemmas_decoded = \
                tf.reshape(tf.transpose(tf.argmax(tf.pack(lemma_char_logits_runtime), 2)),
                           [-1, num_steps, num_chars + 1])

            # Targets are the inputs shifted by one; zero-index characters
            # get zero weight.
            lemma_char_weights = []
            for lemma_chars in lemma_char_inputs[1:]:
                lemma_char_weights.append(
                    tf.to_float(tf.not_equal(lemma_chars, 0)))

            lemmatizer_loss = seq2seq.sequence_loss(
                lemma_char_logits_train, lemma_char_inputs[1:],
                lemma_char_weights)

            lemmatizer_loss_runtime = \
                seq2seq.sequence_loss(lemma_char_logits_runtime, lemma_char_inputs[1:],
                                      lemma_char_weights)

            tf.scalar_summary('train_lemma_loss_with_gt_inputs',
                              tf.reduce_mean(lemmatizer_loss),
                              collections=["train"])
            tf.scalar_summary('dev_lemma_loss_with_gt_inputs',
                              tf.reduce_mean(lemmatizer_loss),
                              collections=["dev"])

            tf.scalar_summary('train_lemma_loss_with_decoded_inputs',
                              tf.reduce_mean(lemmatizer_loss_runtime),
                              collections=["train"])
            tf.scalar_summary('dev_lemma_loss_with_decoded_inputs',
                              tf.reduce_mean(lemmatizer_loss_runtime),
                              collections=["dev"])

            # Both lemma losses are added to the optimized cost.
            self.cost += tf.reduce_mean(lemmatizer_loss) + tf.reduce_mean(
                lemmatizer_loss_runtime)

    # L2 penalty over the collected weight matrices.
    self.cost += l2 * sum(
        [tf.nn.l2_loss(variable) for variable in regularize])

    tf.scalar_summary('train_optimization_cost', self.cost,
                      collections=["train"])
    tf.scalar_summary('dev_optimization_cost', self.cost,
                      collections=["dev"])

    # Not called directly in this constructor.
    # presumably meant to be referenced from inside optimizer_desc -- TODO confirm
    def decay(learning_rate, exponent, iteration_steps):
        return tf.train.exponential_decay(learning_rate, global_step,
                                          iteration_steps, exponent,
                                          staircase=True)

    # NOTE(review): eval() executes optimizer_desc as Python code; it must
    # never come from an untrusted source.
    optimizer = eval('tf.train.' + optimizer_desc)
    self.train = optimizer.minimize(self.cost, global_step=global_step)

    if threads > 0:
        self.session = tf.Session(
            config=tf.ConfigProto(inter_op_parallelism_threads=threads,
                                  intra_op_parallelism_threads=threads))
    else:
        self.session = tf.Session()
    self.session.run(tf.initialize_all_variables())

    if write_summaries:
        self.summary_train = tf.merge_summary(tf.get_collection("train"))
        self.summary_dev = tf.merge_summary(tf.get_collection("dev"))
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
        self.summary_writer = tf.train.SummaryWriter("logs/" + timestamp +
                                                     "_" + experiment_name)

    self.steps = 0
# Graph fragment: unroll `cell_dis` over `inputs_dis`, score every step
# against `targets` and build a gradient-clipped Adam training op.

# Each input slice has its axis 1 squeezed away and is multiplied by
# `embedding`.
inputs_dis = [tf.matmul(tf.squeeze(i, [1]), embedding) for i in inputs_dis]

state = initial_state_dis
outputs = []

# Manual unrolling; variables are created on the first step and reused on
# every later one.
for i, inp in enumerate(inputs_dis):
    if i > 0:
        tf.get_variable_scope().reuse_variables()
    output, state = cell_dis(inp, state)
    outputs.append(output)
last_state = state

# Stack the per-step outputs into rows of width args.rnn_size.
output_tf = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
logits = tf.nn.xw_plus_b(output_tf, softmax_w, softmax_b)
probs = tf.nn.softmax(logits)

# The last argument (2) is the number of output symbols.
# presumably a binary decision per step -- TODO confirm softmax_w has 2 columns
loss = seq2seq.sequence_loss_by_example(
    [logits],
    [tf.reshape(targets, [-1])],
    [tf.ones([args.batch_size * args.seq_length])],
    2)

# Mean loss per example and time step.
cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length

final_state = last_state

# Learning rate lives in a non-trainable variable.
lr = tf.Variable(0.0, trainable = False)
tvars = tf.trainable_variables()
# NOTE(review): aggregation_method = 2 is a raw constant; confirm which
# tf.AggregationMethod value it is meant to select.
grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars, aggregation_method = 2), args.grad_clip)
optimizer = tf.train.AdamOptimizer(lr)
train_op = optimizer.apply_gradients(zip(grads, tvars))
# def loop(prev, _): # prev = tf.matmul(prev, softmax_w) + softmax_b # prev_symbol = tf.stop_gradient(tf.argmax(prev, 1)) # return tf.nn.embedding_lookup(embeddings, prev_symbol) inputs = tf.split(1, seq_length, tf.nn.embedding_lookup(embeddings, input_data)) inputs = [tf.squeeze(input_, [1]) for input_ in inputs] outputs, last_state = seq2seq.rnn_decoder(inputs, initial_state, cell) output = tf.reshape(tf.concat(1, outputs), [-1, hidden_num]) logits = tf.matmul(output, softmax_w) + softmax_b probs = tf.nn.softmax(logits) loss_rnn = seq2seq.sequence_loss_by_example( [logits], [tf.reshape(targets, [-1])], [tf.ones([batch_size * seq_length])], vocab_size) cost = tf.reduce_sum(loss_rnn) / batch_size / seq_length final_state = last_state lr = tf.Variable(0.0, trainable=False) tvars = tf.trainable_variables() grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), grad_clip) optimizer = tf.train.AdagradOptimizer(0.1) train_op = optimizer.apply_gradients(zip(grads, tvars)) #输出词向量 embeddings_norm = tf.sqrt( tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True)) normalized_embeddings = embeddings / embeddings_norm #模型训练
def __init__(self, CellType, is_training, config):
    """Assemble a language-model graph with an explicitly unrolled RNN.

    Args:
        CellType: Callable that builds one recurrent cell from a hidden size.
        is_training: Whether dropout and the training op are added.
        config: Object providing ``batch_size``, ``num_steps``,
            ``hidden_size``, ``vocab_size``, ``keep_prob``, ``num_layers``,
            ``init_scale`` and ``max_grad_norm``.
    """
    batch_size = config.batch_size
    num_steps = config.num_steps
    self.batch_size = batch_size
    self.num_steps = num_steps
    hidden = config.hidden_size
    n_words = config.vocab_size

    self.input_data = tf.placeholder(
        tf.int32, [batch_size, num_steps], name="input_data")
    self.targets = tf.placeholder(
        tf.int32, [batch_size, num_steps], name="targets")

    # One cell, optionally wrapped with output dropout, repeated per layer.
    base_cell = CellType(hidden)
    if is_training and config.keep_prob < 1:
        base_cell = rnn_cell.DropoutWrapper(
            base_cell, output_keep_prob=config.keep_prob)
    stacked = rnn_cell.MultiRNNCell([base_cell] * config.num_layers)
    self.initial_state = stacked.zero_state(batch_size, tf.float32)

    # Shared uniform initializer for every variable fetched via get_variable.
    uniform_init = tf.random_uniform_initializer(
        -config.init_scale, config.init_scale)

    with tf.device("/cpu:0"):
        embedding = tf.get_variable(
            "embedding", [n_words, hidden], initializer=uniform_init)
        embedded = tf.nn.embedding_lookup(embedding, self.input_data)

    if is_training and config.keep_prob < 1:
        embedded = tf.nn.dropout(embedded, config.keep_prob)

    step_outputs = []
    step_states = []
    current = self.initial_state
    with tf.variable_scope("RNN", initializer=uniform_init):
        for step in range(num_steps):
            # Variables are created on step 0 and shared afterwards.
            if step != 0:
                tf.get_variable_scope().reuse_variables()
            step_out, current = stacked(embedded[:, step, :], current)
            step_outputs.append(step_out)
            step_states.append(current)

    self.final_state = step_states[-1]

    flat_output = tf.reshape(tf.concat(1, step_outputs), [-1, hidden])
    proj_w = tf.get_variable(
        "softmax_w", [hidden, n_words], initializer=uniform_init)
    proj_b = tf.get_variable(
        "softmax_b", [n_words], initializer=uniform_init)
    logits = tf.nn.xw_plus_b(flat_output, proj_w, proj_b)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([batch_size * num_steps])

    # Cross-entropy per position, then summed and divided by the batch size.
    per_position = sequence_loss_by_example(
        [logits], [flat_targets], [unit_weights], n_words)
    cost = tf.div(tf.reduce_sum(per_position), batch_size, name="cost")
    self.cost = cost

    if not is_training:
        # Evaluation models expose a no-op in place of the training op.
        self.train_op = tf.no_op()
        return

    # Learning rate is a non-trainable variable so it can be decayed.
    self.lr = tf.Variable(1.0, trainable=False)
    trainable = tf.trainable_variables()
    raw_grads = tf.gradients(cost, trainable)
    clipped, _ = tf.clip_by_global_norm(raw_grads, config.max_grad_norm)
    sgd = tf.train.GradientDescentOptimizer(self.lr)
    self.train_op = sgd.apply_gradients(zip(clipped, trainable), name="train")
# with tf.Session() as sess: # sess = tf.InteractiveSession() sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) # tensors to store model state and training data for each batch seqs = [tf.placeholder(tf.int32, shape=[_seq_length]) for _ in xrange(_batch_size)] encoder_inputs = [tf.placeholder(tf.int32, shape=[_seq_length]) for _ in xrange(_batch_size)] decoder_inputs = [tf.placeholder(tf.int32, shape=[_seq_length]) for _ in xrange(_batch_size)] targets = [tf.placeholder(tf.int32, shape=[_seq_length]) for _ in xrange(_batch_size)] target_weights = [tf.ones(dtype=tf.float32, shape=[_seq_length]) for _ in xrange(_batch_size)] # set up the tied seq-to-seq LSTM with given parameters single_cell = rnn_cell.BasicLSTMCell(_lstm_cell_dimension) cell = rnn_cell.MultiRNNCell([single_cell] * _lstm_num_layers) outputs, _ = seq2seq.embedding_tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, _vocab_size_including_GO) seqloss = seq2seq.sequence_loss_by_example(outputs, encoder_inputs, target_weights, _vocab_size_including_GO) tf.train.SummaryWriter(_train_log_dir, sess.graph_def) global_step = tf.Variable(0, name='global_step', trainable=False) sess.run(tf.initialize_all_variables()) # Set up the optimizer with gradient clipping params = tf.trainable_variables() gradients = tf.gradients(seqloss, params) optimizer = tf.train.GradientDescentOptimizer(_lstm_learn_rate) clipped_gradients, norm = tf.clip_by_global_norm(gradients, _lstm_max_grad_norm) train_op = optimizer.apply_gradients(zip(clipped_gradients, params), global_step=global_step) # train_step = tf.train.GradientDescentOptimizer(_lstm_learn_rate).minimize(seqloss)
def __init__(self, session, config, training_flag=False):
    """Build a GRU/LSTM language-model graph.

    Args:
        session: Accepted for interface compatibility; not used while
            building the graph.
        config: Object providing ``vocab_size``, ``size``, ``net_type``
            ("LSTM" or "GRU"), ``batch_size``, ``num_steps``,
            ``max_grad_norm``, ``forget_bias``, ``keep_prob`` and
            ``num_layers``.
        training_flag: When true, input dropout and the training op are added.

    Raises:
        ValueError: If ``config.net_type`` is neither "LSTM" nor "GRU".
    """
    # get configuration from config class
    vocab_size = config.vocab_size
    size = config.size
    net_type = config.net_type
    batch_size = config.batch_size
    num_steps = config.num_steps
    max_grad_norm = config.max_grad_norm
    forget_bias = config.forget_bias
    keep_prob = config.keep_prob

    # create placeholders for input, answers, learning rate
    self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._answers = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._lr = tf.placeholder(tf.float32, name='learning_rate')

    # create cell, either GRU or LSTM as defined by the config class
    if net_type == "LSTM":
        cell = rnn_cell.BasicLSTMCell(size, forget_bias=forget_bias)
    elif net_type == "GRU":
        cell = rnn_cell.GRUCell(size)
    else:
        # Fail here with a clear message instead of a NameError on `cell`
        # a few lines further down.
        raise ValueError(
            "Unknown network type. config.net_type must be GRU or LSTM")

    # create multiple layers of cells defined by config.num_layers
    cell_layers = rnn_cell.MultiRNNCell([cell] * config.num_layers)

    # set the initial state of the network
    self._initial_state = cell_layers.zero_state(batch_size, tf.float32)

    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, size])
        inputs = tf.nn.embedding_lookup(embedding, self._input_data)

    if training_flag and keep_prob < 1:
        inputs = tf.nn.dropout(inputs, keep_prob)

    inputs = [tf.squeeze(input_, [1])
              for input_ in tf.split(1, num_steps, inputs)]

    # pass inputs through the cell
    outputs, states = RNN.rnn(cell_layers, inputs,
                              initial_state=self._initial_state)

    # get the final state of the network after input has passed through
    self._final_state = states

    output = tf.reshape(tf.concat(1, outputs), [-1, size])
    softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    self._logits = logits = tf.matmul(output, softmax_w) + softmax_b
    self._soft_out = soft_out = tf.nn.softmax(logits, name='soft_max')

    # The answers are flattened exactly as for the loss below, so row i of
    # the predictions lines up with element i of the answers. The comparison
    # has to be a graph op: a Python `==` on tensors does not compare values.
    flat_answers = tf.reshape(self._answers, [-1])
    predictions = tf.cast(tf.argmax(soft_out, 1), tf.int32)
    correct_prediction = tf.equal(predictions, flat_answers)
    self._accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    loss = seq2seq.sequence_loss_by_example(
        [logits],
        [flat_answers],
        [tf.ones([batch_size * num_steps])],
        vocab_size)
    self._cost = cost = tf.reduce_sum(loss) / batch_size

    if not training_flag:
        return

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      max_grad_norm)
    # Use the placeholder created above; nothing named `self.lr` is assigned
    # in this constructor.
    optimizer = tf.train.AdagradOptimizer(self._lr,
                                          initial_accumulator_value=0.1)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, args, predict=False):
    """Build a stacked-LSTM sequence model with hard-coded hyperparameters.

    Args:
        args: Configuration object; ``args.vocabSize`` is the only attribute
            read here.
        predict: When true, the graph is built for a batch of one sequence
            of one step, and the decoder feeds its own arg-max output back
            as the next input.
    """
    self.args = args

    # Various parameters for the LSTM.
    # Hardcoded here for now.
    numSteps = 50  # Steps to unroll for
    batchSize = 50
    rnnSize = 128
    numLayers = 2
    gradClip = 5

    # The prediction override must come after the defaults; applied before
    # them it was silently overwritten.
    if predict:
        batchSize = 1
        numSteps = 1

    # Create LSTM layer and stack multiple layers.
    lstmCell = rnn_cell.BasicLSTMCell(rnnSize)
    lstmNet = rnn_cell.MultiRNNCell([lstmCell] * numLayers)

    # Define placeholders.
    self.inputData = tf.placeholder(tf.int32, [batchSize, numSteps])
    self.targetOutput = tf.placeholder(tf.int32, [batchSize, numSteps])
    self.initialState = lstmNet.zero_state(batchSize, tf.float32)

    # If rnn_decoder is told to loop, this function will return to it the output at time
    # 't' for feeding as the input at time 't+1'. During training, this is generally
    # not done because we want to feed the *correct* input at all times and not what
    # is output. During prediction/testing, we loop the output back to the input to
    # generate our sequence of notes.
    def feedBack(prev, _):
        # softmax_w, softmax_b and embedding are bound below; the closure
        # only looks them up when the decoder calls it.
        prev = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    with tf.variable_scope('nn_lstm'):
        softmax_w = tf.get_variable("softmax_w", [rnnSize, args.vocabSize])
        softmax_b = tf.get_variable("softmax_b", [args.vocabSize])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [args.vocabSize, rnnSize])
            inputs = tf.split(1, numSteps,
                              tf.nn.embedding_lookup(embedding, self.inputData))
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    # Call seq2seq rnn decoder.
    outputs, states = seq2seq.rnn_decoder(
        inputs, self.initialState, lstmNet,
        loop_function=feedBack if predict else None, scope='nn_lstm')
    output = tf.reshape(tf.concat(1, outputs), [-1, rnnSize])

    # Logit and probability
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    # Calculate loss compared to targetOutput
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targetOutput, [-1])],
        [tf.ones([batchSize * numSteps])],
        args.vocabSize)

    # Set the cost to minimize total loss.
    self.cost = tf.reduce_sum(loss)

    # NOTE(review): indexing assumes rnn_decoder returns a sequence of
    # states; confirm against the seq2seq version in use.
    self.finalState = states[-1]

    # The learning rate is a non-trainable variable.
    self.learningRate = tf.Variable(0.0, trainable=False)

    # Define gradient and trainable variables for adjusting
    # during training/optimization.
    trainableVars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, trainableVars), gradClip)

    # We use the Adam optimizer.
    optimizer = tf.train.AdamOptimizer(self.learningRate)
    self.trainStep = optimizer.apply_gradients(zip(grads, trainableVars))
def __init__(self, is_training, config, decode_only=False):
    """Build a recurrent variational auto-encoder graph.

    An encoder RNN emits ``2 * config.z_dim`` values per step, which are
    split into a mean and a log standard deviation. Latent samples are drawn
    with the location-scale transform and handed to ``vae_decoder_argmax``.
    The optimized quantity is the negative ELBO (KL term plus cross-entropy).

    Args:
        is_training: Whether dropout and the training op are added.
        config: Object providing ``batch_size``, ``num_steps``,
            ``hidden_size``, ``vocab_size``, ``keep_prob``, ``num_layers``,
            ``z_dim`` and ``max_grad_norm``.
        decode_only: When true, latent samples are drawn from a standard
            normal instead of the encoder, ``self._logits`` is exposed and
            no training op is built.
    """
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    size = config.hidden_size
    self.is_training = is_training
    vocab_size = config.vocab_size

    self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

    # Slightly better results can be obtained with forget gate biases
    # initialized to 1 but the hyperparameters of the model would need to be
    # different than reported in the paper.
    with tf.variable_scope("cell_encoder"):
        lstm_encoder_cell = rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
        if is_training and config.keep_prob < 1:
            lstm_encoder_cell = rnn_cell.DropoutWrapper(
                lstm_encoder_cell, output_keep_prob=config.keep_prob)
        cell_encoder = rnn_cell.MultiRNNCell(
            [lstm_encoder_cell] * config.num_layers)
        # this is the linear projection layer down to num_encoder_symbols = 2*config.z_dim
        cell_encoder = rnn_cell.OutputProjectionWrapper(
            cell_encoder, 2 * config.z_dim)
        self._initial_state_encoder = cell_encoder.zero_state(
            batch_size, tf.float32)

    with tf.variable_scope("cell_decoder"):
        lstm_decoder_cell = rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
        if is_training and config.keep_prob < 1:
            lstm_decoder_cell = rnn_cell.DropoutWrapper(
                lstm_decoder_cell, output_keep_prob=config.keep_prob)
        cell_decoder = rnn_cell.MultiRNNCell(
            [lstm_decoder_cell] * config.num_layers)
        self._initial_state_decoder = cell_decoder.zero_state(
            batch_size, tf.float32)

    with tf.device("/cpu:0"):
        with tf.variable_scope("embedding"):
            embedding = tf.get_variable("embedding", [vocab_size, size])
        inputs = tf.split(
            1, num_steps,
            tf.nn.embedding_lookup(embedding, self._input_data))
        inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    if is_training and config.keep_prob < 1:
        inputs = [tf.nn.dropout(input_, config.keep_prob)
                  for input_ in inputs]

    # initial inputs
    inputs_encoder = inputs
    outputs_encoder, states_encoder = rnn.rnn(
        cell_encoder, inputs_encoder,
        initial_state=self._initial_state_encoder)

    # split the outputs to mu and log_sigma
    mu_and_log_sigmas = [tf.split(1, 2, output_encoder)
                         for output_encoder in outputs_encoder]
    mus = [mu_and_log_sigma[0] for mu_and_log_sigma in mu_and_log_sigmas]
    log_sigmas = [mu_and_log_sigma[1]
                  for mu_and_log_sigma in mu_and_log_sigmas]

    # epsilon is sampled from N(0,1) for location-scale transform
    epsilons = [tf.random_normal([config.batch_size, config.z_dim],
                                 dtype=tf.float32)
                for _ in range(len(log_sigmas))]

    # do the location-scale transform
    z_samples = [tf.add(mu, tf.mul(tf.exp(log_sigma), epsilon))
                 for mu, log_sigma, epsilon in zip(mus, log_sigmas, epsilons)]
    if decode_only:
        # if we're decoding, just sample from a random normal
        z_samples = [tf.random_normal([1, config.z_dim], dtype=tf.float32)
                     for _ in range(len(z_samples))]

    # calculate KL. equation 10 from kingma - auto-encoding variational bayes.
    neg_KL_list = [tf.add_n([tf.ones_like(mu),
                             tf.log(tf.square(tf.exp(log_sigma))),
                             tf.neg(tf.square(mu)),
                             tf.neg(tf.square(tf.exp(log_sigma)))])
                   for mu, log_sigma in zip(mus, log_sigmas)]

    # multiply by 0.5
    neg_KL_list = [tf.mul(tf.constant(0.5, shape=[1, config.z_dim]), KL_term)
                   for KL_term in neg_KL_list]

    # merge the list like we merge the outputs
    neg_KL = tf.reshape(tf.concat(1, neg_KL_list), [-1, config.z_dim])

    softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])

    outputs_decoder, states_decoder = vae_decoder_argmax(
        inputs_encoder, z_samples, self._initial_state_decoder,
        cell_decoder, vocab_size,
        output_projection=[softmax_w, softmax_b],
        feed_previous=True,
        config=config)

    # final output
    # NOTE(review): the logits below are computed from the encoder outputs
    # (width 2 * config.z_dim) and reshaped to width `size`; the decoder
    # outputs are not used for the loss. Kept as in the original ("change
    # to vanilla lstm") -- confirm this is still intended.
    outputs = outputs_encoder

    output = tf.reshape(tf.concat(1, outputs), [-1, size])
    logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)

    NLL = seq2seq.sequence_loss_by_example(
        [logits],
        [tf.reshape(self._targets, [-1])],
        [tf.ones([batch_size * num_steps])],
        vocab_size)

    NLL_scalar = tf.reduce_sum(NLL)
    KL_scalar = tf.neg(tf.reduce_sum(neg_KL))

    # here we compute the *NEGATIVE* ELBO (because we don't know how the optimizer deals with negative learning rates / gradients)
    # the loss in seq2seq.sequence_loss_by_example is the cross-entropy, which is the *negative* log-likelihood, so we can add it.
    neg_ELBO = KL_scalar + NLL_scalar

    def normalize(tensor):
        # 1.0 forces float division: with int operands Python 2 would
        # truncate the factor to the integer 0.
        scale = 1.0 / (batch_size * self.num_steps)
        return tf.reduce_sum(
            tf.mul(tf.constant(scale, shape=tensor.get_shape()), tensor))

    # summaries
    neg_ELBO_normalized = normalize(neg_ELBO)
    KL_normalized = normalize(KL_scalar)
    NLL_normalized = normalize(NLL_scalar)

    # The summary ops are created for their registration side effect; they
    # are collected by tf.merge_all_summaries() below.
    neg_ELBO_summary = tf.scalar_summary("neg_ELBO_normalized",
                                         neg_ELBO_normalized)
    KL_summary = tf.scalar_summary('KL_normalized', KL_normalized)
    NLL_summary = tf.scalar_summary('NLL_normalized', NLL_normalized)

    # expose costs, h
    self._neg_ELBO = neg_ELBO
    self._KL_scalar = KL_scalar
    self._NLL_scalar = NLL_scalar
    self._final_state = states_encoder[-1]

    if decode_only:
        self._logits = logits
        return

    if not is_training:
        return

    self._lr = tf.Variable(0.0, trainable=False, name='learning_rate')
    tvars = tf.trainable_variables()
    tvar_names = [tvar.name for tvar in tvars]
    grads_unclipped = tf.gradients(neg_ELBO, tvars)
    grads, _ = tf.clip_by_global_norm(grads_unclipped, config.max_grad_norm)

    # One histogram per variable that actually receives a gradient.
    grad_hists = []
    for idx, grad in enumerate(grads_unclipped):
        if grad is not None:
            grad_hists.append(tf.histogram_summary(tvar_names[idx], grad))

    #NB: for adam, need to set epsilon to other than the default 1e-8, otherwise get nans!
    # Use the variable assigned above; nothing named `self.lr` is set in
    # this constructor.
    optimizer = tf.train.AdamOptimizer(learning_rate=self._lr, epsilon=1e-1)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))

    merged = tf.merge_all_summaries()
    self._merged = merged
def __init__(self, args, infer=False):
    """Build the language-model graph around a `Network` and `decoder`.

    Args:
        args: Options object. Reads ``model``, ``vocab_size``, ``rnn_size``,
            ``num_layers``, ``batch_size``, ``seq_length`` and ``grad_clip``.
            When ``infer`` is true, ``batch_size`` and ``seq_length`` are
            overwritten with 1 on this very object.
        infer: When true, the argmax feed-back ``loop`` function is handed
            to ``decoder``; otherwise ``None`` is passed.

    Raises:
        Exception: If ``args.model`` is not 'rnn', 'gru' or 'lstm'.
    """
    self.args = args
    if infer:
        # NOTE: this mutates the caller's args object.
        args.batch_size = 1
        args.seq_length = 1

    supported_cells = {
        'rnn': rnn_cell.BasicRNNCell,
        'gru': rnn_cell.GRUCell,
        'lstm': rnn_cell.BasicLSTMCell,
    }
    if args.model not in supported_cells:
        raise Exception("model type not supported: {}".format(args.model))
    cell_fn = supported_cells[args.model]

    # Width of the con_data placeholder (hard-coded, not taken from args).
    con_size = 50
    self.network = Network(
        cell_fn,
        args.vocab_size,
        20,
        args.vocab_size,
        args.rnn_size,
        con_size,
        args.num_layers,
    )

    batch_by_time = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.int32, batch_by_time)
    self.targets = tf.placeholder(tf.int32, batch_by_time)
    self.con_data = tf.placeholder(tf.int32, [args.batch_size, con_size])
    self.initial_state = self.network.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable(
            "softmax_w", [args.rnn_size * args.num_layers, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        # The "embedding" is a constant identity matrix, so every lookup
        # yields a one-hot row of length vocab_size.
        embedding = tf.constant(np.identity(args.vocab_size, dtype=np.float32))
        embedded_inputs = tf.nn.embedding_lookup(embedding, self.input_data)
        per_step = tf.split(1, args.seq_length, embedded_inputs)
        # Drop the singleton time axis left behind by the split.
        inputs = [tf.squeeze(step, [1]) for step in per_step]
        con = tf.nn.embedding_lookup(embedding, self.con_data)

    def loop(prev, _):
        # Project the previous output to vocabulary logits, pick the most
        # likely symbol (no gradient through argmax) and embed it.
        vocab_logits = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
        best_symbol = tf.stop_gradient(tf.argmax(vocab_logits, 1))
        return tf.nn.embedding_lookup(embedding, best_symbol)

    feedback = loop if infer else None
    outputs, states = decoder(
        inputs,
        self.initial_state,
        self.network,
        con,
        loop_function=feedback,
        scope='rnnlm',
    )

    # Stack the per-step outputs into a matrix with one output per row.
    stacked = tf.concat(1, outputs)
    output = tf.reshape(stacked, [-1, args.rnn_size * args.num_layers])
    self.logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], args.vocab_size)
    summed_loss = tf.reduce_sum(loss)
    self.cost = summed_loss / args.batch_size / args.seq_length

    print(states)
    self.final_state = states

    # The learning rate starts at 0.0; nothing in this method assigns it.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def create_model(self):
    """Build the multi-layer LSTM language-model graph on ``self``.

    Reads ``batch_size``, ``seq_length``, ``rnn_size``, ``vocab_size``,
    ``num_layers``, ``is_training``, ``infer`` and ``grad_clip`` from
    ``self``. Sets ``input_data``, ``target_data``, ``keep_prob``, ``lr``,
    ``initial_state``, ``embedding``, ``logits``, ``probs``, ``cost``,
    ``final_state`` and ``train_op``.
    """
    self.input_data = tf.placeholder(
        tf.int32, [self.batch_size, self.seq_length], name="input_data")
    self.target_data = tf.placeholder(
        tf.int32, [self.batch_size, self.seq_length], name="target_data")

    # define hyper_parameters
    # The Python-side constant is kept so the dropout guard below can be
    # decided at graph-construction time. Comparing the tf.Variable itself
    # (`self.keep_prob < 1`) yields a symbolic Tensor, which cannot be used
    # as a Python truth value.
    keep_prob_value = 0.3
    self.keep_prob = tf.Variable(keep_prob_value, trainable=False, name="keep_prob")
    # NOTE: self.lr is exposed, but the optimizer below uses a fixed 0.01.
    self.lr = tf.Variable(0.0, trainable=False, name="lr")

    softmax_weights = tf.get_variable(
        "softmax_weights", [self.rnn_size, self.vocab_size])
    softmax_biases = tf.get_variable("softmax_biases", [self.vocab_size])

    lstm_cell = rnn_cell.BasicLSTMCell(self.rnn_size)
    multilayer_cell = rnn_cell.MultiRNNCell([lstm_cell] * self.num_layers)
    self.initial_state = multilayer_cell.zero_state(self.batch_size, tf.float32)

    with tf.device("/cpu:0"):
        # define the embedding matrix for the whole vocabulary
        self.embedding = tf.get_variable(
            "embeddings", [self.vocab_size, self.rnn_size])
        # take the vector representation for each word in the embeddings
        embeds = tf.nn.embedding_lookup(self.embedding, self.input_data)

    if self.is_training and keep_prob_value < 1:
        embeds = tf.nn.dropout(embeds, self.keep_prob)

    def loop(prev, _):
        # Feed-previous helper: logits -> argmax symbol -> its embedding.
        prev = tf.nn.xw_plus_b(prev, softmax_weights, softmax_biases)
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(self.embedding, prev_symbol)

    # Convert the input into a list of seq_length per-step tensors. After
    # the split each piece has shape (batch_size, 1, rnn_size); squeezing
    # axis 1 leaves (batch_size, rnn_size).
    inputs = tf.split(1, self.seq_length, embeds)
    inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    output, states = seq2seq.rnn_decoder(
        inputs,
        self.initial_state,
        multilayer_cell,
        loop_function=loop if self.infer else None,
        scope="rnnlm",
    )
    output = tf.reshape(tf.concat(1, output), [-1, self.rnn_size])
    self.logits = tf.nn.xw_plus_b(output, softmax_weights, softmax_biases)
    self.probs = tf.nn.softmax(self.logits, name="probability")

    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.target_data, [-1])],
        [tf.ones([self.batch_size * self.seq_length])],
        self.vocab_size,
    )
    # Average loss per predicted token.
    self.cost = tf.reduce_sum(loss) / (self.batch_size * self.seq_length)
    self.final_state = states[-1]

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, tvars), self.grad_clip)
    optimizer = tf.train.AdamOptimizer(0.01)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, args):
    """Build the RNN token-classifier graph, its metrics and train ops.

    Args:
        args: Options object. Reads ``model`` ('rnn' or 'lstm'),
            ``rnn_size``, ``num_layers``, ``batch_size``, ``seq_length``,
            ``vocab_size``, ``emb_vocab`` (``None`` or a mapping whose
            values are ``(row_index, vector)`` pairs), ``num_classes``,
            ``grad_clip``, ``learning_rate`` and ``l2_limit``.

    For any other ``args.model`` the method prints "Invalid cell" and
    calls ``sys.exit()``.
    """
    # define cell
    cell_types = {
        'rnn': rnn_cell.BasicRNNCell,
        'lstm': rnn_cell.BasicLSTMCell,
    }
    if args.model not in cell_types:
        print("Invalid cell")
        sys.exit()
    single_cell = cell_types[args.model](args.rnn_size)
    cell = rnn_cell.MultiRNNCell([single_cell] * args.num_layers)

    # define inputs and targets, initialize state
    batch_by_time = [args.batch_size, args.seq_length]
    self.inputs = tf.placeholder(tf.int32, batch_by_time)
    self.targets = tf.placeholder(tf.int32, batch_by_time)
    self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    # prepare word embedding, reshape inputs
    with tf.name_scope("embedding"):
        with tf.device("/cpu:0"):
            if args.emb_vocab is not None:
                # Start from random rows, then overwrite the rows for which
                # a pre-trained vector was supplied.
                first_key = args.emb_vocab.keys()[0]
                emb_dim = len(args.emb_vocab[first_key][1])
                emb_mat = np.random.rand(args.vocab_size, emb_dim)
                for _, (row, vector) in args.emb_vocab.iteritems():
                    emb_mat[row] = vector
                initial_embedding = tf.convert_to_tensor(emb_mat, dtype=tf.float32)
                E = tf.Variable(initial_embedding, name="E")
            else:
                E = tf.get_variable("E", [args.vocab_size, args.rnn_size])
            embedded = tf.nn.embedding_lookup(E, self.inputs)
            per_step = tf.split(1, args.seq_length, embedded)
            inputs = [tf.squeeze(step, [1]) for step in per_step]

    # feed inputs into rnn
    with tf.name_scope("rnn"):
        outputs, states = seq2seq.rnn_decoder(
            inputs, self.initial_state, cell, loop_function=None, scope='rnnlm')
        stacked = tf.concat(1, outputs)
        self.output = tf.reshape(stacked, [-1, args.rnn_size])

    # Add dropout
    with tf.name_scope("dropout"):
        self.h_drop = tf.nn.dropout(self.output, self.dropout_keep_prob)

    # output layer
    with tf.name_scope("output"):
        w_init = tf.truncated_normal([args.rnn_size, args.num_classes], stddev=0.1)
        W = tf.Variable(w_init, name="W")
        b_init = tf.constant(0.1, shape=[args.num_classes])
        b = tf.Variable(b_init, name="b")
        self.logits = tf.nn.xw_plus_b(self.h_drop, W, b)
        self.probs = tf.nn.softmax(self.logits)
        best_class = tf.argmax(self.logits, 1)
        self.predictions = tf.cast(best_class, tf.int32)

    # accuracy
    with tf.name_scope("accuracy"):
        # token-level accuracy: fraction of positions predicted correctly
        self.reshaped_targets = tf.reshape(self.targets, [-1])
        token_hits = tf.equal(self.predictions, self.reshaped_targets)
        self.accuracy = tf.reduce_mean(tf.cast(token_hits, "float"))

        # sentence-level accuracy: a row counts only when all seq_length
        # positions are right. The per-row hit count is obtained by
        # multiplying the 0/1 hit matrix with a column of ones.
        self.predictions_sentence = tf.reshape(
            self.predictions, [-1, args.seq_length])  # batch_size x seq_length
        row_hits = tf.equal(self.predictions_sentence, self.targets)
        ones_column = tf.constant(1, shape=[args.seq_length, 1])
        hit_counts = tf.matmul(tf.cast(row_hits, tf.int32), ones_column)  # batch_size x 1
        full_score = tf.constant(args.seq_length, shape=[args.batch_size, 1])
        sentence_hits = tf.equal(hit_counts, full_score)  # batch_size x 1
        self.accuracy_sentence = tf.reduce_mean(tf.cast(sentence_hits, "float"))

    # calculate loss
    with tf.name_scope("loss"):
        # TODO: confirm that a single flattened 2-D logits tensor (rather
        # than a list of per-step tensors) and flattened targets are right.
        unit_weights = tf.ones([args.batch_size * args.seq_length])
        self.loss = seq2seq.sequence_loss_by_example(
            [self.logits],
            [self.reshaped_targets],
            [unit_weights],
            args.num_classes)
        summed_loss = tf.reduce_sum(self.loss)
        self.cost = summed_loss / args.batch_size / args.seq_length

    # train and update
    with tf.name_scope("update"):
        tvars = tf.trainable_variables()
        raw_grads = tf.gradients(self.cost, tvars)
        # TODO: confirm global-norm clipping is what is wanted here.
        self.grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
        optimizer = tf.train.AdamOptimizer(args.learning_rate)
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        self.train_op = optimizer.apply_gradients(
            zip(self.grads, tvars), global_step=self.global_step)

    # l2 norm clipping: one assign op per trainable variable whose name
    # starts with 'output/W', clipping it to norm args.l2_limit.
    self.weight_clipping_op = [
        tf.assign(var, tf.clip_by_norm(var, args.l2_limit))
        for var in tf.trainable_variables()
        if var.name.startswith('output/W')
    ]
# Graph-building cell. It relies on names defined outside this cell:
# seq_length, embedding, input_data, softmax_w, softmax_b, initial_state,
# cell, rnn_size, targets, batch_size, vocab_size and grad_clip.

# Embed the input ids, split along axis 1 into seq_length pieces and drop
# the singleton axis left by the split.
inputs = tf.split(1, seq_length, tf.nn.embedding_lookup(embedding, input_data))
inputs = [tf.squeeze(input_, [1]) for input_ in inputs]


# Loop function for seq2seq
# (defined here but not used by the rnn_decoder call below, which passes
# loop_function=None)
def loop(prev, _):
    # logits for the previous output -> argmax symbol -> its embedding
    prev = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
    prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
    return tf.nn.embedding_lookup(embedding, prev_symbol)


# Output of RNN
outputs, last_state = seq2seq.rnn_decoder(inputs, initial_state, cell, loop_function=None, scope='rnnlm')
# Stack the per-step outputs into rows of width rnn_size.
output = tf.reshape(tf.concat(1, outputs), [-1, rnn_size])
logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
# Next word probability
probs = tf.nn.softmax(logits)
# Define LOSS
loss = seq2seq.sequence_loss_by_example([logits],  # Input
                                        [tf.reshape(targets, [-1])],  # Target
                                        [tf.ones([batch_size * seq_length])],  # Weight
                                        vocab_size)
# Define Optimizer
# cost is the summed loss divided by batch_size and then by seq_length.
cost = tf.reduce_sum(loss) / batch_size / seq_length
final_state = last_state
# The learning rate starts at 0.0; nothing in this cell assigns it.
lr = tf.Variable(0.0, trainable=False)
tvars = tf.trainable_variables()
# Gradients are clipped by global norm before being applied.
grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), grad_clip)
_optm = tf.train.AdamOptimizer(lr)
optm = _optm.apply_gradients(zip(grads, tvars))
print ("Network Ready")
# In[ ]:
def __init__(self, args, infer=False):
    """Build the language-model graph around a `Network` and `decoder`.

    Args:
        args: Options object. Reads ``model``, ``vocab_size``, ``rnn_size``,
            ``num_layers``, ``batch_size``, ``seq_length`` and ``grad_clip``.
            When ``infer`` is true, ``batch_size`` and ``seq_length`` are
            overwritten with 1 on this very object.
        infer: When true, the argmax feed-back ``loop`` function is handed
            to ``decoder``; otherwise ``None`` is passed.

    Raises:
        Exception: If ``args.model`` is not 'rnn', 'gru' or 'lstm'.
    """
    self.args = args
    if infer:
        # NOTE: this mutates the caller's args object.
        args.batch_size = 1
        args.seq_length = 1

    supported_cells = {
        'rnn': rnn_cell.BasicRNNCell,
        'gru': rnn_cell.GRUCell,
        'lstm': rnn_cell.BasicLSTMCell,
    }
    if args.model not in supported_cells:
        raise Exception("model type not supported: {}".format(args.model))
    cell_fn = supported_cells[args.model]

    # Width of the con_data placeholder (hard-coded, not taken from args).
    con_size = 50
    self.network = Network(
        cell_fn,
        args.vocab_size,
        20,
        args.vocab_size,
        args.rnn_size,
        con_size,
        args.num_layers,
    )

    batch_by_time = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.int32, batch_by_time)
    self.targets = tf.placeholder(tf.int32, batch_by_time)
    self.con_data = tf.placeholder(tf.int32, [args.batch_size, con_size])
    self.initial_state = self.network.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable(
            "softmax_w", [args.rnn_size * args.num_layers, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        # The "embedding" is a constant identity matrix, so every lookup
        # yields a one-hot row of length vocab_size.
        embedding = tf.constant(np.identity(args.vocab_size, dtype=np.float32))
        embedded_inputs = tf.nn.embedding_lookup(embedding, self.input_data)
        per_step = tf.split(1, args.seq_length, embedded_inputs)
        # Drop the singleton time axis left behind by the split.
        inputs = [tf.squeeze(step, [1]) for step in per_step]
        con = tf.nn.embedding_lookup(embedding, self.con_data)

    def loop(prev, _):
        # Project the previous output to vocabulary logits, pick the most
        # likely symbol (no gradient through argmax) and embed it.
        vocab_logits = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
        best_symbol = tf.stop_gradient(tf.argmax(vocab_logits, 1))
        return tf.nn.embedding_lookup(embedding, best_symbol)

    feedback = loop if infer else None
    outputs, states = decoder(
        inputs,
        self.initial_state,
        self.network,
        con,
        loop_function=feedback,
        scope='rnnlm',
    )

    # Stack the per-step outputs into a matrix with one output per row.
    stacked = tf.concat(1, outputs)
    output = tf.reshape(stacked, [-1, args.rnn_size * args.num_layers])
    self.logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], args.vocab_size)
    summed_loss = tf.reduce_sum(loss)
    self.cost = summed_loss / args.batch_size / args.seq_length

    print(states)
    self.final_state = states

    # The learning rate starts at 0.0; nothing in this method assigns it.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, args, infer=False):
    """Build an RNN sequence classifier: one target label per input row.

    The per-step RNN outputs are averaged and projected to
    ``args.vocab_size`` logits, so ``targets`` has shape ``[batch_size]``.

    Args:
        args: Options object. Reads ``model``, ``rnn_size``, ``num_layers``,
            ``batch_size``, ``seq_length``, ``vocab_size`` and
            ``grad_clip``. It is not modified, even when ``infer`` is true.
        infer: When true, the stub ``loop`` function (which returns
            ``None``) is passed to ``seq2seq.rnn_decoder``.

    Raises:
        Exception: If ``args.model`` is not 'rnn', 'gru' or 'lstm'.
    """
    self.args = args

    supported_cells = {
        'rnn': rnn_cell.BasicRNNCell,
        'gru': rnn_cell.GRUCell,
        'lstm': rnn_cell.BasicLSTMCell,
    }
    if args.model not in supported_cells:
        raise Exception("model type not supported: {}".format(args.model))
    single_cell = supported_cells[args.model](args.rnn_size)
    cell = rnn_cell.MultiRNNCell([single_cell] * args.num_layers)
    self.cell = cell

    self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32, [args.batch_size])
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            per_step = tf.split(1, args.seq_length, embedded)
            # len(inputs) == args.seq_length,
            # shape(inputs[0]) == (args.batch_size, args.rnn_size)
            inputs = [tf.squeeze(step, [1]) for step in per_step]

    def loop(prev, _):
        # TODO: feed-previous is not implemented. The intended body would
        # project `prev` with softmax_w/softmax_b, take the argmax symbol
        # under stop_gradient and return its embedding.
        return None

    # len(outputs) == args.seq_length,
    # shape(outputs[0]) == (args.batch_size, args.rnn_size)
    feedback = loop if infer else None
    outputs, states = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell, loop_function=feedback, scope='rnnlm')

    def pool_outputs(use_last=True):
        """Reduce the step outputs to one [batch_size, rnn_size] tensor."""
        if use_last:
            return outputs[-1]
        # Element-wise mean over all time steps.
        total = tf.add_n(outputs)
        return tf.div(total, len(outputs))

    output = pool_outputs(use_last=False)

    # shape(logits) = (batch_size, vocab_size)
    self.logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([args.batch_size])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], args.vocab_size)
    self.cost = tf.reduce_sum(loss) / args.batch_size
    tf.scalar_summary('cost', self.cost)

    # Evaluate accuracy
    predicted = tf.cast(tf.argmax(self.logits, 1), tf.int32)
    correct_pred = tf.equal(predicted, tf.reshape(self.targets, [-1]))
    self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    tf.scalar_summary('accuracy', self.accuracy)

    self.final_state = states

    # The learning rate starts at 0.0; nothing in this method assigns it.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def main():
    """Train a single-layer RNN language model and save its embeddings.

    Parses command-line options, loads batches through ``TextLoader2``,
    builds the graph, trains with Adagrad for ``--num_epochs`` epochs and
    writes the L2-normalised embedding matrix with ``np.save`` under the
    name 'rnnlm_word_embeddings'.

    Raises:
        Exception: If ``--model`` is not 'rnn', 'gru' or 'lstm'.
    """
    # command-line parameters
    option_specs = (
        ('--data_dir', str, '../data/xinhua', 'data directory containing input.txt'),
        ('--batch_size', int, 120, 'minibatch size'),
        ('--seq_length', int, 5, 'RNN sequence length'),
        ('--hidden_num', int, 256, 'number of hidden layers'),
        ('--word_dim', int, 256, 'number of word embedding'),
        ('--num_epochs', int, 50, 'number of epochs'),
        ('--model', str, 'lstm', 'rnn, gru, or lstm'),
        ('--grad_clip', float, 10., 'clip gradients at this value'),
    )
    parser = argparse.ArgumentParser()
    for flag, value_type, default, description in option_specs:
        parser.add_argument(flag, type=value_type, default=default, help=description)
    args = parser.parse_args()

    # prepare the training data
    data_loader = TextLoader2(args.data_dir, args.batch_size, args.seq_length)
    args.vocab_size = data_loader.vocab_size

    # model definition
    graph = tf.Graph()
    with graph.as_default():
        supported_cells = {
            'rnn': rnn_cell.BasicRNNCell,
            'gru': rnn_cell.GRUCell,
            'lstm': rnn_cell.BasicLSTMCell,
        }
        if args.model not in supported_cells:
            raise Exception("model type not supported: {}".format(args.model))
        # --hidden_num is passed to the cell constructor as its size.
        cell = supported_cells[args.model](args.hidden_num)

        # input variables
        batch_by_time = [args.batch_size, args.seq_length]
        input_data = tf.placeholder(tf.int32, batch_by_time)
        targets = tf.placeholder(tf.int64, batch_by_time)
        initial_state = cell.zero_state(args.batch_size, tf.float32)

        # model parameters
        with tf.variable_scope('rnnlm' + 'embedding'):
            raw_embeddings = tf.Variable(
                tf.random_uniform([args.vocab_size, args.word_dim], -1.0, 1.0))
            # Rows are L2-normalised before being looked up.
            embeddings = tf.nn.l2_normalize(raw_embeddings, 1)

        with tf.variable_scope('rnnlm' + 'weight'):
            softmax_w = tf.get_variable(
                "softmax_w", [args.hidden_num, args.vocab_size])
            softmax_b = tf.get_variable("softmax_b", [args.vocab_size])

        embedded = tf.nn.embedding_lookup(embeddings, input_data)
        per_step = tf.split(1, args.seq_length, embedded)
        inputs = [tf.squeeze(step, [1]) for step in per_step]

        outputs, last_state = seq2seq.rnn_decoder(inputs, initial_state, cell)
        stacked = tf.concat(1, outputs)
        output = tf.reshape(stacked, [-1, args.hidden_num])
        logits = tf.matmul(output, softmax_w) + softmax_b
        probs = tf.nn.softmax(logits)

        flat_targets = tf.reshape(targets, [-1])
        unit_weights = tf.ones([args.batch_size * args.seq_length])
        loss_rnn = seq2seq.sequence_loss_by_example(
            [logits], [flat_targets], [unit_weights], args.vocab_size)
        summed_loss = tf.reduce_sum(loss_rnn)
        cost = summed_loss / args.batch_size / args.seq_length
        final_state = last_state

        # `lr` is created but not consumed: Adagrad uses a fixed 0.1.
        lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        raw_grads = tf.gradients(cost, tvars)
        grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
        optimizer = tf.train.AdagradOptimizer(0.1)
        train_op = optimizer.apply_gradients(zip(grads, tvars))

        # output word vectors
        squared_row_sums = tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True)
        embeddings_norm = tf.sqrt(squared_row_sums)
        normalized_embeddings = embeddings / embeddings_norm

        # model training
        with tf.Session(graph=graph) as sess:
            tf.initialize_all_variables().run()
            for e in range(args.num_epochs):
                data_loader.reset_batch_pointer()
                for b in range(data_loader.num_batches):
                    # The timer covers batch loading as well as the step.
                    start = time.time()
                    x, y = data_loader.next_batch()
                    feed = {input_data: x, targets: y}
                    train_loss, _ = sess.run([cost, train_op], feed)
                    end = time.time()
                    progress = "{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}".format(
                        b, data_loader.num_batches, e, train_loss, end - start)
                    print(progress)
                # NOTE(review): the file is overwritten after every epoch;
                # confirm per-epoch saving (vs. once after training) is the
                # intended placement.
                np.save('rnnlm_word_embeddings', normalized_embeddings.eval())
def __init__(
    self,
    vocab,
    tagset,
    alphabet,
    word_embedding_size,
    char_embedding_size,
    num_chars,
    num_steps,
    optimizer_desc,
    generate_lemmas,
    l2,
    dropout_prob_values,
    experiment_name,
    supply_form_characters_to_lemma,
    threads=0,
    seed=None,
    write_summaries=True,
    use_attention=True,
    scheduled_sampling=None,
):
    """
    Builds the tagger computation graph and initializes it in a TensorFlow session.

    Arguments:
        vocab: Vocabulary of word forms.
        tagset: Vocabulary of possible tags.
        alphabet: Vocabulary of possible characters.
        word_embedding_size (int): Size of the form-based word embedding.
            A falsy value disables the word-level input.
        char_embedding_size (int): Size of character embeddings, i.e. a half of the size
            of the character-based words embeddings. A falsy value disables the
            character-level input.
        num_chars: Maximum length of a word.
        num_steps: Maximum length of a sentence.
        optimizer_desc: Description of the optimizer. It is appended to "tf.train." and
            evaluated with `eval`; the local `decay` helper is in scope for it.
        generate_lemmas: Generate lemmas during tagging (builds the character decoder).
        l2: Coefficient of the L2 penalty summed over the regularized matrices.
        dropout_prob_values: Stored on `self.dropout_prob_values`; not otherwise used here.
        experiment_name: Suffix of the summary log directory name.
        supply_form_characters_to_lemma: If true, embeddings of the form characters are
            concatenated to the decoder inputs and the lemma character embedding
            size is halved.
        threads: If positive, used for both inter-op and intra-op parallelism.
        seed: TensorFlow seed (not referenced in this constructor body).
        write_summaries: Write summaries using TensorFlow interface.
        use_attention: Use `seq2seq.attention_decoder` instead of `seq2seq.rnn_decoder`.
        scheduled_sampling: If truthy, the training decoder uses `sampling_loop`, with
            this value entering the sampling threshold.
    """
    self.num_steps = num_steps
    self.num_chars = num_chars
    self.word_embedding_size = word_embedding_size
    self.char_embedding_size = char_embedding_size
    # The sentence LSTM input is the word embedding concatenated with the
    # forward and backward character-LSTM states.
    self.lstm_size = word_embedding_size + 2 * char_embedding_size
    self.vocab = vocab
    self.tagset = tagset
    self.alphabet = alphabet
    self.dropout_prob_values = dropout_prob_values
    # Initial LSTM states are fed from outside; their width is taken from a
    # throw-away BasicLSTMCell's state_size.
    self.forward_initial_state = tf.placeholder(
        tf.float32, [None, rnn_cell.BasicLSTMCell(self.lstm_size).state_size], name="forward_lstm_initial_state"
    )
    self.backward_initial_state = tf.placeholder(
        tf.float32, [None, rnn_cell.BasicLSTMCell(self.lstm_size).state_size], name="backward_lstm_initial_state"
    )
    self.sentence_lengths = tf.placeholder(tf.int64, [None], name="sentence_lengths")
    self.tags = tf.placeholder(tf.int32, [None, num_steps], name="ground_truth_tags")
    # Vector of keep probabilities: index 0 is applied to the inputs,
    # index 1 to the bidirectional LSTM output.
    self.dropout_prob = tf.placeholder(tf.float32, [None], name="dropout_keep_p")
    self.generate_lemmas = generate_lemmas
    global_step = tf.Variable(0, trainable=False)
    input_list = []
    # Variables that receive the L2 penalty.
    regularize = []

    # Word-level embeddings
    if word_embedding_size:
        self.words = tf.placeholder(tf.int32, [None, num_steps], name="words")
        word_embeddings = tf.Variable(tf.random_uniform([len(vocab), word_embedding_size], -1.0, 1.0))
        we_lookup = tf.nn.embedding_lookup(word_embeddings, self.words)
        input_list.append(we_lookup)

    # Character-level embeddings
    if char_embedding_size:
        self.chars = tf.placeholder(tf.int32, [None, num_steps, num_chars], name="chars")
        self.chars_lengths = tf.placeholder(tf.int64, [None, num_steps], name="chars_lengths")
        char_embeddings = tf.Variable(tf.random_uniform([len(alphabet), char_embedding_size], -1.0, 1.0))
        ce_lookup = tf.nn.embedding_lookup(char_embeddings, self.chars)
        # Flatten sentences so that every word becomes one row of characters.
        reshaped_ce_lookup = tf.reshape(ce_lookup, [-1, num_chars, char_embedding_size], name="reshape-char_inputs")
        char_inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(1, num_chars, reshaped_ce_lookup)]
        char_inputs_lengths = tf.reshape(self.chars_lengths, [-1])

        with tf.variable_scope("char_forward"):
            char_lstm = rnn_cell.BasicLSTMCell(char_embedding_size)
            _, char_last_state = rnn.rnn(
                cell=char_lstm, inputs=char_inputs, sequence_length=char_inputs_lengths, dtype=tf.float32
            )
            # Switch the scope to reuse mode so the LSTM weight matrix that
            # rnn.rnn just created can be fetched by name.
            tf.get_variable_scope().reuse_variables()
            regularize.append(tf.get_variable("RNN/BasicLSTMCell/Linear/Matrix"))

        with tf.variable_scope("char_backward"):
            char_lstm_rev = rnn_cell.BasicLSTMCell(char_embedding_size)
            _, char_last_state_rev = rnn.rnn(
                cell=char_lstm_rev,
                inputs=self._reverse_seq(char_inputs, char_inputs_lengths),
                sequence_length=char_inputs_lengths,
                dtype=tf.float32,
            )
            tf.get_variable_scope().reuse_variables()
            regularize.append(tf.get_variable("RNN/BasicLSTMCell/Linear/Matrix"))

        # Keep only the second half of each concatenated LSTM state
        # (presumably the hidden output h rather than the cell c -- TODO confirm).
        last_char_lstm_state = tf.split(1, 2, char_last_state)[1]
        last_char_lstm_state_rev = tf.split(1, 2, char_last_state_rev)[1]

        last_char_states = tf.reshape(
            last_char_lstm_state, [-1, num_steps, char_embedding_size], name="reshape-charstates"
        )
        last_char_states_rev = tf.reshape(
            last_char_lstm_state_rev, [-1, num_steps, char_embedding_size], name="reshape-charstates_rev"
        )

        char_output = tf.concat(2, [last_char_states, last_char_states_rev])
        input_list.append(char_output)

    # All inputs correctly sliced
    input_list_dropped = [tf.nn.dropout(x, self.dropout_prob[0]) for x in input_list]
    inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(1, num_steps, tf.concat(2, input_list_dropped))]

    with tf.variable_scope("forward"):
        lstm = rnn_cell.BasicLSTMCell(self.lstm_size)
        outputs, last_state = rnn.rnn(
            cell=lstm,
            inputs=inputs,
            dtype=tf.float32,
            initial_state=self.forward_initial_state,
            sequence_length=self.sentence_lengths,
        )
        tf.get_variable_scope().reuse_variables()
        regularize.append(tf.get_variable("RNN/BasicLSTMCell/Linear/Matrix"))

    with tf.variable_scope("backward"):
        lstm_rev = rnn_cell.BasicLSTMCell(self.lstm_size)
        outputs_rev_rev, last_state_rev = rnn.rnn(
            cell=lstm_rev,
            inputs=self._reverse_seq(inputs, self.sentence_lengths),
            dtype=tf.float32,
            initial_state=self.backward_initial_state,
            sequence_length=self.sentence_lengths,
        )
        outputs_rev = self._reverse_seq(outputs_rev_rev, self.sentence_lengths)
        tf.get_variable_scope().reuse_variables()
        regularize.append(tf.get_variable("RNN/BasicLSTMCell/Linear/Matrix"))

    # An earlier variant combined the two directions through a learned tanh
    # layer (forward_w, backward_w, non_linearity_b); the outputs are now
    # simply concatenated.
    # NOTE(review): outputs_rev has already been passed through
    # _reverse_seq above and is reversed again here with reversed() --
    # confirm the two directions are aligned per time step.
    outputs_bidi = [tf.concat(1, [o1, o2]) for o1, o2 in zip(outputs, reversed(outputs_rev))]

    output = tf.reshape(tf.concat(1, outputs_bidi), [-1, 2 * self.lstm_size], name="reshape-outputs_bidi")
    output_dropped = tf.nn.dropout(output, self.dropout_prob[1])

    # We are computing only the logits, not the actual softmax -- while
    # computing the loss, it is done by the sequence_loss_by_example and
    # during the runtime classification, the argmax over logits is enough.
    softmax_w = tf.get_variable("softmax_w", [2 * self.lstm_size, len(tagset)])
    logits_flatten = tf.nn.xw_plus_b(output_dropped, softmax_w, tf.get_variable("softmax_b", [len(tagset)]))
    regularize.append(softmax_w)

    self.logits = tf.reshape(logits_flatten, [-1, num_steps, len(tagset)], name="reshape-logits")
    estimated_tags_flat = tf.to_int32(tf.argmax(logits_flatten, dimension=1))
    self.last_state = last_state

    # output mask: compute loss only if it isn't a padded word (i.e. zero index)
    output_mask = tf.reshape(tf.to_float(tf.not_equal(self.tags, 0)), [-1])

    gt_tags_flat = tf.reshape(self.tags, [-1])
    tagging_loss = seq2seq.sequence_loss_by_example(
        logits=[logits_flatten], targets=[gt_tags_flat], weights=[output_mask]
    )

    # Accuracy over non-padded positions only.
    tagging_accuracy = tf.reduce_sum(
        tf.to_float(tf.equal(estimated_tags_flat, gt_tags_flat)) * output_mask
    ) / tf.reduce_sum(output_mask)
    # Each metric is registered twice so train and dev runs can be merged
    # and written separately.
    tf.scalar_summary("train_accuracy", tagging_accuracy, collections=["train"])
    tf.scalar_summary("dev_accuracy", tagging_accuracy, collections=["dev"])

    self.cost = tf.reduce_mean(tagging_loss)

    tf.scalar_summary("train_tagging_loss", tf.reduce_mean(tagging_loss), collections=["train"])
    tf.scalar_summary("dev_tagging_loss", tf.reduce_mean(tagging_loss), collections=["dev"])

    if generate_lemmas:
        with tf.variable_scope("decoder"):
            # num_chars + 2 positions per lemma (presumably start/end
            # markers around the characters -- TODO confirm with the caller).
            self.lemma_chars = tf.placeholder(tf.int32, [None, num_steps, num_chars + 2], name="lemma_chars")

            lemma_state_size = self.lstm_size

            # NOTE(review): 0.5 is passed as the second positional argument
            # of tf.random_uniform (its lower bound) with the upper bound
            # left at its default -- confirm this range is intended.
            lemma_w = tf.Variable(tf.random_uniform([lemma_state_size, len(alphabet)], 0.5), name="state_to_char_w")
            lemma_b = tf.Variable(tf.fill([len(alphabet)], -math.log(len(alphabet))), name="state_to_char_b")
            # When form characters are supplied, two embeddings are
            # concatenated per step, so each gets half of the state size.
            lemma_char_embeddings = tf.Variable(
                tf.random_uniform(
                    [len(alphabet), lemma_state_size / (2 if supply_form_characters_to_lemma else 1)], -0.5, 0.5
                ),
                name="char_embeddings",
            )

            lemma_char_inputs = [
                tf.squeeze(input_, [1])
                for input_ in tf.split(
                    1,
                    num_chars + 2,
                    tf.reshape(self.lemma_chars, [-1, num_chars + 2], name="reshape-lemma_char_inputs"),
                )
            ]

            if supply_form_characters_to_lemma:
                # Requires self.chars, which only exists when
                # char_embedding_size is truthy.
                char_inputs_zeros = [
                    tf.squeeze(chars, [1])
                    for chars in tf.split(
                        1, num_chars, tf.reshape(self.chars, [-1, num_chars], name="reshape-char_inputs_zeros")
                    )
                ]
                # One extra all-zero step so the form characters cover
                # num_chars + 1 decoder steps.
                char_inputs_zeros.append(char_inputs_zeros[0] * 0)

                def loop(prev_state, i):
                    # it takes the previous hidden state, finds the character and formats it
                    # as input for the next time step ... used in the decoder in the "real decoding scenario"
                    out_activation = tf.matmul(prev_state, lemma_w) + lemma_b
                    prev_char_index = tf.argmax(out_activation, 1)
                    return tf.concat(
                        1,
                        [
                            tf.nn.embedding_lookup(lemma_char_embeddings, prev_char_index),
                            tf.nn.embedding_lookup(lemma_char_embeddings, char_inputs_zeros[i]),
                        ],
                    )

                embedded_lemma_characters = []
                for lemma_chars, form_chars in zip(lemma_char_inputs[:-1], char_inputs_zeros):
                    embedded_lemma_characters.append(
                        tf.concat(
                            1,
                            [
                                tf.nn.embedding_lookup(lemma_char_embeddings, lemma_chars),
                                tf.nn.embedding_lookup(lemma_char_embeddings, form_chars),
                            ],
                        )
                    )
            else:

                def loop(prev_state, _):
                    # it takes the previous hidden state, finds the character and formats it
                    # as input for the next time step ... used in the decoder in the "real decoding scenario"
                    out_activation = tf.matmul(prev_state, lemma_w) + lemma_b
                    prev_char_index = tf.argmax(out_activation, 1)
                    return tf.nn.embedding_lookup(lemma_char_embeddings, prev_char_index)

                embedded_lemma_characters = []
                for lemma_chars in lemma_char_inputs[:-1]:
                    embedded_lemma_characters.append(tf.nn.embedding_lookup(lemma_char_embeddings, lemma_chars))

            def sampling_loop(prev_state, i):
                # Scheduled sampling: element-wise choice between the
                # ground-truth embedding and the decoded one. The threshold
                # for taking the ground truth shrinks as exp(global_step)
                # grows.
                threshold = scheduled_sampling / (scheduled_sampling + tf.exp(tf.to_float(global_step)))
                condition = tf.less_equal(tf.random_uniform(tf.shape(embedded_lemma_characters[0])), threshold)
                return tf.select(condition, embedded_lemma_characters[i], loop(prev_state, i))

            decoder_cell = rnn_cell.BasicLSTMCell(lemma_state_size)

            if scheduled_sampling:
                lf = sampling_loop
            else:
                lf = None

            # Training decoder: ground-truth inputs (or scheduled sampling).
            if use_attention:
                lemma_outputs_train, _ = seq2seq.attention_decoder(
                    embedded_lemma_characters, output_dropped, reshaped_ce_lookup, decoder_cell, loop_function=lf
                )
            else:
                lemma_outputs_train, _ = seq2seq.rnn_decoder(
                    embedded_lemma_characters, output_dropped, decoder_cell, loop_function=lf
                )

            # The runtime decoder below shares weights with the training
            # decoder, hence the switch to reuse mode (called twice).
            tf.get_variable_scope().reuse_variables()
            # regularize.append(tf.get_variable('attention_decoder/BasicLSTMCell/Linear/Matrix'))

            tf.get_variable_scope().reuse_variables()

            # Runtime decoder: always feeds back its own argmax via `loop`.
            if use_attention:
                lemma_outputs_runtime, _ = seq2seq.attention_decoder(
                    embedded_lemma_characters, output_dropped, reshaped_ce_lookup, decoder_cell, loop_function=loop
                )
            else:
                lemma_outputs_runtime, _ = seq2seq.rnn_decoder(
                    embedded_lemma_characters, output_dropped, decoder_cell, loop_function=loop
                )

            lemma_char_logits_train = [tf.matmul(o, lemma_w) + lemma_b for o in lemma_outputs_train]

            lemma_char_logits_runtime = [tf.matmul(o, lemma_w) + lemma_b for o in lemma_outputs_runtime]

            # Argmax characters, rearranged to one row of num_chars + 1
            # indices per word.
            self.lemmas_decoded = tf.reshape(
                tf.transpose(tf.argmax(tf.pack(lemma_char_logits_runtime), 2)), [-1, num_steps, num_chars + 1]
            )

            # Targets are the inputs shifted by one step; positions whose
            # target index is 0 get zero weight.
            lemma_char_weights = []
            for lemma_chars in lemma_char_inputs[1:]:
                lemma_char_weights.append(tf.to_float(tf.not_equal(lemma_chars, 0)))

            lemmatizer_loss = seq2seq.sequence_loss(
                lemma_char_logits_train, lemma_char_inputs[1:], lemma_char_weights
            )

            lemmatizer_loss_runtime = seq2seq.sequence_loss(
                lemma_char_logits_runtime, lemma_char_inputs[1:], lemma_char_weights
            )

            tf.scalar_summary(
                "train_lemma_loss_with_gt_inputs", tf.reduce_mean(lemmatizer_loss), collections=["train"]
            )
            tf.scalar_summary("dev_lemma_loss_with_gt_inputs", tf.reduce_mean(lemmatizer_loss), collections=["dev"])

            tf.scalar_summary(
                "train_lemma_loss_with_decoded_inputs",
                tf.reduce_mean(lemmatizer_loss_runtime),
                collections=["train"],
            )
            tf.scalar_summary(
                "dev_lemma_loss_with_decoded_inputs", tf.reduce_mean(lemmatizer_loss_runtime), collections=["dev"]
            )

            # Both decoder losses are added to the optimized cost.
            self.cost += tf.reduce_mean(lemmatizer_loss) + tf.reduce_mean(lemmatizer_loss_runtime)

    self.cost += l2 * sum([tf.nn.l2_loss(variable) for variable in regularize])

    tf.scalar_summary("train_optimization_cost", self.cost, collections=["train"])
    tf.scalar_summary("dev_optimization_cost", self.cost, collections=["dev"])

    def decay(learning_rate, exponent, iteration_steps):
        # Not called directly here; it is in scope for the eval'd
        # optimizer description below.
        return tf.train.exponential_decay(learning_rate, global_step, iteration_steps, exponent, staircase=True)

    # SECURITY: optimizer_desc is evaluated as Python code. It must only
    # ever come from a trusted source.
    optimizer = eval("tf.train." + optimizer_desc)
    self.train = optimizer.minimize(self.cost, global_step=global_step)

    if threads > 0:
        self.session = tf.Session(
            config=tf.ConfigProto(inter_op_parallelism_threads=threads, intra_op_parallelism_threads=threads)
        )
    else:
        self.session = tf.Session()
    self.session.run(tf.initialize_all_variables())

    if write_summaries:
        self.summary_train = tf.merge_summary(tf.get_collection("train"))
        self.summary_dev = tf.merge_summary(tf.get_collection("dev"))
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
        self.summary_writer = tf.train.SummaryWriter("logs/" + timestamp + "_" + experiment_name)

    self.steps = 0
def __init__(self, CellType, is_training, config):
    """Build the graph of a multi-layer recurrent language model.

    Args:
        CellType: callable invoked as ``CellType(hidden_size)`` to create the
            recurrent cell that is stacked ``config.num_layers`` times.
        is_training: if true, dropout is enabled whenever
            ``config.keep_prob < 1`` and a gradient-descent training op with
            global-norm clipping is built; otherwise ``train_op`` is
            ``tf.no_op()``.
        config: hyper-parameter object; this method reads ``batch_size``,
            ``num_steps``, ``hidden_size``, ``vocab_size``, ``keep_prob``,
            ``num_layers``, ``init_scale`` and (training only)
            ``max_grad_norm``.
    """
    batch_size = config.batch_size
    num_steps = config.num_steps
    self.batch_size = batch_size
    self.num_steps = num_steps
    hidden_size = config.hidden_size
    vocab_size = config.vocab_size
    use_dropout = is_training and config.keep_prob < 1

    # Token ids in, next-token ids out; both are [batch_size, num_steps].
    self.input_data = tf.placeholder(
        tf.int32, [batch_size, num_steps], name="input_data")
    self.targets = tf.placeholder(
        tf.int32, [batch_size, num_steps], name="targets")

    # One cell object is reused for every layer of the stack.
    layer_cell = CellType(hidden_size)
    if use_dropout:
        layer_cell = rnn_cell.DropoutWrapper(
            layer_cell, output_keep_prob=config.keep_prob)
    cell = rnn_cell.MultiRNNCell([layer_cell] * config.num_layers)
    self.initial_state = cell.zero_state(batch_size, tf.float32)

    # Shared uniform initializer for every variable fetched via get_variable.
    initializer = tf.random_uniform_initializer(
        -config.init_scale, config.init_scale)

    # NOTE(review): source indentation was lost; the device block is assumed
    # to cover only the embedding variable and its lookup -- confirm.
    with tf.device("/cpu:0"):
        embedding = tf.get_variable(
            "embedding", [vocab_size, hidden_size], initializer=initializer)
        inputs = tf.nn.embedding_lookup(embedding, self.input_data)
    if use_dropout:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    # Manually unrolled RNN: variables are created at step 0 and reused after.
    outputs = []
    states = []
    state = self.initial_state
    with tf.variable_scope("RNN", initializer=initializer):
        for step in range(num_steps):
            if step > 0:
                tf.get_variable_scope().reuse_variables()
            step_input = inputs[:, step, :]
            step_output, state = cell(step_input, state)
            outputs.append(step_output)
            states.append(state)
    self.final_state = states[-1]

    # Flatten the per-step outputs to rows of hidden_size and project them
    # onto the vocabulary.
    joined = tf.concat(1, outputs)
    output = tf.reshape(joined, [-1, hidden_size])
    w = tf.get_variable(
        "softmax_w", [hidden_size, vocab_size], initializer=initializer)
    b = tf.get_variable("softmax_b", [vocab_size], initializer=initializer)
    logits = tf.nn.xw_plus_b(output, w, b)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([batch_size * num_steps])  # every position counts equally
    loss = sequence_loss_by_example(
        [logits], [flat_targets], [unit_weights], vocab_size)
    cost = tf.div(tf.reduce_sum(loss), batch_size, name="cost")
    self.cost = cost

    if not is_training:
        # Evaluation graphs only need a placeholder op.
        self.train_op = tf.no_op()
    else:
        # Non-trainable variable so the learning rate can be changed later.
        self.lr = tf.Variable(1.0, trainable=False)
        tvars = tf.trainable_variables()
        raw_grads = tf.gradients(cost, tvars)
        grads, _ = tf.clip_by_global_norm(raw_grads, config.max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(
            zip(grads, tvars), name="train")
def __init__(self, is_training, config):
    """Construct the LSTM language-model graph.

    Args:
        is_training: if true, dropout is applied when ``config.keep_prob < 1``
            and the FTRL training op is created; otherwise construction stops
            once cost and final state are defined.
        config: hyper-parameter object; this method reads ``batch_size``,
            ``num_steps``, ``hidden_size``, ``vocab_size``, ``keep_prob``,
            ``num_layers`` and (training only) ``max_grad_norm``.
    """
    batch_size = config.batch_size
    num_steps = config.num_steps
    self.batch_size = batch_size
    self.num_steps = num_steps
    hidden = config.hidden_size
    vocab_size = config.vocab_size
    use_dropout = is_training and config.keep_prob < 1

    self._input_data = tf.placeholder(
        tf.int32, [batch_size, num_steps], name="input_data")
    self._targets = tf.placeholder(
        tf.int32, [batch_size, num_steps], name="targets")

    # Stack of identical LSTM cells, optionally with output dropout.
    base_cell = rnn_cell.BasicLSTMCell(hidden, forget_bias=1.0)
    if use_dropout:
        base_cell = rnn_cell.DropoutWrapper(
            base_cell, output_keep_prob=config.keep_prob)
    cell = rnn_cell.MultiRNNCell([base_cell] * config.num_layers)
    self._initial_state = cell.zero_state(batch_size, tf.float32)

    # Embedding lookup is pinned to the CPU.
    # NOTE(review): source indentation was lost; the device block is assumed
    # to cover the lookup, split and squeeze -- confirm.
    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, hidden])
        embedded = tf.nn.embedding_lookup(embedding, self._input_data)
        per_step = tf.split(1, num_steps, embedded)
        inputs = [tf.squeeze(step, [1]) for step in per_step]
    if use_dropout:
        inputs = [tf.nn.dropout(step, config.keep_prob) for step in inputs]

    from tensorflow.models.rnn import rnn
    outputs, states = rnn.rnn(
        cell, inputs, initial_state=self._initial_state)

    # Flatten the step outputs to rows of `hidden` and project to the vocabulary.
    flat_outputs = tf.reshape(tf.concat(1, outputs), [-1, hidden])
    softmax_w = tf.get_variable("softmax_W", [hidden, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    logits = tf.nn.xw_plus_b(flat_outputs, softmax_w, softmax_b)
    self._softmax_out = tf.nn.softmax(logits)  # only used for sampling

    flat_targets = tf.reshape(self._targets, [-1])
    unit_weights = tf.ones([batch_size * num_steps])
    loss = seq2seq.sequence_loss_by_example(
        [logits], [flat_targets], [unit_weights], vocab_size)
    total_loss = tf.reduce_sum(loss)
    cost = tf.div(total_loss, tf.constant(batch_size, dtype=tf.float32))
    self._cost = cost
    self._final_state = states[-1]

    if not is_training:
        return  # no optimisation ops needed

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(
        tf.gradients(cost, tvars), config.max_grad_norm)
    # FTRL replaces plain gradient descent here.
    # NOTE(review): reads `self.lr` although only `self._lr` is assigned in
    # this method; presumably a property defined elsewhere -- confirm.
    optimizer = tf.train.FtrlOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(clipped, tvars))
tf.placeholder(tf.int32, shape=[_seq_length]) for _ in xrange(_batch_size) ] targets = [ tf.placeholder(tf.int32, shape=[_seq_length]) for _ in xrange(_batch_size) ] target_weights = [ tf.ones(dtype=tf.float32, shape=[_seq_length]) for _ in xrange(_batch_size) ] # set up the tied seq-to-seq LSTM with given parameters single_cell = rnn_cell.BasicLSTMCell(_lstm_cell_dimension) cell = rnn_cell.MultiRNNCell([single_cell] * _lstm_num_layers) outputs, _ = seq2seq.embedding_tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, _vocab_size_including_GO) seqloss = seq2seq.sequence_loss_by_example(outputs, encoder_inputs, target_weights, _vocab_size_including_GO) tf.train.SummaryWriter(_train_log_dir, sess.graph_def) global_step = tf.Variable(0, name='global_step', trainable=False) sess.run(tf.initialize_all_variables()) # Set up the optimizer with gradient clipping params = tf.trainable_variables() gradients = tf.gradients(seqloss, params) optimizer = tf.train.GradientDescentOptimizer(_lstm_learn_rate) clipped_gradients, norm = tf.clip_by_global_norm(gradients, _lstm_max_grad_norm) train_op = optimizer.apply_gradients(zip(clipped_gradients, params), global_step=global_step)
if __name__ == '__main__':
    # Script entry point: builds the generator-training graph and the
    # discriminator input/probability tensors.
    ops.reset_default_graph()
    # Close a session left over from a previous run in the same interpreter.
    if 'session' in globals():
        session.close()
    session = tf.Session()
    args = parse_args()

    # Generator Training
    input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    # Fixed: this placeholder was bound to the misspelled name `targts` while
    # the loss below reads `targets`, which raised NameError.
    targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])  # Should be 1 for real
    gen_seq = generator(input_data, args)
    gen_loss = seq2seq.sequence_loss_by_example(
        [discriminator(gen_seq, args)[1]],  # Input wants logits, not probs
        [tf.reshape(targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])],
        2)
    gen_cost = tf.reduce_sum(gen_loss) / args.batch_size / args.seq_length
    # Only variables whose name starts with "generator/" are updated here.
    gen_vars = [v for v in tf.all_variables()
                if v.name.startswith("generator/")]
    gen_optimizer = tf.train.AdamOptimizer(args.learning_rate_gen)
    gen_train_op = minimize_and_clip(
        gen_optimizer, objective=gen_cost, var_list=gen_vars)

    # Discriminator Training
    # TODO: Should this be tf.int32?
    # Fixed: `tf.placholder` does not exist; the function is `tf.placeholder`.
    input_real_seq = tf.placeholder(
        tf.float32, [args.batch_size, args.seq_length, args.vocab_size])
    input_gen_seq = tf.placeholder(
        tf.float32, [args.batch_size, args.seq_length, args.vocab_size])
    dis_real_prob = discriminator(input_real_seq, args)
    dis_fake_prob = discriminator(input_gen_seq, args)
def __init__(self, args, predict=False):
    """Build the stacked-LSTM sequence model graph.

    Args:
        args: object providing ``vocabSize``; stored as ``self.args``.
        predict: if true, the graph is built for batch size 1 and a single
            unrolled step, and each output is fed back as the next input via
            ``feedBack``. If false, the training sizes (50 x 50) are used and
            the decoder consumes the supplied inputs.
    """
    self.args = args

    # Various parameters for the LSTM.
    # Hardcoded here for now.
    numSteps = 50  # Steps to unroll for
    batchSize = 50
    rnnSize = 128
    numLayers = 2
    gradClip = 5

    # Fixed: the predict overrides used to run *before* the hard-coded sizes
    # above, so they were immediately overwritten and never took effect.
    if predict:
        batchSize = 1
        numSteps = 1

    # Create LSTM layer and stack multiple layers.
    lstmCell = rnn_cell.BasicLSTMCell(rnnSize)
    lstmNet = rnn_cell.MultiRNNCell([lstmCell] * numLayers)

    # Define placeholders.
    self.inputData = tf.placeholder(tf.int32, [batchSize, numSteps])
    self.targetOutput = tf.placeholder(tf.int32, [batchSize, numSteps])
    self.initialState = lstmNet.zero_state(batchSize, tf.float32)

    # If rnn_decoder is told to loop, this function returns the output at
    # time 't' for feeding as the input at time 't+1'. During training this
    # is not done because the *correct* input is fed at every step; during
    # prediction the output is looped back to generate the sequence.
    def feedBack(prev, _):
        prev = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    # NOTE(review): source indentation was lost; the device block is assumed
    # to be nested inside the 'nn_lstm' scope -- confirm.
    with tf.variable_scope('nn_lstm'):
        softmax_w = tf.get_variable("softmax_w", [rnnSize, args.vocabSize])
        softmax_b = tf.get_variable("softmax_b", [args.vocabSize])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable(
                "embedding", [args.vocabSize, rnnSize])
            inputs = tf.split(
                1, numSteps,
                tf.nn.embedding_lookup(embedding, self.inputData))
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    # Call seq2seq rnn decoder.
    outputs, states = seq2seq.rnn_decoder(
        inputs, self.initialState, lstmNet,
        loop_function=feedBack if predict else None,
        scope='nn_lstm')
    output = tf.reshape(tf.concat(1, outputs), [-1, rnnSize])

    # Logit and probability
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    # Calculate loss compared to targetOutput
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targetOutput, [-1])],
        [tf.ones([batchSize * numSteps])],
        args.vocabSize)

    # Set the cost to minimize total loss.
    self.cost = tf.reduce_sum(loss)
    self.finalState = states[-1]

    # The learning rate is a non-trainable variable initialised to 0.0;
    # nothing in this method assigns it another value.
    self.learningRate = tf.Variable(0.0, trainable=False)

    # Gradients of the cost w.r.t. all trainable variables, clipped by
    # global norm.
    trainableVars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, trainableVars), gradClip)

    # We use the Adam optimizer.
    optimizer = tf.train.AdamOptimizer(self.learningRate)
    self.trainStep = optimizer.apply_gradients(zip(grads, trainableVars))
def model_with_buckets(self):
    """Build one RNN model per entry of ``self.buckets`` and collect results.

    For bucket ``j``, ``self.buckets[j][0]`` is used as the number of recipe
    segments/targets (``phrase_num``) and ``self.buckets[j][1]`` as the number
    of steps taken from each input (``phrase_len``).

    Returns:
        A tuple ``(outputs, losses, costs)`` of lists with one entry per
        bucket: the softmax of every logit returned by
        ``self.build_rnn_model``, the result of
        ``seq2seq.sequence_loss_by_example``, and ``tf.reduce_sum`` of that
        loss.
    """
    #build an rnn model for each bucket, since tensor flow can't deal with variable length sequences
    #variables are shared across the different buckets, and if you only ask for the outputs
    #up to a certain bucket, then additional computation won't be done past the steps in that bucket
    #UNCOMMENT FOR ATTENTION
    all_inputs = self._input_refinement + [
        seg for seg in self._input_recipe_segments
    ] + self._target
    costs = []
    losses = []
    outputs = []
    # NOTE(review): the original indentation was lost. The repeated
    # "bucket_model_outside" scopes are laid out as siblings here; if they
    # were nested in the original, variable names created by build_rnn_model
    # would carry a doubled scope prefix -- confirm against the real file.
    with tf.op_scope(all_inputs, None, "model_with_buckets"):
        for j in xrange(len(self.buckets)):
            # Variables are created for the first bucket and reused afterwards.
            if j > 0:
                outside_reuse = True
            else:
                outside_reuse = None
            with tf.variable_scope("bucket_model_outside", reuse=outside_reuse):
                phrase_num = self.buckets[j][0]
                phrase_len = self.buckets[j][1]
                # First phrase_len refinement inputs for this bucket.
                bucket_refinement_inputs = [
                    self._input_refinement[i] for i in xrange(phrase_len)
                ]
                bucket_recipe_segments_inputs = []
                bucket_target = []
                bucket_weights = []
                forward_attention_weights = []
                #backward_attention_weights = []
            for i in xrange(phrase_num):
                with tf.variable_scope("bucket_model_outside", reuse=outside_reuse):
                    bucket_target.append(self._target[i])
                    # Every target gets a constant weight of 1 per batch entry.
                    bucket_weights.append(
                        tf.constant(1, dtype=np.float32, shape=[self._batch_size]))
                # if j > 0 and i < self.buckets[j-1][0]:
                #     with tf.variable_scope("attention", reuse=True):
                #         forward_weight = tf.get_variable("forward_attention_weight%d"%(i))
                #         #backward_weight = tf.get_variable("backward_attention_weight%d"%(i))
                # else:
                #     with tf.variable_scope("attention", reuse=None):
                #         forward_weight = tf.get_variable("forward_attention_weight%d"%(i), [self._batch_size], dtype=tf.float32)
                #         #backward_weight = tf.get_variable("backward_attention_weight%d"%(i), [self._batch_size], dtype=tf.float32)
                # forward_attention_weights.append(forward_weight)
                #backward_attention_weights.append(backward_weight)
                with tf.variable_scope("bucket_model_outside", reuse=outside_reuse):
                    # First phrase_len steps of recipe segment i.
                    bucket_recipe_segments_inputs.append([])
                    for k in xrange(phrase_len):
                        bucket_recipe_segments_inputs[-1].append(
                            self._input_recipe_segments[i][k])
            with tf.variable_scope("bucket_model_outside", reuse=outside_reuse):
                bucket_logits = self.build_rnn_model(
                    bucket_refinement_inputs, bucket_recipe_segments_inputs)
                #forward_attention_weights)#, backward_attention_weights)
                outputs.append([
                    tf.nn.softmax(bucket_logit) for bucket_logit in bucket_logits
                ])
                # The final argument (number of decoder symbols) is 2.
                loss = seq2seq.sequence_loss_by_example(
                    bucket_logits, bucket_target, bucket_weights, 2)
                losses.append(loss)
                costs.append(tf.reduce_sum(loss))
                tf.histogram_summary("cost_bucket_%d" % j, costs[-1])
                # for i,f in enumerate(forward_attention_weights):
                #     tf.histogram_summary("forward_attention_weight%d"%i, f)
                #tf.histogram_summary("backward_attention_weight%d"%i, backward_attention_weights[i])
    return outputs, losses, costs
def __init__(self, args, infer=False):
    """Build the character-level RNN language-model graph.

    Args:
        args: options object; this method reads ``model`` ('rnn', 'gru' or
            'lstm'), ``rnn_size``, ``num_layers``, ``batch_size``,
            ``seq_length``, ``vocab_size`` and ``grad_clip``. When ``infer``
            is true, ``args.batch_size`` and ``args.seq_length`` are
            overwritten with 1 on the passed-in object.
        infer: if true, the decoder feeds each step's arg-max symbol back as
            the next input instead of using the supplied inputs.

    Raises:
        Exception: if ``args.model`` is not one of the supported names.
    """
    self.args = args
    if infer:
        # Sampling graph: one sequence, one step at a time.
        args.batch_size = 1
        args.seq_length = 1

    # Map the model name to the rnn_cell class to instantiate.
    cell_class_names = {
        'rnn': 'BasicRNNCell',
        'gru': 'GRUCell',
        'lstm': 'BasicLSTMCell',
    }
    class_name = cell_class_names.get(args.model)
    if class_name is None:
        raise Exception("model type not supported: {}".format(args.model))
    cell_fn = getattr(rnn_cell, class_name)

    single_cell = cell_fn(args.rnn_size)
    cell = rnn_cell.MultiRNNCell([single_cell] * args.num_layers)
    self.cell = cell

    # Placeholders for the input ids and the target ids.
    io_shape = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.int32, io_shape)
    self.targets = tf.placeholder(tf.int32, io_shape)
    # Initial state of the cell memory.
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    # Shareable variables live under the 'rnnlm' scope ("rnnlm/softmax_w").
    # NOTE(review): source indentation was lost; the device block is assumed
    # to be nested inside the 'rnnlm' scope -- confirm.
    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable(
            "softmax_w", [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            # Embedding matrix with one row of size rnn_size per symbol.
            embedding = tf.get_variable(
                "embedding", [args.vocab_size, args.rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            step_slices = tf.split(1, args.seq_length, embedded)
            inputs = [tf.squeeze(piece, [1]) for piece in step_slices]

    def loop(prev, _):
        # Project the previous output to logits, take the arg-max symbol
        # (no gradient through it) and return its embedding.
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    # Recurrent network
    feedback_fn = loop if infer else None
    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=feedback_fn, scope='rnnlm')
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])

    # Output projection and softmax probabilities.
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    # Per-position loss with all weights equal to one.
    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], args.vocab_size)

    # Cost is the summed loss divided by batch size and sequence length.
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    # Training op: Adam on globally clipped gradients.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
# add dropout here if needed # create outputs and states outputs, states = rnn.rnn(cell, inputs, initial_state=initial_state) # reshape output output = tf.reshape(tf.concat(1, outputs), [-1, hidden_size]) # specify XW + b logits = tf.nn.xw_plus_b(output, tf.get_variable('softmax_w', [hidden_size, vocab_size]), tf.get_variable('softmax_b', [vocab_size])) # define loss loss = seq2seq.sequence_loss_by_example([logits], [tf.reshape(targets, [-1])], [tf.ones([batch_size * num_steps])], vocab_size) # define individual cost _cost = tf.reduce_sum(loss) / batch_size # get final state final_state = states[-1] # create learning rate variable _lr = tf.Variable(0.0, trainable=False) # define gradient clipping tvars = tf.trainable_variables() grads, _ = tf.clip_by_global_norm(tf.gradients(_cost, tvars), max_grad_norm)
def __init__(self, sess, params, vocabs_size):
    """Build the bidirectional GRU tagging graph and initialise variables.

    Args:
        sess: TensorFlow session; its graph is handed to the summary writer
            and it runs the variable initialiser.
        params: mapping read via ``get`` for ``batch_size``, ``max_length``,
            ``size``, ``num_layers``, ``learning_rate``, ``embedding_size``
            and ``corpus_name``.
        vocabs_size: forwarded to ``NNModel.Model.__init__``; this method
            afterwards reads ``self.vocab_sizes[0]`` and
            ``self.vocab_sizes[1]`` (presumably set by that base initialiser
            -- confirm).
    """
    NNModel.Model.__init__(self, vocabs_size)
    self.params = params
    self.batch_size = params.get("batch_size")
    self.max_length = params.get("max_length")
    self.size = params.get("size")
    self.num_layers = params.get("num_layers")
    # The learning rate is kept as a plain value rather than a tf.Variable.
    self.learning_rate = params.get("learning_rate")
    self.embedding_size = params.get("embedding_size")
    self.incorrect = [0] * self.max_length
    self.global_step = 0
    self.corpus_name = params.get("corpus_name")

    summary = (
        "BiRNN model created with {0} layers of {1} cells. Embedding = {2}. Vocabulary sizes = {3}, length = {4}, batch = {5}."
        .format(self.num_layers, self.size, self.embedding_size,
                vocabs_size, self.max_length, self.batch_size))
    logging.info(summary)

    # NOTE(review): when num_layers > 1 each stack holds the input cell plus
    # num_layers further cells, i.e. num_layers + 1 in total -- confirm that
    # this is intended.
    # forward RNN
    with tf.variable_scope('forward'):
        first_fw = rnn_cell.GRUCell(self.size, input_size=self.embedding_size)
        forward_cell = first_fw
        if self.num_layers > 1:
            deeper_fw = rnn_cell.GRUCell(self.size)
            forward_cell = rnn_cell.MultiRNNCell(
                [first_fw] + ([deeper_fw] * self.num_layers))

    # backward RNN
    with tf.variable_scope('backward'):
        first_bw = rnn_cell.GRUCell(self.size, input_size=self.embedding_size)
        backward_cell = first_bw
        if self.num_layers > 1:
            deeper_bw = rnn_cell.GRUCell(self.size)
            backward_cell = rnn_cell.MultiRNNCell(
                [first_bw] + ([deeper_bw] * self.num_layers))

    # One int32 placeholder per time step for inputs and for targets.
    self.inputs = [
        tf.placeholder(tf.int32, shape=[None], name="inputs{0}".format(step))
        for step in range(self.max_length)
    ]
    self.targets = [
        tf.placeholder(tf.int32, shape=[None], name="targets{0}".format(step))
        for step in range(self.max_length)
    ]
    self.sentence_lengths = tf.placeholder(
        tf.int64, shape=[None], name="sequence_lengths")
    # Dropout is a real-valued scalar fed at run time.
    self.dropout_placeholder = tf.placeholder(
        tf.float32, shape=[], name="dropout")

    embedding_init = tf.random_uniform(
        [self.vocab_sizes[0], self.embedding_size], -1.0, 1.0)
    self.word_embeddings = tf.Variable(embedding_init)
    embedded_inputs = [
        tf.nn.embedding_lookup(self.word_embeddings, step_ids)
        for step_ids in self.inputs
    ]
    dropped_embedded_inputs = [
        tf.nn.dropout(embedded, self.dropout_placeholder)
        for embedded in embedded_inputs
    ]

    # Output projection; its input width is 2 * size because forward and
    # backward outputs are combined.
    out_weight = tf.Variable(
        tf.random_uniform([2 * self.size, self.vocab_sizes[1]]),
        name="out-weight")
    out_bias = tf.Variable(
        tf.random_uniform([self.vocab_sizes[1]]), name="out-bias")

    # Hack: silence log output while the RNN is being created.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.CRITICAL)
    with tf.variable_scope('BiRNN-net'):
        outputs = rnn.bidirectional_rnn(
            forward_cell, backward_cell, dropped_embedded_inputs,
            sequence_length=self.sentence_lengths, dtype=tf.float32)
    logging.getLogger().setLevel(logging.INFO)

    self.out = []
    self.probs = []
    # After the switch to TF 0.8 the call started returning extra merged
    # values for the forward and backward cells, hence the [0].
    for step_output in outputs[0]:
        # TODO: add tf.nn.relu(matmul + bias)?
        step_logits = tf.matmul(step_output, out_weight) + out_bias
        self.out.append(step_logits)
        self.probs.append(tf.nn.softmax(step_logits))

    # The same all-ones weight tensor is used for every time step.
    step_weights = [tf.ones([self.batch_size])] * self.max_length
    loss = seq2seq.sequence_loss_by_example(
        self.out, self.targets, step_weights, self.vocab_sizes[1])
    self.cost = tf.reduce_sum(loss) / self.batch_size
    tf.scalar_summary("Cost", self.cost)

    # NOTE(review): the optimiser is given `loss`, not `self.cost`.
    adam = tf.train.AdamOptimizer(self.learning_rate)
    self.updates = adam.minimize(loss)

    self.saver = tf.train.Saver(max_to_keep=0)  # don't remove old models
    self.summaries = tf.merge_all_summaries()
    self.sum_writer = tf.python.training.summary_io.SummaryWriter(
        "tmp", sess.graph)

    # Initialise the variables and launch the graph.
    sess.run(tf.initialize_all_variables())
    logging.info("BiRNN model initialized.")
def __init__(self, is_training, config):
    """Construct the LSTM language-model graph.

    Args:
        is_training: if true, dropout is applied when ``config.keep_prob < 1``
            and the FTRL training op is created; otherwise construction stops
            once cost and final state are defined.
        config: hyper-parameter object; this method reads ``batch_size``,
            ``num_steps``, ``hidden_size``, ``vocab_size``, ``keep_prob``,
            ``num_layers`` and (training only) ``max_grad_norm``.
    """
    batch_size = config.batch_size
    num_steps = config.num_steps
    self.batch_size = batch_size
    self.num_steps = num_steps
    width = config.hidden_size
    vocab_size = config.vocab_size
    apply_dropout = is_training and config.keep_prob < 1

    placeholder_shape = [batch_size, num_steps]
    self._input_data = tf.placeholder(
        tf.int32, placeholder_shape, name="input_data")
    self._targets = tf.placeholder(
        tf.int32, placeholder_shape, name="targets")

    # Stack of identical LSTM cells, optionally with output dropout.
    one_layer = rnn_cell.BasicLSTMCell(width, forget_bias=1.0)
    if apply_dropout:
        one_layer = rnn_cell.DropoutWrapper(
            one_layer, output_keep_prob=config.keep_prob)
    cell = rnn_cell.MultiRNNCell([one_layer] * config.num_layers)
    self._initial_state = cell.zero_state(batch_size, tf.float32)

    # Embedding lookup is pinned to the CPU.
    # NOTE(review): source indentation was lost; the device block is assumed
    # to cover the lookup, split and squeeze -- confirm.
    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, width])
        looked_up = tf.nn.embedding_lookup(embedding, self._input_data)
        slices = tf.split(1, num_steps, looked_up)
        inputs = [tf.squeeze(piece, [1]) for piece in slices]
    if apply_dropout:
        inputs = [tf.nn.dropout(piece, config.keep_prob) for piece in inputs]

    from tensorflow.models.rnn import rnn
    outputs, states = rnn.rnn(
        cell, inputs, initial_state=self._initial_state)

    # Flatten the step outputs to rows of `width` and project to the vocabulary.
    merged = tf.concat(1, outputs)
    outputs = tf.reshape(merged, [-1, width])
    proj_w = tf.get_variable("softmax_W", [width, vocab_size])
    proj_b = tf.get_variable("softmax_b", [vocab_size])
    logits = tf.nn.xw_plus_b(outputs, proj_w, proj_b)
    self._softmax_out = tf.nn.softmax(logits)  # only used for sampling

    target_ids = tf.reshape(self._targets, [-1])
    ones = tf.ones([batch_size * num_steps])
    loss = seq2seq.sequence_loss_by_example(
        [logits], [target_ids], [ones], vocab_size)
    summed = tf.reduce_sum(loss)
    cost = tf.div(summed, tf.constant(batch_size, dtype=tf.float32))
    self._cost = cost
    self._final_state = states[-1]

    if not is_training:
        return  # no optimisation ops needed

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    gradients = tf.gradients(cost, tvars)
    grads, _ = tf.clip_by_global_norm(gradients, config.max_grad_norm)
    # FTRL replaces plain gradient descent here.
    # NOTE(review): reads `self.lr` although only `self._lr` is assigned in
    # this method; presumably a property defined elsewhere -- confirm.
    optimizer = tf.train.FtrlOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, is_training, image_tensor, config, global_step_tensor):
    """Build an LSTM language model whose outputs are joined with image features.

    Args:
        is_training: if true, dropout is applied when ``config.keep_prob < 1``
            and the Adam training op plus summaries are created; otherwise
            ``_train_op`` is ``tf.no_op()``.
        image_tensor: image input; it is local-response-normalised and fed to
            ``alexnet.AlexNet`` under the key ``'data'``.
        config: hyper-parameter object; this method reads ``batch_size``,
            ``num_steps``, ``hidden_size``, ``vocab_size``, ``keep_prob``,
            ``num_layers``, ``image_features_size`` and (training only)
            ``max_grad_norm``.
        global_step_tensor: passed as ``global_step`` to ``apply_gradients``.
    """
    batch_size = config.batch_size
    num_steps = config.num_steps
    self.batch_size = batch_size
    self.num_steps = num_steps
    size = config.hidden_size
    vocab_size = config.vocab_size
    with_dropout = is_training and config.keep_prob < 1

    # Image branch: normalise, then run through a non-trainable AlexNet.
    image_tensor = tf.nn.local_response_normalization(image_tensor)
    self.alexnet = alexnet.AlexNet({'data': image_tensor}, trainable=False)
    self.image_input = image_tensor

    self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

    # Text branch: stacked peephole LSTM with cell clipping.
    layer = rnn_cell.LSTMCell(size, size, use_peepholes=True, cell_clip=2.)
    if with_dropout:
        layer = rnn_cell.DropoutWrapper(
            layer, output_keep_prob=config.keep_prob)
    cell = rnn_cell.MultiRNNCell([layer] * config.num_layers)
    self._initial_state = cell.zero_state(batch_size, tf.float32)

    # NOTE(review): source indentation was lost; the device block is assumed
    # to cover only the embedding variable and its lookup -- confirm.
    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, size])
        inputs = tf.nn.embedding_lookup(embedding, self._input_data)
    if with_dropout:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    step_slices = tf.split(1, num_steps, inputs)
    inputs = [tf.squeeze(piece, [1]) for piece in step_slices]
    outputs, states = rnn.rnn(cell, inputs, initial_state=self._initial_state)

    fc8 = self.alexnet.layers['fc8']
    print(fc8.get_shape())
    # Project the 'fc8' layer to image_features_size sigmoid features.
    # NOTE(review): the name scope is assumed to end after `image_features`.
    with tf.name_scope('image_features'):
        w = tf.get_variable('Weights', [1000, config.image_features_size])
        b = tf.get_variable('Biases', [config.image_features_size])
        image_features = tf.nn.sigmoid(tf.matmul(fc8, w) + b)
    image_features_size = config.image_features_size

    # Append the image features to every step output.
    outputs = [
        tf.concat(1, [step_out, image_features]) for step_out in outputs
    ]
    new_size = size + image_features_size
    joined = tf.concat(1, outputs)
    output = tf.reshape(joined, [-1, new_size])
    self.outputs = output

    softmax_w = tf.get_variable("softmax_w", [new_size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)

    weights = tf.ones([batch_size * num_steps])
    flat_targets = tf.reshape(self._targets, [-1])
    loss = seq2seq.sequence_loss_by_example(
        [logits], [flat_targets], [weights], vocab_size)
    self.logits = logits
    cost = tf.reduce_sum(loss) * (1.0 / batch_size)
    self._cost = cost
    self._final_state = states[-1]
    self.probs = tf.nn.softmax(logits)

    if not is_training:
        self._train_op = tf.no_op()
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(cost, tvars)
    grads, _ = tf.clip_by_global_norm(raw_grads, config.max_grad_norm)
    # NOTE(review): reads `self.lr` although only `self._lr` is assigned in
    # this method; presumably a property defined elsewhere -- confirm.
    optimizer = tf.train.AdamOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars), global_step=global_step_tensor)

    # Summaries exist only on the training graph. The scalar tagged
    # 'perplexity' records `cost` as computed above.
    tf.scalar_summary('perplexity', cost)
    tf.histogram_summary('loss', loss)
    tf.histogram_summary('probs', self.probs)
def __init__(self, is_training, config):
    """Build the unrolled multi-layer LSTM language-model graph.

    Args:
        is_training: if true, dropout is applied when ``config.keep_prob < 1``
            and the gradient-descent training op is created; otherwise
            construction stops once cost and final state are defined.
        config: hyper-parameter object; this method reads ``batch_size``,
            ``num_steps``, ``hidden_size``, ``vocab_size``, ``keep_prob``,
            ``num_layers`` and (training only) ``max_grad_norm``.
    """
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    size = config.hidden_size
    vocab_size = config.vocab_size

    self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

    # Slightly better results can be obtained with forget gate biases
    # initialized to 1 but the hyperparameters of the model would need to be
    # different than reported in the paper.
    lstm_cell = rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
    if is_training and config.keep_prob < 1:
        lstm_cell = rnn_cell.DropoutWrapper(
            lstm_cell, output_keep_prob=config.keep_prob)
    cell = rnn_cell.MultiRNNCell([lstm_cell] * config.num_layers)

    self._initial_state = cell.zero_state(batch_size, tf.float32)

    # NOTE(review): source indentation was lost; the device block is assumed
    # to cover only the embedding variable and its lookup -- confirm.
    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, size])
        inputs = tf.nn.embedding_lookup(embedding, self._input_data)

    if is_training and config.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    # Simplified version of tensorflow.models.rnn.rnn.py's rnn().
    # This builds an unrolled LSTM for tutorial purposes only.
    # In general, use the rnn() or state_saving_rnn() from rnn.py.
    #
    # The alternative version of the code below is:
    #
    # from tensorflow.models.rnn import rnn
    # inputs = [tf.squeeze(input_, [1])
    #           for input_ in tf.split(1, num_steps, inputs)]
    # outputs, states = rnn.rnn(cell, inputs, initial_state=self._initial_state)
    outputs = []
    states = []
    state = self._initial_state
    with tf.variable_scope("RNN"):
        for time_step in range(num_steps):
            # Variables are created at step 0 and reused afterwards.
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            (cell_output, state) = cell(inputs[:, time_step, :], state)
            outputs.append(cell_output)
            states.append(state)

    output = tf.reshape(tf.concat(1, outputs), [-1, size])
    logits = tf.nn.xw_plus_b(output,
                             tf.get_variable("softmax_w", [size, vocab_size]),
                             tf.get_variable("softmax_b", [vocab_size]))
    # Fixed: this used `self.logits`, which is never assigned in this method;
    # the logits exist only as the local `logits`.
    self.probs = tf.nn.softmax(logits)
    loss = seq2seq.sequence_loss_by_example(
        [logits],
        [tf.reshape(self._targets, [-1])],
        [tf.ones([batch_size * num_steps])],
        vocab_size)
    self._cost = cost = tf.reduce_sum(loss) / batch_size
    self._final_state = states[-1]

    if not is_training:
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)
    # NOTE(review): reads `self.lr` although only `self._lr` is assigned in
    # this method; presumably a property defined elsewhere -- confirm.
    optimizer = tf.train.GradientDescentOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, args, infer=False):
    """Build a uni- or bidirectional RNN language-model graph.

    Args:
        args: options object; this method reads ``model`` ('rnn', 'gru' or
            'lstm'), ``activation`` ('tanh', 'sigmoid' or 'relu'),
            ``bidirectional``, ``rnn_size``, ``num_layers``, ``batch_size``,
            ``seq_length``, ``vocab_size``, ``dropout`` and ``grad_clip``.
            When ``infer`` is true, ``args.batch_size`` and
            ``args.seq_length`` are overwritten with 1 on the passed-in
            object.
        infer: if true (one-directional path only), the decoder feeds each
            step's arg-max symbol back as the next input.

    Raises:
        Exception: if ``args.model`` or ``args.activation`` is unsupported.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    if args.model == 'rnn':
        cell_fn = jzRNNCell
    elif args.model == 'gru':
        cell_fn = jzGRUCell
    elif args.model == 'lstm':
        cell_fn = jzLSTMCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    if args.activation == 'tanh':
        cell_af = tf.tanh
    elif args.activation == 'sigmoid':
        cell_af = tf.sigmoid
    elif args.activation == 'relu':
        cell_af = tf.nn.relu
    else:
        raise Exception(
            "activation function not supported: {}".format(args.activation))

    self.input_data = tf.placeholder(
        tf.int32, [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(
        tf.int32, [args.batch_size, args.seq_length])

    # NOTE(review): source indentation was lost; the device block is assumed
    # to be nested inside the 'rnnlm' scope -- confirm.
    with tf.variable_scope('rnnlm'):
        # The projection input width doubles when forward and backward
        # outputs are combined.
        if not args.bidirectional:
            softmax_w = tf.get_variable(
                "softmax_w", [args.rnn_size, args.vocab_size])
        else:
            softmax_w = tf.get_variable(
                "softmax_w", [args.rnn_size * 2, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable(
                "embedding", [args.vocab_size, args.rnn_size])
            inputs = tf.split(
                1, args.seq_length,
                tf.nn.embedding_lookup(embedding, self.input_data))
            # args.dropout is passed straight to tf.nn.dropout.
            inputs = [tf.nn.dropout(tf.squeeze(input_, [1]), args.dropout)
                      for input_ in inputs]

    if not args.bidirectional:
        # One-directional RNN.
        cell = cell_fn(args.rnn_size, activation=cell_af)
        self.cell = cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)
        self.initial_state = cell.zero_state(args.batch_size, tf.float32)

        def loop(prev, _):
            # Feed the embedding of the arg-max symbol back as next input.
            prev = tf.matmul(prev, softmax_w) + softmax_b
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(embedding, prev_symbol)

        outputs, last_state = seq2seq.rnn_decoder(
            inputs, self.initial_state, cell,
            loop_function=loop if infer else None, scope='rnnlm')
        output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    else:
        # Bi-directional RNN.
        lstm_fw = cell_fn(args.rnn_size, activation=cell_af)
        lstm_bw = cell_fn(args.rnn_size, activation=cell_af)
        self.lstm_fw = lstm_fw = rnn_cell.MultiRNNCell(
            [lstm_fw] * args.num_layers)
        self.lstm_bw = lstm_bw = rnn_cell.MultiRNNCell(
            [lstm_bw] * args.num_layers)
        self.initial_state_fw = lstm_fw.zero_state(
            args.batch_size, tf.float32)
        self.initial_state_bw = lstm_bw.zero_state(
            args.batch_size, tf.float32)
        # NOTE(review): `sequence_length` receives the scalar batch size
        # here; left unchanged, but it looks unintended -- confirm.
        outputs, final_state_fw, final_state_bw = rnn.bidirectional_rnn(
            lstm_fw, lstm_bw, inputs,
            initial_state_fw=self.initial_state_fw,
            initial_state_bw=self.initial_state_bw,
            sequence_length=args.batch_size)
        # Fixed: `last_state` was never bound on this path, so the shared
        # `self.final_state = last_state` below raised NameError. Both
        # directional final states are now kept as a pair.
        # NOTE(review): confirm the form callers expect for final_state.
        last_state = (final_state_fw, final_state_bw)
        output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size * 2])

    self.logits = tf.matmul(
        tf.nn.dropout(output, args.dropout), softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])],
        args.vocab_size)
    # Cost is the summed loss divided by batch size and sequence length.
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    # Training op: Adam on globally clipped gradients.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, tvars), args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, is_training, config):
    """Assemble the unrolled multi-layer LSTM language-model graph.

    Args:
        is_training: When false, construction stops once the cost and the
            final state are defined, so no learning-rate variable or train
            op is created. Dropout is only added when this is true.
        config: Object supplying ``batch_size``, ``num_steps``,
            ``hidden_size``, ``vocab_size``, ``keep_prob``, ``num_layers``
            and ``max_grad_norm``.
    """
    n_batch = config.batch_size
    n_steps = config.num_steps
    self.batch_size = n_batch
    self.num_steps = n_steps
    hidden_dim = config.hidden_size
    n_vocab = config.vocab_size
    use_dropout = is_training and config.keep_prob < 1

    self._input_data = tf.placeholder(tf.int32, [n_batch, n_steps])
    self._targets = tf.placeholder(tf.int32, [n_batch, n_steps])

    # A forget-gate bias of 1 tends to give slightly better results, but
    # the remaining hyperparameters would then have to differ from the
    # ones reported in the paper, so 0 is used here.
    base_cell = rnn_cell.BasicLSTMCell(hidden_dim, forget_bias=0.0)
    if use_dropout:
        base_cell = rnn_cell.DropoutWrapper(
            base_cell, output_keep_prob=config.keep_prob)
    stacked_cell = rnn_cell.MultiRNNCell([base_cell] * config.num_layers)

    self._initial_state = stacked_cell.zero_state(n_batch, tf.float32)

    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [n_vocab, hidden_dim])
        embedded = tf.nn.embedding_lookup(embedding, self._input_data)

    if use_dropout:
        embedded = tf.nn.dropout(embedded, config.keep_prob)

    # Hand-rolled unrolling of the recurrence, kept for tutorial purposes;
    # rnn.py's rnn() / state_saving_rnn() are the general-purpose route:
    #
    #   from tensorflow.models.rnn import rnn
    #   inputs = [tf.squeeze(input_, [1])
    #             for input_ in tf.split(1, num_steps, inputs)]
    #   outputs, states = rnn.rnn(cell, inputs,
    #                             initial_state=self._initial_state)
    step_outputs = []
    step_states = []
    current_state = self._initial_state
    with tf.variable_scope("RNN"):
        for t in range(n_steps):
            if t > 0:
                # Every step after the first shares the cell weights.
                tf.get_variable_scope().reuse_variables()
            step_output, current_state = stacked_cell(
                embedded[:, t, :], current_state)
            step_outputs.append(step_output)
            step_states.append(current_state)

    flat_output = tf.reshape(tf.concat(1, step_outputs), [-1, hidden_dim])
    softmax_w = tf.get_variable("softmax_w", [hidden_dim, n_vocab])
    softmax_b = tf.get_variable("softmax_b", [n_vocab])
    logits = tf.nn.xw_plus_b(flat_output, softmax_w, softmax_b)

    flat_targets = tf.reshape(self._targets, [-1])
    unit_weights = tf.ones([n_batch * n_steps])
    loss = seq2seq.sequence_loss_by_example(
        [logits], [flat_targets], [unit_weights], n_vocab)
    cost = tf.reduce_sum(loss) / n_batch
    self._cost = cost
    self._final_state = step_states[-1]

    if is_training:
        self._lr = tf.Variable(0.0, trainable=False)
        trainable = tf.trainable_variables()
        raw_grads = tf.gradients(cost, trainable)
        clipped_grads, _ = tf.clip_by_global_norm(
            raw_grads, config.max_grad_norm)
        # self.lr is read through the instance (presumably a property
        # defined elsewhere on the class that exposes _lr).
        sgd = tf.train.GradientDescentOptimizer(self.lr)
        self._train_op = sgd.apply_gradients(zip(clipped_grads, trainable))
def __init__(self, is_training, learning_rate=1.0, optimizer="sgd",
             max_grad_norm=5, num_layers=2, use_lstm=True, num_steps=35,
             num_steps_valid=120, proj_size=650, hidden_size=650,
             hidden_proj=650, num_samples=512, init_scale=0.1,
             dropout_rate=0.0, lr_decay=0.8, batch_size=20, attentive=False,
             projection_attention_f=None, output_form=lm_ops.OUTPUT_CONCAT,
             vocab_size=10000):
    """Build the training and validation graphs of the language model.

    Args:
        is_training: When false, construction stops after the costs and
            final states are defined (no train op, no savers).
        learning_rate: Initial value of the ``learning_rate`` variable; also
            handed to ``optimization_ops.get_optimizer``.
        optimizer: Optimizer identifier passed to
            ``optimization_ops.get_optimizer``.
        max_grad_norm: Global-norm threshold used to clip the gradients.
        num_layers, use_lstm, hidden_size, proj_size, hidden_proj,
            dropout_rate: Forwarded to ``cells.build_lm_multicell_rnn``
            (``hidden_proj`` only when positive). ``proj_size`` is also the
            embedding width.
        num_steps: Number of per-time-step training placeholders.
        num_steps_valid: Number of per-time-step validation placeholders.
        num_samples: Sampled-softmax sample count; sampling is enabled only
            when ``0 < num_samples < vocab_size``.
        init_scale: Half-width of the uniform range used to initialise the
            embedding and the variables created under the "RNN" scope.
        lr_decay: Factor applied by ``learning_rate_decay_op``.
        batch_size: Training batch size.
        attentive: Use ``lm_ops.apply_attentive_lm`` instead of
            ``lm_ops.apply_lm``; requires ``projection_attention_f``.
        projection_attention_f: Forwarded to ``apply_attentive_lm``.
        output_form: Forwarded to ``apply_attentive_lm``.
        vocab_size: Vocabulary size.
    """
    with tf.device("/gpu:0"):
        if attentive:
            assert projection_attention_f is not None

        self.batch_size = batch_size
        self.num_steps = num_steps
        self.num_steps_valid = num_steps_valid

        # One placeholder triple (input, target, mask) per time step. The
        # lists are filled through the un-prefixed names, which presumably
        # are properties defined elsewhere on the class.
        self._input_data_train = []
        self._targets_train = []
        self.mask_train = []
        for i in xrange(num_steps):
            self.input_data_train.append(tf.placeholder(
                tf.int32, shape=[None], name="input_train{0}".format(i)))
            self.targets_train.append(tf.placeholder(
                tf.int32, shape=[None], name="target_train{0}".format(i)))
            self.mask_train.append(tf.placeholder(
                tf.float32, shape=[None], name="mask_train{0}".format(i)))

        self._input_data_valid = []
        self._targets_valid = []
        self.mask_valid = []
        for i in xrange(num_steps_valid):
            self.input_data_valid.append(tf.placeholder(
                tf.int32, shape=[None], name="input_valid{0}".format(i)))
            self.targets_valid.append(tf.placeholder(
                tf.int32, shape=[None], name="target_valid{0}".format(i)))
            self.mask_valid.append(tf.placeholder(
                tf.float32, shape=[None], name="mask_valid{0}".format(i)))

        hidden_projection = hidden_proj if hidden_proj > 0 else None

        self.cell = cells.build_lm_multicell_rnn(
            num_layers, hidden_size, proj_size, use_lstm=use_lstm,
            hidden_projection=hidden_projection, dropout=dropout_rate)

        self.dropout_feed = tf.placeholder(tf.float32, name="dropout_rate")

        self._initial_state_train = self.cell.zero_state(
            batch_size, tf.float32)
        self._initial_state_valid = self.cell.zero_state(1, tf.float32)

        # learning rate ops
        self.learning_rate = tf.Variable(
            float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * lr_decay)

        # epoch ops
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_update_op = self.epoch.assign(self.epoch + 1)

        # samples seen ops
        self.samples_seen = tf.Variable(0, trainable=False)
        self.samples_seen_update_op = self.samples_seen.assign(
            self.samples_seen + batch_size)
        self.samples_seen_reset_op = self.samples_seen.assign(0)

        # global step variable - controlled by the model
        self.global_step = tf.Variable(0.0, trainable=False)

        # average loss ops
        self.current_ppx = tf.Variable(1.0, trainable=False)
        self.current_loss = tf.Variable(0.0, trainable=False)

        # early-stopping bookkeeping
        self.best_eval_ppx = tf.Variable(numpy.inf, trainable=False)
        self.estop_counter = tf.Variable(0, trainable=False)
        self.estop_counter_update_op = self.estop_counter.assign(
            self.estop_counter + 1)
        self.estop_counter_reset_op = self.estop_counter.assign(0)

        # Symmetric range around zero. The original used
        # minval=init_scale == maxval, which initialises every weight to
        # the same constant.
        initializer = tf.random_uniform_initializer(
            minval=-init_scale, maxval=init_scale, seed=_SEED)

        # Width of the RNN outputs that feed the softmax projection.
        out_proj = hidden_proj if hidden_proj > 0 else hidden_size

        with tf.device("/cpu:0"):
            w = tf.get_variable("proj_w", [out_proj, vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable("proj_b", [vocab_size])
        self.output_projection = (w, b)

        # Sampled softmax only makes sense if we sample fewer classes than
        # the vocabulary holds. None selects the library's default softmax.
        sampled_softmax = False
        loss_function = None
        if 0 < num_samples < vocab_size:
            sampled_softmax = True

            def sampled_loss(logits, labels):
                # `logits` are the unprojected RNN outputs here; the
                # projection happens inside sampled_softmax_loss.
                with tf.device("/cpu:0"):
                    labels = tf.reshape(labels, [-1, 1])
                    losses = tf.nn.sampled_softmax_loss(
                        w_t, b, logits, labels, num_samples, vocab_size)
                    return losses

            loss_function = sampled_loss

        with tf.device("/cpu:0"):
            embedding = tf.Variable(
                tf.random_uniform([vocab_size, proj_size],
                                  minval=-init_scale, maxval=init_scale),
                name="embedding")
            # One embedded tensor per time-step placeholder.
            inputs_train = [tf.nn.embedding_lookup(embedding, i)
                            for i in self.input_data_train]
            inputs_valid = [tf.nn.embedding_lookup(embedding, i)
                            for i in self.input_data_valid]

        with tf.variable_scope("RNN", initializer=initializer):
            # Sequence lengths are obtained by summing the per-step masks.
            if attentive:
                outputs_train, state_train, _ = lm_ops.apply_attentive_lm(
                    self.cell, inputs_train,
                    sequence_length=array_ops.squeeze(
                        math_ops.add_n(self.mask_train)),
                    projection_attention_f=projection_attention_f,
                    output_form=output_form,
                    dropout=self.dropout_feed,
                    initializer=initializer,
                    dtype=tf.float32)
                outputs_valid, state_valid, _ = lm_ops.apply_attentive_lm(
                    self.cell, inputs_valid,
                    sequence_length=array_ops.squeeze(
                        math_ops.add_n(self.mask_valid)),
                    projection_attention_f=projection_attention_f,
                    output_form=output_form,
                    dropout=self.dropout_feed,
                    initializer=initializer,
                    dtype=tf.float32)
            else:
                outputs_train, state_train = lm_ops.apply_lm(
                    self.cell, inputs_train,
                    sequence_length=math_ops.add_n(self.mask_train),
                    dropout=self.dropout_feed,
                    dtype=tf.float32)
                outputs_valid, state_valid = lm_ops.apply_lm(
                    self.cell, inputs_valid,
                    sequence_length=math_ops.add_n(self.mask_valid),
                    dropout=self.dropout_feed,
                    dtype=tf.float32)

        if not sampled_softmax:
            # Full softmax: project every output to vocabulary logits.
            logits_train = [tf.nn.xw_plus_b(o, w, b) for o in outputs_train]
            logits_valid = [tf.nn.xw_plus_b(o, w, b) for o in outputs_valid]
        else:
            # Sampled softmax: the loss function does the projection.
            logits_train = outputs_train
            logits_valid = outputs_valid

        # The loss function must be passed through; without it the
        # unprojected outputs of the sampled-softmax path would be
        # interpreted as vocabulary logits.
        # NOTE(review): under sampled softmax the validation loss is an
        # approximation as well - confirm that is acceptable.
        loss_train = seq2seq.sequence_loss_by_example(
            logits_train, self.targets_train, self.mask_train,
            average_across_timesteps=True,
            softmax_loss_function=loss_function)
        loss_valid = seq2seq.sequence_loss_by_example(
            logits_valid, self.targets_valid, self.mask_valid,
            average_across_timesteps=True,
            softmax_loss_function=loss_function)

        self._cost_train = cost = tf.reduce_sum(loss_train) / float(
            batch_size)
        self._final_state_train = state_train

        # NOTE(review): divides by the training batch size although the
        # validation initial state is built for batch size 1 - confirm.
        self._cost_valid = tf.reduce_sum(loss_valid) / float(batch_size)
        self._final_state_valid = state_valid

        if not is_training:
            return

        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(cost, tvars), max_grad_norm)

        # NOTE(review): the optimizer receives the Python float, not
        # self.learning_rate, so learning_rate_decay_op may not affect
        # training - verify against get_optimizer.
        opt = optimization_ops.get_optimizer(optimizer, learning_rate)
        self._train_op = opt.apply_gradients(
            zip(grads, tvars), global_step=self.global_step)
        self._valid_op = tf.no_op()

        self.saver = tf.train.Saver(tf.all_variables())
        self.saver_best = tf.train.Saver(tf.all_variables())
def model_with_buckets(encoder_inputs, decoder_inputs, targets, weights,
                       buckets, seq2seq_f, softmax_loss_function=None,
                       per_example_loss=False, name=None):
    """Create a sequence-to-sequence model with support for bucketing.

    The seq2seq argument is a function that defines a sequence-to-sequence
    model, e.g.,
      seq2seq = lambda x, y: basic_rnn_seq2seq(x, y, rnn_cell.GRUCell(24))

    Args:
      encoder_inputs: A list of Tensors to feed the encoder; first seq2seq
        input.
      decoder_inputs: A list of Tensors to feed the decoder; second seq2seq
        input.
      targets: A list of 1D batch-sized int32 Tensors (desired output
        sequence).
      weights: List of 1D batch-sized float-Tensors to weight the targets.
      buckets: A non-empty list of pairs of (input size, output size) for
        each bucket. The last bucket is expected to be the largest one.
      seq2seq_f: A sequence-to-sequence model function; it takes 2 inputs
        that agree with encoder_inputs and decoder_inputs, and returns a
        pair consisting of outputs and states (as, e.g., basic_rnn_seq2seq).
      softmax_loss_function: Function (inputs-batch, labels-batch) ->
        loss-batch to be used instead of the standard softmax (the default
        if this is None).
      per_example_loss: Boolean. If set, the returned loss will be a
        batch-sized tensor of losses for each sequence in the batch. If
        unset, it will be a scalar with the averaged loss from all examples.
      name: Optional name for this operation, defaults to
        "model_with_buckets".

    Returns:
      A tuple of the form (outputs, losses), where:
        outputs: The outputs for each bucket. Its j'th element consists of a
          list of 2D Tensors of shape [batch_size x num_decoder_symbols]
          (jth outputs).
        losses: List of scalar Tensors, representing losses for each bucket,
          or, if per_example_loss is set, a list of 1D batch-sized float
          Tensors.

    Raises:
      ValueError: If buckets is empty, or if the length of encoder_inputs,
        targets, or weights is smaller than the largest (last) bucket.
    """
    if not buckets:
        raise ValueError("buckets must contain at least one "
                         "(input size, output size) pair.")

    max_input_size, max_output_size = buckets[-1][0], buckets[-1][1]
    if len(encoder_inputs) < max_input_size:
        raise ValueError("Length of encoder_inputs (%d) must be at least "
                         "that of last bucket (%d)."
                         % (len(encoder_inputs), max_input_size))
    if len(targets) < max_output_size:
        raise ValueError("Length of targets (%d) must be at least that of "
                         "last bucket (%d)."
                         % (len(targets), max_output_size))
    if len(weights) < max_output_size:
        raise ValueError("Length of weights (%d) must be at least that of "
                         "last bucket (%d)."
                         % (len(weights), max_output_size))

    all_inputs = encoder_inputs + decoder_inputs + targets + weights
    losses = []
    outputs = []
    with ops.op_scope(all_inputs, name, "model_with_buckets"):
        # Both loss helpers are called with the same arguments; only the
        # reduction over the batch differs.
        if per_example_loss:
            loss_fn = seq2seq.sequence_loss_by_example
        else:
            loss_fn = seq2seq.sequence_loss

        for j, bucket in enumerate(buckets):
            # Variables are created for the first bucket and reused by all
            # later ones.
            with variable_scope.variable_scope(
                    variable_scope.get_variable_scope(),
                    reuse=True if j > 0 else None):
                bucket_outputs, _ = seq2seq_f(encoder_inputs[:bucket[0]],
                                              decoder_inputs[:bucket[1]])
                outputs.append(bucket_outputs)
                losses.append(loss_fn(
                    outputs[-1], targets[:bucket[1]], weights[:bucket[1]],
                    average_across_timesteps=True,
                    softmax_loss_function=softmax_loss_function))

    return outputs, losses
def __init__(self, is_training, learning_rate=1.0, optimizer="sgd",
             max_grad_norm=5, num_layers=2, use_lstm=True, num_steps=35,
             num_steps_valid=120, proj_size=650, hidden_size=650,
             hidden_proj=650, num_samples=512, init_scale=0.1,
             dropout_rate=0.0, lr_decay=0.8, batch_size=20, attentive=False,
             projection_attention_f=None, output_form=lm_ops.OUTPUT_CONCAT,
             vocab_size=10000):
    """Build the training and validation graphs of the language model.

    Args:
        is_training: When false, construction stops after the costs and
            final states are defined (no train op, no savers).
        learning_rate: Initial value of the ``learning_rate`` variable; also
            handed to ``optimization_ops.get_optimizer``.
        optimizer: Optimizer identifier passed to
            ``optimization_ops.get_optimizer``.
        max_grad_norm: Global-norm threshold used to clip the gradients.
        num_layers, use_lstm, hidden_size, proj_size, hidden_proj,
            dropout_rate: Forwarded to ``cells.build_lm_multicell_rnn``
            (``hidden_proj`` only when positive). ``proj_size`` is also the
            embedding width.
        num_steps: Number of per-time-step training placeholders.
        num_steps_valid: Number of per-time-step validation placeholders.
        num_samples: Sampled-softmax sample count; sampling is enabled only
            when ``0 < num_samples < vocab_size``.
        init_scale: Half-width of the uniform range used to initialise the
            embedding and the variables created under the "RNN" scope.
        lr_decay: Factor applied by ``learning_rate_decay_op``.
        batch_size: Training batch size.
        attentive: Use ``lm_ops.apply_attentive_lm`` instead of
            ``lm_ops.apply_lm``; requires ``projection_attention_f``.
        projection_attention_f: Forwarded to ``apply_attentive_lm``.
        output_form: Forwarded to ``apply_attentive_lm``.
        vocab_size: Vocabulary size.
    """
    with tf.device("/gpu:0"):
        if attentive:
            assert projection_attention_f is not None

        self.batch_size = batch_size
        self.num_steps = num_steps
        self.num_steps_valid = num_steps_valid

        # One placeholder triple (input, target, mask) per time step. The
        # lists are filled through the un-prefixed names, which presumably
        # are properties defined elsewhere on the class.
        self._input_data_train = []
        self._targets_train = []
        self.mask_train = []
        for i in xrange(num_steps):
            self.input_data_train.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="input_train{0}".format(i)))
            self.targets_train.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="target_train{0}".format(i)))
            self.mask_train.append(
                tf.placeholder(tf.float32, shape=[None],
                               name="mask_train{0}".format(i)))

        self._input_data_valid = []
        self._targets_valid = []
        self.mask_valid = []
        for i in xrange(num_steps_valid):
            self.input_data_valid.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="input_valid{0}".format(i)))
            self.targets_valid.append(
                tf.placeholder(tf.int32, shape=[None],
                               name="target_valid{0}".format(i)))
            self.mask_valid.append(
                tf.placeholder(tf.float32, shape=[None],
                               name="mask_valid{0}".format(i)))

        hidden_projection = hidden_proj if hidden_proj > 0 else None

        self.cell = cells.build_lm_multicell_rnn(
            num_layers, hidden_size, proj_size,
            use_lstm=use_lstm,
            hidden_projection=hidden_projection,
            dropout=dropout_rate)

        self.dropout_feed = tf.placeholder(tf.float32, name="dropout_rate")

        self._initial_state_train = self.cell.zero_state(
            batch_size, tf.float32)
        self._initial_state_valid = self.cell.zero_state(1, tf.float32)

        # learning rate ops
        self.learning_rate = tf.Variable(
            float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * lr_decay)

        # epoch ops
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_update_op = self.epoch.assign(self.epoch + 1)

        # samples seen ops
        self.samples_seen = tf.Variable(0, trainable=False)
        self.samples_seen_update_op = self.samples_seen.assign(
            self.samples_seen + batch_size)
        self.samples_seen_reset_op = self.samples_seen.assign(0)

        # global step variable - controlled by the model
        self.global_step = tf.Variable(0.0, trainable=False)

        # average loss ops
        self.current_ppx = tf.Variable(1.0, trainable=False)
        self.current_loss = tf.Variable(0.0, trainable=False)

        # early-stopping bookkeeping
        self.best_eval_ppx = tf.Variable(numpy.inf, trainable=False)
        self.estop_counter = tf.Variable(0, trainable=False)
        self.estop_counter_update_op = self.estop_counter.assign(
            self.estop_counter + 1)
        self.estop_counter_reset_op = self.estop_counter.assign(0)

        # Symmetric range around zero. The original used
        # minval=init_scale == maxval, which initialises every weight to
        # the same constant.
        initializer = tf.random_uniform_initializer(
            minval=-init_scale, maxval=init_scale, seed=_SEED)

        # Width of the RNN outputs that feed the softmax projection.
        out_proj = hidden_proj if hidden_proj > 0 else hidden_size

        with tf.device("/cpu:0"):
            w = tf.get_variable("proj_w", [out_proj, vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable("proj_b", [vocab_size])
        self.output_projection = (w, b)

        # Sampled softmax only makes sense if we sample fewer classes than
        # the vocabulary holds. None selects the library's default softmax.
        sampled_softmax = False
        loss_function = None
        if 0 < num_samples < vocab_size:
            sampled_softmax = True

            def sampled_loss(logits, labels):
                # `logits` are the unprojected RNN outputs here; the
                # projection happens inside sampled_softmax_loss.
                with tf.device("/cpu:0"):
                    labels = tf.reshape(labels, [-1, 1])
                    losses = tf.nn.sampled_softmax_loss(
                        w_t, b, logits, labels, num_samples, vocab_size)
                    return losses

            loss_function = sampled_loss

        with tf.device("/cpu:0"):
            embedding = tf.Variable(
                tf.random_uniform([vocab_size, proj_size],
                                  minval=-init_scale, maxval=init_scale),
                name="embedding")
            # One embedded tensor per time-step placeholder.
            inputs_train = [
                tf.nn.embedding_lookup(embedding, i)
                for i in self.input_data_train
            ]
            inputs_valid = [
                tf.nn.embedding_lookup(embedding, i)
                for i in self.input_data_valid
            ]

        with tf.variable_scope("RNN", initializer=initializer):
            # Sequence lengths are obtained by summing the per-step masks.
            if attentive:
                outputs_train, state_train, _ = lm_ops.apply_attentive_lm(
                    self.cell, inputs_train,
                    sequence_length=array_ops.squeeze(
                        math_ops.add_n(self.mask_train)),
                    projection_attention_f=projection_attention_f,
                    output_form=output_form,
                    dropout=self.dropout_feed,
                    initializer=initializer,
                    dtype=tf.float32)
                outputs_valid, state_valid, _ = lm_ops.apply_attentive_lm(
                    self.cell, inputs_valid,
                    sequence_length=array_ops.squeeze(
                        math_ops.add_n(self.mask_valid)),
                    projection_attention_f=projection_attention_f,
                    output_form=output_form,
                    dropout=self.dropout_feed,
                    initializer=initializer,
                    dtype=tf.float32)
            else:
                outputs_train, state_train = lm_ops.apply_lm(
                    self.cell, inputs_train,
                    sequence_length=math_ops.add_n(self.mask_train),
                    dropout=self.dropout_feed,
                    dtype=tf.float32)
                outputs_valid, state_valid = lm_ops.apply_lm(
                    self.cell, inputs_valid,
                    sequence_length=math_ops.add_n(self.mask_valid),
                    dropout=self.dropout_feed,
                    dtype=tf.float32)

        if not sampled_softmax:
            # Full softmax: project every output to vocabulary logits.
            logits_train = [
                tf.nn.xw_plus_b(o, w, b) for o in outputs_train
            ]
            logits_valid = [
                tf.nn.xw_plus_b(o, w, b) for o in outputs_valid
            ]
        else:
            # Sampled softmax: the loss function does the projection.
            logits_train = outputs_train
            logits_valid = outputs_valid

        # The loss function must be passed through; without it the
        # unprojected outputs of the sampled-softmax path would be
        # interpreted as vocabulary logits.
        # NOTE(review): under sampled softmax the validation loss is an
        # approximation as well - confirm that is acceptable.
        loss_train = seq2seq.sequence_loss_by_example(
            logits_train, self.targets_train, self.mask_train,
            average_across_timesteps=True,
            softmax_loss_function=loss_function)
        loss_valid = seq2seq.sequence_loss_by_example(
            logits_valid, self.targets_valid, self.mask_valid,
            average_across_timesteps=True,
            softmax_loss_function=loss_function)

        self._cost_train = cost = tf.reduce_sum(loss_train) / float(
            batch_size)
        self._final_state_train = state_train

        # NOTE(review): divides by the training batch size although the
        # validation initial state is built for batch size 1 - confirm.
        self._cost_valid = tf.reduce_sum(loss_valid) / float(batch_size)
        self._final_state_valid = state_valid

        if not is_training:
            return

        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(cost, tvars), max_grad_norm)

        # NOTE(review): the optimizer receives the Python float, not
        # self.learning_rate, so learning_rate_decay_op may not affect
        # training - verify against get_optimizer.
        opt = optimization_ops.get_optimizer(optimizer, learning_rate)
        self._train_op = opt.apply_gradients(
            zip(grads, tvars), global_step=self.global_step)
        self._valid_op = tf.no_op()

        self.saver = tf.train.Saver(tf.all_variables())
        self.saver_best = tf.train.Saver(tf.all_variables())