def testSequenceLoss(self):
    """Check seq2seq.sequence_loss for each combination of averaging flags.

    Three timesteps are built with batch size 2 and 5 output classes.  Every
    step's logits tensor is filled with a single constant and every weight is
    1.0.  The expected values are related as 4.828314 = 3 * 1.60944 (loss not
    averaged over the 3 timesteps) and 9.656628 = 2 * 4.828314 (additionally
    not averaged over the batch of 2).
    """
    # (average_across_timesteps, average_across_batch, expected loss value)
    expectations = (
        (True, True, 1.60944),
        (False, True, 4.828314),
        (False, False, 9.656628),
    )
    with self.test_session() as sess:
        output_classes = 5
        logits = [tf.constant(step + 0.5, shape=[2, 5])
                  for step in xrange(3)]
        targets = [tf.constant(step, tf.int32, shape=[2])
                   for step in xrange(3)]
        weights = [tf.constant(1.0, shape=[2]) for _ in xrange(3)]

        # Each loss op is built and evaluated before the next one is created,
        # in the same order as the flag combinations listed above.
        for over_timesteps, over_batch, expected in expectations:
            loss = seq2seq.sequence_loss(
                logits, targets, weights, output_classes,
                average_across_timesteps=over_timesteps,
                average_across_batch=over_batch)
            res = sess.run(loss)
            self.assertAllClose(res, expected)
def build_autoencoder(dpg):
    """Attach an autoencoding decoder, its loss and a training op to `dpg`.

    Args:
        dpg: Model object. This function reads `spec.policy_dims`,
            `input_tokens`, `seq_length`, `encoder_states` and `embeddings`
            from it.

    Returns:
        A tuple `(labels, loss, train_op)`: the list of per-timestep int32
        label placeholders, the sequence loss tensor, and the Adam
        minimization op for that loss.
    """
    hidden_dim = dpg.spec.policy_dims[0]
    gru = util.GRUCell(FLAGS.embedding_dim, hidden_dim)
    projected_cell = rnn_cell.OutputProjectionWrapper(gru, FLAGS.vocab_size)

    # Decoder inputs are all-zero tensors shaped like the first input token
    # tensor; with feed_previous=True the decoder is started from the last
    # encoder state.
    decoder_inputs = []
    for step in range(dpg.seq_length):
        decoder_inputs.append(
            tf.zeros_like(dpg.input_tokens[0], name="adec_inp%i" % step))

    decoder_outputs, _ = util.embedding_rnn_decoder(
        decoder_inputs, dpg.encoder_states[-1], projected_cell,
        FLAGS.vocab_size,
        feed_previous=True, embedding=dpg.embeddings, scope="adec")

    labels = []
    for step in range(dpg.seq_length):
        labels.append(
            tf.placeholder(tf.int32, shape=(None, ), name="labels%i" % step))

    # Every label position gets weight 1.0.
    weights = []
    for step_labels in labels:
        weights.append(tf.ones_like(step_labels, dtype=tf.float32))

    loss = seq2seq.sequence_loss(decoder_outputs, labels, weights,
                                 FLAGS.vocab_size)

    # TODO wrt what?  (no var_list is passed to minimize() here)
    train_op = tf.train.AdamOptimizer(0.01).minimize(loss)
    return labels, loss, train_op
def __init__(self, vocab_size, sequence_length, num_units,
             max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor):
    """Build the seq2seq training graph: placeholders, loss and update op.

    Args:
        vocab_size: Width of every input placeholder row and of the logits.
        sequence_length: Number of encoder steps; the decoder input list
            gets one additional entry.
        num_units: Size passed to the LSTM cell and first dimension of the
            output projection matrix.
        max_gradient_norm: Threshold handed to `tf.clip_by_global_norm`.
        batch_size: Fixed first dimension of all placeholders.
        learning_rate: Initial value of the non-trainable learning-rate
            variable.
        learning_rate_decay_factor: Factor by which
            `learning_rate_decay_op` multiplies the learning rate.
    """
    self.vocab_size = vocab_size
    self.sequence_length = sequence_length
    self.batch_size = batch_size

    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    decayed_rate = self.learning_rate * learning_rate_decay_factor
    self.learning_rate_decay_op = self.learning_rate.assign(decayed_rate)
    self.global_step = tf.Variable(0, trainable=False)

    # Output projection applied to every RNN output to obtain the logits.
    projection_w = training.utils.gaussian_weights_variable(
        [num_units, self.vocab_size])
    projection_b = tf.Variable(tf.zeros([self.vocab_size]))
    cell = rnn_cell.LSTMCell(num_units, vocab_size)

    token_shape = (batch_size, self.vocab_size)
    weight_shape = (batch_size,)

    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for _ in range(sequence_length):
        self.encoder_inputs.append(
            tf.placeholder(tf.float32, shape=token_shape))
        self.decoder_inputs.append(
            tf.placeholder(tf.float32, shape=token_shape))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=weight_shape))

    # Decoder has one extra cell because it starts with the GO symbol,
    # and the targets are shifted by one.
    # Not sure this is actually useful, as it is always set to 0.
    # As this is inspired by TensorFlow seq2seq models, there might be
    # something dodgy in there.
    self.decoder_inputs.append(
        tf.placeholder(tf.float32, shape=token_shape))
    self.target_weights.append(np.ones(weight_shape))

    # Targets used by the sequence loss must be integer indices, so they are
    # taken as the argmax of each decoder input after the first one.
    targets = []
    for shifted_input in self.decoder_inputs[1:]:
        targets.append(tf.cast(tf.argmax(shifted_input, 1), dtype=tf.int32))

    outputs, self.state = seq2seq.basic_rnn_seq2seq(
        self.encoder_inputs, self.decoder_inputs, cell)
    self.logits = [tf.nn.xw_plus_b(step_output, projection_w, projection_b)
                   for step_output in outputs]

    # Only the first `sequence_length` logits and weights enter the loss.
    used_logits = self.logits[:self.sequence_length]
    used_weights = self.target_weights[:self.sequence_length]
    self.loss = seq2seq.sequence_loss(
        used_logits, targets, used_weights, self.vocab_size)

    # Plain SGD on gradients clipped by their global norm.
    trainables = tf.trainable_variables()
    sgd = tf.train.GradientDescentOptimizer(self.learning_rate)
    raw_gradients = tf.gradients(self.loss, trainables)
    clipped, self.gradient_norms = tf.clip_by_global_norm(
        raw_gradients, max_gradient_norm)
    self.updates = sgd.apply_gradients(
        zip(clipped, trainables), global_step=self.global_step)

    self.saver = tf.train.Saver(tf.all_variables())
def __load_optimizer(self):
    """Create the sequence loss, a momentum optimizer and the training op.

    Reads `dec_outputs`, `labels`, `weights`, `vocab_size`, `learning_rate`
    and `momentum` from the instance; stores the results as `loss`,
    `optimizer` and `train_op`.
    """
    # Loss over the decoder outputs.
    loss = seq2seq.sequence_loss(
        self.dec_outputs, self.labels, self.weights, self.vocab_size)
    self.loss = loss

    # Momentum optimizer and the op that minimizes the loss.
    optimizer = tf.train.MomentumOptimizer(self.learning_rate, self.momentum)
    self.optimizer = optimizer
    self.train_op = optimizer.minimize(loss)
def __load_optimizer(self):
    """Create loss, momentum optimizer and training op for both directions.

    For the "forward" and then the "backward" variable scope, builds a
    sequence loss over the matching `dec_outputs_fwd` / `dec_outputs_bwd`
    attribute (labels, weights and vocabulary size are shared) and stores the
    results as `loss_*`, `optimizer_*` and `train_op_*` with the `fwd` / `bwd`
    suffix.
    """
    # (variable scope name, attribute suffix), processed in this order.
    directions = (("forward", "fwd"), ("backward", "bwd"))

    for scope_name, suffix in directions:
        with tf.variable_scope(scope_name):
            # Loss function.
            loss = seq2seq.sequence_loss(
                getattr(self, "dec_outputs_" + suffix),
                self.labels, self.weights, self.vocab_size)
            setattr(self, "loss_" + suffix, loss)

            # Optimizer.
            optimizer = tf.train.MomentumOptimizer(
                self.learning_rate, self.momentum)
            setattr(self, "optimizer_" + suffix, optimizer)
            setattr(self, "train_op_" + suffix, optimizer.minimize(loss))
def build_autoencoder(dpg):
    """Build the autoencoder decoder on top of `dpg` and return its handles.

    Args:
        dpg: Model object providing `spec.policy_dims`, `input_tokens`,
            `seq_length`, `encoder_states` and `embeddings`.

    Returns:
        `(labels, loss, train_op)`: per-timestep int32 label placeholders,
        the sequence loss, and the op minimizing it with Adam (rate 0.01).
    """
    hidden_dim = dpg.spec.policy_dims[0]

    # GRU cell whose outputs are projected to vocabulary size.
    base_cell = util.GRUCell(FLAGS.embedding_dim, hidden_dim)
    decoder_cell = rnn_cell.OutputProjectionWrapper(base_cell,
                                                    FLAGS.vocab_size)

    # One all-zero input per timestep, shaped like the first input tokens.
    zero_inputs = [
        tf.zeros_like(dpg.input_tokens[0], name="adec_inp%i" % timestep)
        for timestep in range(dpg.seq_length)
    ]

    # The decoder starts from the final encoder state and runs with
    # feed_previous=True.
    initial_state = dpg.encoder_states[-1]
    decoded, _ = util.embedding_rnn_decoder(
        zero_inputs, initial_state, decoder_cell, FLAGS.vocab_size,
        feed_previous=True, embedding=dpg.embeddings, scope="adec")

    labels = [
        tf.placeholder(tf.int32, shape=(None,), name="labels%i" % timestep)
        for timestep in range(dpg.seq_length)
    ]
    # Uniform weight of 1.0 for every label.
    unit_weights = [tf.ones_like(placeholder, dtype=tf.float32)
                    for placeholder in labels]

    loss = seq2seq.sequence_loss(decoded, labels, unit_weights,
                                 FLAGS.vocab_size)

    adam = tf.train.AdamOptimizer(0.01)
    train_op = adam.minimize(loss)  # TODO wrt what?

    return labels, loss, train_op
with tf.variable_scope("RNN/EmbeddingWrapper", reuse=True): embeddings = tf.get_variable("embedding") inp_embedded = [tf.nn.embedding_lookup(embeddings, inp_t) for inp_t in inp] cell = rnn_cell.GRUCell(memory_dim) attn_states = tf.concat(1, [tf.reshape(e, [-1, 1, cell.output_size]) for e in enc_outputs]) dec_inp = [tf.zeros((batch_size, cell.input_size), dtype=tf.float32) for _ in range(seq_length)] dec_outputs, dec_states = seq2seq.attention_decoder(dec_inp, enc_states[-1], attn_states, cell, output_size=seq_length, loop_function=make_loop_function(inp_embedded, cell)) loss = seq2seq.sequence_loss(dec_outputs, labels, weights, seq_length) learning_rate = 0.05 momentum = 0.9 optimizer = tf.train.MomentumOptimizer(learning_rate, momentum) train_op = optimizer.minimize(loss) summary_op = loss # tf.merge_all_summaries() sess = tf.InteractiveSession() sess.run(tf.initialize_all_variables()) def train_batch(batch_size): X = [np.random.choice(vocab_size, size=(seq_length,), replace=False) for _ in range(batch_size)] y = [np.argsort(x) for x in X] # [np.arange(seq_length) for _ in X]
cell = rnn_cell.GRUCell(memory_dim) attn_states = tf.concat( 1, [tf.reshape(e, [-1, 1, cell.output_size]) for e in enc_outputs]) dec_inp = [ tf.zeros((batch_size, cell.input_size), dtype=tf.float32) for _ in range(seq_length) ] dec_outputs, dec_states = seq2seq.attention_decoder( dec_inp, enc_states[-1], attn_states, cell, output_size=seq_length, loop_function=make_loop_function(inp_embedded, cell)) loss = seq2seq.sequence_loss(dec_outputs, labels, weights, seq_length) learning_rate = 0.05 momentum = 0.9 optimizer = tf.train.MomentumOptimizer(learning_rate, momentum) train_op = optimizer.minimize(loss) summary_op = loss # tf.merge_all_summaries() sess = tf.InteractiveSession() sess.run(tf.initialize_all_variables()) def train_batch(batch_size): X = [ np.random.choice(vocab_size, size=(seq_length, ), replace=False) for _ in range(batch_size)
def __init__(self, vocab, tagset, alphabet, word_embedding_size,
             char_embedding_size, num_chars, num_steps, optimizer_desc,
             generate_lemmas, l2, dropout_prob_values, experiment_name,
             supply_form_characters_to_lemma, threads=0, seed=None,
             write_summaries=True, use_attention=True,
             scheduled_sampling=None):
    """
    Builds the tagger computation graph and initializes it in a TensorFlow
    session.

    Arguments:
        vocab: Vocabulary of word forms.
        tagset: Vocabulary of possible tags.
        alphabet: Vocabulary of possible characters.
        word_embedding_size (int): Size of the form-based word embedding.
        char_embedding_size (int): Size of character embeddings, i.e. a
            half of the size of the character-based words embeddings.
        num_chars: Maximum length of a word.
        num_steps: Maximum length of a sentence.
        optimizer_desc: Description of the optimizer. It is appended to
            'tf.train.' and passed to eval(), so it must be trusted Python
            source.
        generate_lemmas: Generate lemmas during tagging (adds the character
            decoder and its losses to the graph).
        l2: Factor multiplying the summed L2 losses of the collected weight
            matrices before they are added to the cost.
        dropout_prob_values: Stored on the instance; not used further in
            this method.
        experiment_name: Suffix of the summary log directory name.
        supply_form_characters_to_lemma: If true, every lemma decoder input
            is the concatenation of a lemma character embedding and a form
            character embedding.
        threads: If positive, used for both the inter-op and the intra-op
            parallelism thread count of the session.
        seed: TensorFlow seed (not referenced in this method's body).
        write_summaries: Write summaries using TensorFlow interface.
        use_attention: Use `seq2seq.attention_decoder` instead of
            `seq2seq.rnn_decoder` for the lemma decoder.
        scheduled_sampling: If truthy, the training decoder mixes ground
            truth and decoded inputs using this value in the sampling
            threshold.
    """
    self.num_steps = num_steps
    self.num_chars = num_chars

    self.word_embedding_size = word_embedding_size
    self.char_embedding_size = char_embedding_size
    # Word embedding plus the forward and backward character LSTM states.
    self.lstm_size = word_embedding_size + 2 * char_embedding_size ###

    self.vocab = vocab
    self.tagset = tagset
    self.alphabet = alphabet

    self.dropout_prob_values = dropout_prob_values

    # Initial states of the sentence-level LSTMs; their width is the state
    # size reported by a BasicLSTMCell of lstm_size units.
    self.forward_initial_state = tf.placeholder(
        tf.float32,
        [None, rnn_cell.BasicLSTMCell(self.lstm_size).state_size],
        name="forward_lstm_initial_state")
    self.backward_initial_state = tf.placeholder(
        tf.float32,
        [None, rnn_cell.BasicLSTMCell(self.lstm_size).state_size],
        name="backward_lstm_initial_state")
    self.sentence_lengths = tf.placeholder(tf.int64, [None],
                                           name="sentence_lengths")
    self.tags = tf.placeholder(tf.int32, [None, num_steps],
                               name="ground_truth_tags")
    # Element 0 is applied to the network inputs, element 1 to the bi-LSTM
    # output (see the tf.nn.dropout calls below).
    self.dropout_prob = tf.placeholder(tf.float32, [None],
                                       name="dropout_keep_p")
    self.generate_lemmas = generate_lemmas

    global_step = tf.Variable(0, trainable=False)

    input_list = []
    # Weight matrices that take part in the L2 regularization term.
    regularize = []

    # Word-level embeddings
    if word_embedding_size:
        self.words = tf.placeholder(tf.int32, [None, num_steps],
                                    name='words')
        word_embeddings = tf.Variable(
            tf.random_uniform([len(vocab), word_embedding_size], -1.0, 1.0))
        we_lookup = tf.nn.embedding_lookup(word_embeddings, self.words)

        input_list.append(we_lookup)

    # Character-level embeddings
    if char_embedding_size:
        self.chars = tf.placeholder(tf.int32,
                                    [None, num_steps, num_chars],
                                    name='chars')
        self.chars_lengths = tf.placeholder(tf.int64, [None, num_steps],
                                            name='chars_lengths')

        char_embeddings = \
            tf.Variable(tf.random_uniform([len(alphabet), char_embedding_size], -1.0, 1.0))
        ce_lookup = tf.nn.embedding_lookup(char_embeddings, self.chars)

        # Merge the batch and sentence axes so that every word is one
        # sequence of characters.
        reshaped_ce_lookup = tf.reshape(
            ce_lookup, [-1, num_chars, char_embedding_size],
            name="reshape-char_inputs")
        char_inputs = [
            tf.squeeze(input_, [1])
            for input_ in tf.split(1, num_chars, reshaped_ce_lookup)
        ]

        char_inputs_lengths = tf.reshape(self.chars_lengths, [-1])

        with tf.variable_scope('char_forward'):
            char_lstm = rnn_cell.BasicLSTMCell(char_embedding_size)
            _, char_last_state = rnn.rnn(
                cell=char_lstm,
                inputs=char_inputs,
                sequence_length=char_inputs_lengths,
                dtype=tf.float32)
            # Reuse is switched on only to fetch the already created LSTM
            # weight matrix by name.
            tf.get_variable_scope().reuse_variables()
            regularize.append(
                tf.get_variable('RNN/BasicLSTMCell/Linear/Matrix'))

        with tf.variable_scope('char_backward'):
            char_lstm_rev = rnn_cell.BasicLSTMCell(char_embedding_size)
            _, char_last_state_rev = rnn.rnn(
                cell=char_lstm_rev,
                inputs=self._reverse_seq(char_inputs, char_inputs_lengths),
                sequence_length=char_inputs_lengths,
                dtype=tf.float32)
            tf.get_variable_scope().reuse_variables()
            regularize.append(
                tf.get_variable('RNN/BasicLSTMCell/Linear/Matrix'))

        # Keep the second half of each final LSTM state.
        # NOTE(review): presumably the output (h) part of the concatenated
        # (c, h) state -- confirm against the BasicLSTMCell in use.
        last_char_lstm_state = tf.split(1, 2, char_last_state)[1]
        last_char_lstm_state_rev = tf.split(1, 2, char_last_state_rev)[1]

        last_char_states = \
            tf.reshape(last_char_lstm_state, [-1, num_steps, char_embedding_size],
                       name="reshape-charstates")
        last_char_states_rev = tf.reshape(
            last_char_lstm_state_rev,
            [-1, num_steps, char_embedding_size],
            name="reshape-charstates_rev")

        char_output = tf.concat(2, [last_char_states, last_char_states_rev])

        input_list.append(char_output)

    # All inputs correctly sliced
    input_list_dropped = [
        tf.nn.dropout(x, self.dropout_prob[0]) for x in input_list
    ]
    # One tensor per sentence position, as expected by rnn.rnn.
    inputs = [
        tf.squeeze(input_, [1])
        for input_ in tf.split(
            1, num_steps, tf.concat(2, input_list_dropped))
    ]

    with tf.variable_scope('forward'):
        lstm = rnn_cell.BasicLSTMCell(self.lstm_size)
        outputs, last_state = rnn.rnn(
            cell=lstm,
            inputs=inputs,
            dtype=tf.float32,
            initial_state=self.forward_initial_state,
            sequence_length=self.sentence_lengths)

        tf.get_variable_scope().reuse_variables()
        regularize.append(
            tf.get_variable('RNN/BasicLSTMCell/Linear/Matrix'))

    with tf.variable_scope('backward'):
        lstm_rev = rnn_cell.BasicLSTMCell(self.lstm_size)
        outputs_rev_rev, last_state_rev = rnn.rnn(
            cell=lstm_rev,
            inputs=self._reverse_seq(inputs, self.sentence_lengths),
            dtype=tf.float32,
            initial_state=self.backward_initial_state,
            sequence_length=self.sentence_lengths)

        outputs_rev = self._reverse_seq(outputs_rev_rev,
                                        self.sentence_lengths)

        tf.get_variable_scope().reuse_variables()
        regularize.append(
            tf.get_variable('RNN/BasicLSTMCell/Linear/Matrix'))

    #outputs_forward = tf.reshape(tf.concat(1, outputs), [-1, self.lstm_size],
    #                    name="reshape-outputs_forward")

    #outputs_backward = tf.reshape(tf.concat(1, outputs_rev), [-1, self.lstm_size],
    #                    name="reshape-outputs_backward")

    #forward_w = tf.get_variable("forward_w", [self.lstm_size, self.lstm_size])
    #backward_w = tf.get_variable("backward_w", [self.lstm_size, self.lstm_size])
    #non_linearity_bias = tf.get_variable("non_linearity_b", [self.lstm_size])

    # NOTE(review): outputs_rev has already gone through _reverse_seq and
    # is additionally passed through Python's reversed() here -- confirm
    # that this pairing of forward and backward outputs is intended.
    outputs_bidi = [
        tf.concat(1, [o1, o2])
        for o1, o2 in zip(outputs, reversed(outputs_rev))
    ]

    #output = tf.tanh(tf.matmul(outputs_forward, forward_w) + tf.matmul(outputs_backward, backward_w) + non_linearity_bias)
    output = tf.reshape(tf.concat(1, outputs_bidi),
                        [-1, 2 * self.lstm_size],
                        name="reshape-outputs_bidi")

    output_dropped = tf.nn.dropout(output, self.dropout_prob[1])

    # We are computing only the logits, not the actual softmax -- while
    # computing the loss, it is done by the sequence_loss_by_example and
    # during the runtime classification, the argmax over logits is enough.

    softmax_w = tf.get_variable(
        "softmax_w", [2 * self.lstm_size, len(tagset)])
    logits_flatten = tf.nn.xw_plus_b(
        output_dropped, softmax_w,
        tf.get_variable("softmax_b", [len(tagset)]))
    #tf.get_variable_scope().reuse_variables()
    regularize.append(softmax_w)

    self.logits = tf.reshape(logits_flatten,
                             [-1, num_steps, len(tagset)],
                             name="reshape-logits")
    estimated_tags_flat = tf.to_int32(
        tf.argmax(logits_flatten, dimension=1))
    self.last_state = last_state

    # output mask: compute loss only if it isn't a padded word (i.e. zero index)
    output_mask = tf.reshape(tf.to_float(tf.not_equal(self.tags, 0)), [-1])

    gt_tags_flat = tf.reshape(self.tags, [-1])
    tagging_loss = seq2seq.sequence_loss_by_example(
        logits=[logits_flatten],
        targets=[gt_tags_flat],
        weights=[output_mask])

    # Accuracy over the positions selected by the mask only.
    tagging_accuracy = \
        tf.reduce_sum(tf.to_float(tf.equal(estimated_tags_flat, gt_tags_flat)) * output_mask) \
            / tf.reduce_sum(output_mask)
    # Every summary is registered twice, once in the "train" and once in
    # the "dev" collection, which are merged separately at the end.
    tf.scalar_summary('train_accuracy', tagging_accuracy,
                      collections=["train"])
    tf.scalar_summary('dev_accuracy', tagging_accuracy,
                      collections=["dev"])

    self.cost = tf.reduce_mean(tagging_loss)

    tf.scalar_summary('train_tagging_loss', tf.reduce_mean(tagging_loss),
                      collections=["train"])
    tf.scalar_summary('dev_tagging_loss', tf.reduce_mean(tagging_loss),
                      collections=["dev"])

    if generate_lemmas:
        with tf.variable_scope('decoder'):
            self.lemma_chars = tf.placeholder(
                tf.int32, [None, num_steps, num_chars + 2],
                name='lemma_chars')

            lemma_state_size = self.lstm_size

            # Projection from the decoder state to character logits.
            # NOTE(review): only one bound (0.5) is passed to
            # random_uniform here, unlike the embeddings below that use
            # (-0.5, 0.5) -- confirm the intended range.
            lemma_w = tf.Variable(tf.random_uniform(
                [lemma_state_size, len(alphabet)], 0.5),
                                  name="state_to_char_w")
            lemma_b = tf.Variable(tf.fill([len(alphabet)],
                                          -math.log(len(alphabet))),
                                  name="state_to_char_b")
            # NOTE(review): the embedding width relies on "/" yielding an
            # integer (Python 2 division) -- confirm before running under
            # Python 3.
            lemma_char_embeddings = tf.Variable(tf.random_uniform([
                len(alphabet), lemma_state_size /
                (2 if supply_form_characters_to_lemma else 1)
            ], -0.5, 0.5),
                                                name="char_embeddings")

            # One tensor of character indices per lemma position.
            lemma_char_inputs = \
                [tf.squeeze(input_, [1]) for input_ in
                    tf.split(1, num_chars + 2, tf.reshape(self.lemma_chars, [-1, num_chars + 2],
                                                      name="reshape-lemma_char_inputs"))]

            if supply_form_characters_to_lemma:
                # NOTE(review): self.chars exists only when
                # char_embedding_size is non-zero.
                char_inputs_zeros = \
                    [tf.squeeze(chars, [1]) for chars in
                        tf.split(1, num_chars, tf.reshape(self.chars, [-1, num_chars],
                                                      name="reshape-char_inputs_zeros"))]
                # Append one extra all-zero position.
                char_inputs_zeros.append(char_inputs_zeros[0] * 0)

                def loop(prev_state, i):
                    # it takes the previous hidden state, finds the character and formats it
                    # as input for the next time step ... used in the decoder in the "real decoding scenario"
                    out_activation = tf.matmul(prev_state,
                                               lemma_w) + lemma_b
                    prev_char_index = tf.argmax(out_activation, 1)
                    return tf.concat(1, [
                        tf.nn.embedding_lookup(lemma_char_embeddings,
                                               prev_char_index),
                        tf.nn.embedding_lookup(lemma_char_embeddings,
                                               char_inputs_zeros[i])
                    ])

                embedded_lemma_characters = []
                for lemma_chars, form_chars in zip(lemma_char_inputs[:-1],
                                                   char_inputs_zeros):
                    embedded_lemma_characters.append(
                        tf.concat(1, [
                            tf.nn.embedding_lookup(lemma_char_embeddings,
                                                   lemma_chars),
                            tf.nn.embedding_lookup(lemma_char_embeddings,
                                                   form_chars)
                        ]))
            else:

                def loop(prev_state, _):
                    # it takes the previous hidden state, finds the character and formats it
                    # as input for the next time step ... used in the decoder in the "real decoding scenario"
                    out_activation = tf.matmul(prev_state,
                                               lemma_w) + lemma_b
                    prev_char_index = tf.argmax(out_activation, 1)
                    return tf.nn.embedding_lookup(lemma_char_embeddings,
                                                  prev_char_index)

                embedded_lemma_characters = []
                for lemma_chars in lemma_char_inputs[:-1]:
                    embedded_lemma_characters.append(
                        tf.nn.embedding_lookup(lemma_char_embeddings,
                                               lemma_chars))

            def sampling_loop(prev_state, i):
                # Scheduled sampling: elements whose uniform random draw
                # is <= threshold take the ground-truth embedding, the
                # rest take the embedding produced by loop().
                threshold = scheduled_sampling / (
                    scheduled_sampling + tf.exp(tf.to_float(global_step)))
                condition = tf.less_equal(
                    tf.random_uniform(
                        tf.shape(embedded_lemma_characters[0])), threshold)
                return tf.select(condition, embedded_lemma_characters[i],
                                 loop(prev_state, i))

            decoder_cell = rnn_cell.BasicLSTMCell(lemma_state_size)

            if scheduled_sampling:
                lf = sampling_loop
            else:
                lf = None

            # Training decoder: ground-truth inputs, or scheduled sampling.
            # NOTE(review): reshaped_ce_lookup exists only when
            # char_embedding_size is non-zero.
            if use_attention:
                lemma_outputs_train, _ = seq2seq.attention_decoder(
                    embedded_lemma_characters,
                    output_dropped,
                    reshaped_ce_lookup,
                    decoder_cell,
                    loop_function=lf)
            else:
                lemma_outputs_train, _ = seq2seq.rnn_decoder(
                    embedded_lemma_characters,
                    output_dropped,
                    decoder_cell,
                    loop_function=lf)

            tf.get_variable_scope().reuse_variables()
            #regularize.append(tf.get_variable('attention_decoder/BasicLSTMCell/Linear/Matrix'))

            tf.get_variable_scope().reuse_variables()

            # Runtime decoder: built after reuse was enabled on the scope
            # and always fed through loop().
            if use_attention:
                lemma_outputs_runtime, _ = \
                    seq2seq.attention_decoder(embedded_lemma_characters,
                                              output_dropped,
                                              reshaped_ce_lookup,
                                              decoder_cell,
                                              loop_function=loop)
            else:
                lemma_outputs_runtime, _ = \
                    seq2seq.rnn_decoder(embedded_lemma_characters,
                                        output_dropped,
                                        decoder_cell,
                                        loop_function=loop)

            lemma_char_logits_train = \
                [tf.matmul(o, lemma_w) + lemma_b for o in lemma_outputs_train]

            lemma_char_logits_runtime = \
                [tf.matmul(o, lemma_w) + lemma_b for o in lemma_outputs_runtime]

            self.lemmas_decoded = \
                tf.reshape(tf.transpose(tf.argmax(tf.pack(lemma_char_logits_runtime), 2)),
                           [-1, num_steps, num_chars + 1])

            # Weight 1.0 for every target character with a non-zero index.
            lemma_char_weights = []
            for lemma_chars in lemma_char_inputs[1:]:
                lemma_char_weights.append(
                    tf.to_float(tf.not_equal(lemma_chars, 0)))

            # Targets are the lemma characters shifted by one position.
            lemmatizer_loss = seq2seq.sequence_loss(
                lemma_char_logits_train, lemma_char_inputs[1:],
                lemma_char_weights)

            lemmatizer_loss_runtime = \
                seq2seq.sequence_loss(lemma_char_logits_runtime, lemma_char_inputs[1:],
                                      lemma_char_weights)

            tf.scalar_summary('train_lemma_loss_with_gt_inputs',
                              tf.reduce_mean(lemmatizer_loss),
                              collections=["train"])
            tf.scalar_summary('dev_lemma_loss_with_gt_inputs',
                              tf.reduce_mean(lemmatizer_loss),
                              collections=["dev"])

            tf.scalar_summary('train_lemma_loss_with_decoded_inputs',
                              tf.reduce_mean(lemmatizer_loss_runtime),
                              collections=["train"])
            tf.scalar_summary('dev_lemma_loss_with_decoded_inputs',
                              tf.reduce_mean(lemmatizer_loss_runtime),
                              collections=["dev"])

            # Both decoder losses are added to the tagging cost.
            self.cost += tf.reduce_mean(lemmatizer_loss) + tf.reduce_mean(
                lemmatizer_loss_runtime)

    # L2 regularization over the collected weight matrices.
    self.cost += l2 * sum(
        [tf.nn.l2_loss(variable) for variable in regularize])

    tf.scalar_summary('train_optimization_cost', self.cost,
                      collections=["train"])
    tf.scalar_summary('dev_optimization_cost', self.cost,
                      collections=["dev"])

    # Not called directly in this method.
    # NOTE(review): presumably defined so that optimizer_desc can refer to
    # it inside the eval() below -- confirm.
    def decay(learning_rate, exponent, iteration_steps):
        return tf.train.exponential_decay(learning_rate, global_step,
                                          iteration_steps, exponent,
                                          staircase=True)

    # NOTE(review): eval() executes optimizer_desc as Python code; it must
    # never come from an untrusted source.
    optimizer = eval('tf.train.' + optimizer_desc)
    self.train = optimizer.minimize(self.cost, global_step=global_step)

    if threads > 0:
        self.session = tf.Session(
            config=tf.ConfigProto(inter_op_parallelism_threads=threads,
                                  intra_op_parallelism_threads=threads))
    else:
        self.session = tf.Session()
    self.session.run(tf.initialize_all_variables())

    if write_summaries:
        self.summary_train = tf.merge_summary(tf.get_collection("train"))
        self.summary_dev = tf.merge_summary(tf.get_collection("dev"))
        # The log directory is named by creation time and experiment name.
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
        self.summary_writer = tf.train.SummaryWriter("logs/" + timestamp +
                                                     "_" + experiment_name)

    self.steps = 0
def __init__(
    self,
    vocab,
    tagset,
    alphabet,
    word_embedding_size,
    char_embedding_size,
    num_chars,
    num_steps,
    optimizer_desc,
    generate_lemmas,
    l2,
    dropout_prob_values,
    experiment_name,
    supply_form_characters_to_lemma,
    threads=0,
    seed=None,
    write_summaries=True,
    use_attention=True,
    scheduled_sampling=None,
):
    """
    Builds the tagger computation graph and initializes it in a TensorFlow
    session.

    Arguments:
        vocab: Vocabulary of word forms.
        tagset: Vocabulary of possible tags.
        alphabet: Vocabulary of possible characters.
        word_embedding_size (int): Size of the form-based word embedding.
        char_embedding_size (int): Size of character embeddings, i.e. a
            half of the size of the character-based words embeddings.
        num_chars: Maximum length of a word.
        num_steps: Maximum length of a sentence.
        optimizer_desc: Description of the optimizer. It is appended to
            "tf.train." and passed to eval(), so it must be trusted Python
            source.
        generate_lemmas: Generate lemmas during tagging (adds the character
            decoder and its losses to the graph).
        l2: Factor multiplying the summed L2 losses of the collected weight
            matrices before they are added to the cost.
        dropout_prob_values: Stored on the instance; not used further in
            this method.
        experiment_name: Suffix of the summary log directory name.
        supply_form_characters_to_lemma: If true, every lemma decoder input
            is the concatenation of a lemma character embedding and a form
            character embedding.
        threads: If positive, used for both the inter-op and the intra-op
            parallelism thread count of the session.
        seed: TensorFlow seed (not referenced in this method's body).
        write_summaries: Write summaries using TensorFlow interface.
        use_attention: Use `seq2seq.attention_decoder` instead of
            `seq2seq.rnn_decoder` for the lemma decoder.
        scheduled_sampling: If truthy, the training decoder mixes ground
            truth and decoded inputs using this value in the sampling
            threshold.
    """
    self.num_steps = num_steps
    self.num_chars = num_chars

    self.word_embedding_size = word_embedding_size
    self.char_embedding_size = char_embedding_size
    # Word embedding plus the forward and backward character LSTM states.
    self.lstm_size = word_embedding_size + 2 * char_embedding_size  ###

    self.vocab = vocab
    self.tagset = tagset
    self.alphabet = alphabet

    self.dropout_prob_values = dropout_prob_values

    # Initial states of the sentence-level LSTMs; their width is the state
    # size reported by a BasicLSTMCell of lstm_size units.
    self.forward_initial_state = tf.placeholder(
        tf.float32, [None, rnn_cell.BasicLSTMCell(self.lstm_size).state_size], name="forward_lstm_initial_state"
    )
    self.backward_initial_state = tf.placeholder(
        tf.float32, [None, rnn_cell.BasicLSTMCell(self.lstm_size).state_size], name="backward_lstm_initial_state"
    )
    self.sentence_lengths = tf.placeholder(tf.int64, [None], name="sentence_lengths")
    self.tags = tf.placeholder(tf.int32, [None, num_steps], name="ground_truth_tags")
    # Element 0 is applied to the network inputs, element 1 to the bi-LSTM
    # output (see the tf.nn.dropout calls below).
    self.dropout_prob = tf.placeholder(tf.float32, [None], name="dropout_keep_p")
    self.generate_lemmas = generate_lemmas

    global_step = tf.Variable(0, trainable=False)

    input_list = []
    # Weight matrices that take part in the L2 regularization term.
    regularize = []

    # Word-level embeddings
    if word_embedding_size:
        self.words = tf.placeholder(tf.int32, [None, num_steps], name="words")
        word_embeddings = tf.Variable(tf.random_uniform([len(vocab), word_embedding_size], -1.0, 1.0))
        we_lookup = tf.nn.embedding_lookup(word_embeddings, self.words)

        input_list.append(we_lookup)

    # Character-level embeddings
    if char_embedding_size:
        self.chars = tf.placeholder(tf.int32, [None, num_steps, num_chars], name="chars")
        self.chars_lengths = tf.placeholder(tf.int64, [None, num_steps], name="chars_lengths")

        char_embeddings = tf.Variable(tf.random_uniform([len(alphabet), char_embedding_size], -1.0, 1.0))
        ce_lookup = tf.nn.embedding_lookup(char_embeddings, self.chars)

        # Merge the batch and sentence axes so that every word is one
        # sequence of characters.
        reshaped_ce_lookup = tf.reshape(ce_lookup, [-1, num_chars, char_embedding_size], name="reshape-char_inputs")
        char_inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(1, num_chars, reshaped_ce_lookup)]

        char_inputs_lengths = tf.reshape(self.chars_lengths, [-1])

        with tf.variable_scope("char_forward"):
            char_lstm = rnn_cell.BasicLSTMCell(char_embedding_size)
            _, char_last_state = rnn.rnn(
                cell=char_lstm, inputs=char_inputs, sequence_length=char_inputs_lengths, dtype=tf.float32
            )
            # Reuse is switched on only to fetch the already created LSTM
            # weight matrix by name.
            tf.get_variable_scope().reuse_variables()
            regularize.append(tf.get_variable("RNN/BasicLSTMCell/Linear/Matrix"))

        with tf.variable_scope("char_backward"):
            char_lstm_rev = rnn_cell.BasicLSTMCell(char_embedding_size)
            _, char_last_state_rev = rnn.rnn(
                cell=char_lstm_rev,
                inputs=self._reverse_seq(char_inputs, char_inputs_lengths),
                sequence_length=char_inputs_lengths,
                dtype=tf.float32,
            )
            tf.get_variable_scope().reuse_variables()
            regularize.append(tf.get_variable("RNN/BasicLSTMCell/Linear/Matrix"))

        # Keep the second half of each final LSTM state.
        # NOTE(review): presumably the output (h) part of the concatenated
        # (c, h) state -- confirm against the BasicLSTMCell in use.
        last_char_lstm_state = tf.split(1, 2, char_last_state)[1]
        last_char_lstm_state_rev = tf.split(1, 2, char_last_state_rev)[1]

        last_char_states = tf.reshape(
            last_char_lstm_state, [-1, num_steps, char_embedding_size], name="reshape-charstates"
        )
        last_char_states_rev = tf.reshape(
            last_char_lstm_state_rev, [-1, num_steps, char_embedding_size], name="reshape-charstates_rev"
        )

        char_output = tf.concat(2, [last_char_states, last_char_states_rev])

        input_list.append(char_output)

    # All inputs correctly sliced
    input_list_dropped = [tf.nn.dropout(x, self.dropout_prob[0]) for x in input_list]
    # One tensor per sentence position, as expected by rnn.rnn.
    inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(1, num_steps, tf.concat(2, input_list_dropped))]

    with tf.variable_scope("forward"):
        lstm = rnn_cell.BasicLSTMCell(self.lstm_size)
        outputs, last_state = rnn.rnn(
            cell=lstm,
            inputs=inputs,
            dtype=tf.float32,
            initial_state=self.forward_initial_state,
            sequence_length=self.sentence_lengths,
        )

        tf.get_variable_scope().reuse_variables()
        regularize.append(tf.get_variable("RNN/BasicLSTMCell/Linear/Matrix"))

    with tf.variable_scope("backward"):
        lstm_rev = rnn_cell.BasicLSTMCell(self.lstm_size)
        outputs_rev_rev, last_state_rev = rnn.rnn(
            cell=lstm_rev,
            inputs=self._reverse_seq(inputs, self.sentence_lengths),
            dtype=tf.float32,
            initial_state=self.backward_initial_state,
            sequence_length=self.sentence_lengths,
        )

        outputs_rev = self._reverse_seq(outputs_rev_rev, self.sentence_lengths)

        tf.get_variable_scope().reuse_variables()
        regularize.append(tf.get_variable("RNN/BasicLSTMCell/Linear/Matrix"))

    # outputs_forward = tf.reshape(tf.concat(1, outputs), [-1, self.lstm_size],
    #                    name="reshape-outputs_forward")

    # outputs_backward = tf.reshape(tf.concat(1, outputs_rev), [-1, self.lstm_size],
    #                    name="reshape-outputs_backward")

    # forward_w = tf.get_variable("forward_w", [self.lstm_size, self.lstm_size])
    # backward_w = tf.get_variable("backward_w", [self.lstm_size, self.lstm_size])
    # non_linearity_bias = tf.get_variable("non_linearity_b", [self.lstm_size])

    # NOTE(review): outputs_rev has already gone through _reverse_seq and
    # is additionally passed through Python's reversed() here -- confirm
    # that this pairing of forward and backward outputs is intended.
    outputs_bidi = [tf.concat(1, [o1, o2]) for o1, o2 in zip(outputs, reversed(outputs_rev))]

    # output = tf.tanh(tf.matmul(outputs_forward, forward_w) + tf.matmul(outputs_backward, backward_w) + non_linearity_bias)
    output = tf.reshape(tf.concat(1, outputs_bidi), [-1, 2 * self.lstm_size], name="reshape-outputs_bidi")

    output_dropped = tf.nn.dropout(output, self.dropout_prob[1])

    # We are computing only the logits, not the actual softmax -- while
    # computing the loss, it is done by the sequence_loss_by_example and
    # during the runtime classification, the argmax over logits is enough.

    softmax_w = tf.get_variable("softmax_w", [2 * self.lstm_size, len(tagset)])
    logits_flatten = tf.nn.xw_plus_b(output_dropped, softmax_w, tf.get_variable("softmax_b", [len(tagset)]))
    # tf.get_variable_scope().reuse_variables()
    regularize.append(softmax_w)

    self.logits = tf.reshape(logits_flatten, [-1, num_steps, len(tagset)], name="reshape-logits")
    estimated_tags_flat = tf.to_int32(tf.argmax(logits_flatten, dimension=1))
    self.last_state = last_state

    # output mask: compute loss only if it isn't a padded word (i.e. zero index)
    output_mask = tf.reshape(tf.to_float(tf.not_equal(self.tags, 0)), [-1])

    gt_tags_flat = tf.reshape(self.tags, [-1])
    tagging_loss = seq2seq.sequence_loss_by_example(
        logits=[logits_flatten], targets=[gt_tags_flat], weights=[output_mask]
    )

    # Accuracy over the positions selected by the mask only.
    tagging_accuracy = tf.reduce_sum(
        tf.to_float(tf.equal(estimated_tags_flat, gt_tags_flat)) * output_mask
    ) / tf.reduce_sum(output_mask)
    # Every summary is registered twice, once in the "train" and once in
    # the "dev" collection, which are merged separately at the end.
    tf.scalar_summary("train_accuracy", tagging_accuracy, collections=["train"])
    tf.scalar_summary("dev_accuracy", tagging_accuracy, collections=["dev"])

    self.cost = tf.reduce_mean(tagging_loss)

    tf.scalar_summary("train_tagging_loss", tf.reduce_mean(tagging_loss), collections=["train"])
    tf.scalar_summary("dev_tagging_loss", tf.reduce_mean(tagging_loss), collections=["dev"])

    if generate_lemmas:
        with tf.variable_scope("decoder"):
            self.lemma_chars = tf.placeholder(tf.int32, [None, num_steps, num_chars + 2], name="lemma_chars")

            lemma_state_size = self.lstm_size

            # Projection from the decoder state to character logits.
            # NOTE(review): only one bound (0.5) is passed to
            # random_uniform here, unlike the embeddings below that use
            # (-0.5, 0.5) -- confirm the intended range.
            lemma_w = tf.Variable(tf.random_uniform([lemma_state_size, len(alphabet)], 0.5), name="state_to_char_w")
            lemma_b = tf.Variable(tf.fill([len(alphabet)], -math.log(len(alphabet))), name="state_to_char_b")
            # NOTE(review): the embedding width relies on "/" yielding an
            # integer (Python 2 division) -- confirm before running under
            # Python 3.
            lemma_char_embeddings = tf.Variable(
                tf.random_uniform(
                    [len(alphabet), lemma_state_size / (2 if supply_form_characters_to_lemma else 1)], -0.5, 0.5
                ),
                name="char_embeddings",
            )

            # One tensor of character indices per lemma position.
            lemma_char_inputs = [
                tf.squeeze(input_, [1])
                for input_ in tf.split(
                    1,
                    num_chars + 2,
                    tf.reshape(self.lemma_chars, [-1, num_chars + 2], name="reshape-lemma_char_inputs"),
                )
            ]

            if supply_form_characters_to_lemma:
                # NOTE(review): self.chars exists only when
                # char_embedding_size is non-zero.
                char_inputs_zeros = [
                    tf.squeeze(chars, [1])
                    for chars in tf.split(
                        1, num_chars, tf.reshape(self.chars, [-1, num_chars], name="reshape-char_inputs_zeros")
                    )
                ]
                # Append one extra all-zero position.
                char_inputs_zeros.append(char_inputs_zeros[0] * 0)

                def loop(prev_state, i):
                    # it takes the previous hidden state, finds the character and formats it
                    # as input for the next time step ... used in the decoder in the "real decoding scenario"
                    out_activation = tf.matmul(prev_state, lemma_w) + lemma_b
                    prev_char_index = tf.argmax(out_activation, 1)
                    return tf.concat(
                        1,
                        [
                            tf.nn.embedding_lookup(lemma_char_embeddings, prev_char_index),
                            tf.nn.embedding_lookup(lemma_char_embeddings, char_inputs_zeros[i]),
                        ],
                    )

                embedded_lemma_characters = []
                for lemma_chars, form_chars in zip(lemma_char_inputs[:-1], char_inputs_zeros):
                    embedded_lemma_characters.append(
                        tf.concat(
                            1,
                            [
                                tf.nn.embedding_lookup(lemma_char_embeddings, lemma_chars),
                                tf.nn.embedding_lookup(lemma_char_embeddings, form_chars),
                            ],
                        )
                    )
            else:

                def loop(prev_state, _):
                    # it takes the previous hidden state, finds the character and formats it
                    # as input for the next time step ... used in the decoder in the "real decoding scenario"
                    out_activation = tf.matmul(prev_state, lemma_w) + lemma_b
                    prev_char_index = tf.argmax(out_activation, 1)
                    return tf.nn.embedding_lookup(lemma_char_embeddings, prev_char_index)

                embedded_lemma_characters = []
                for lemma_chars in lemma_char_inputs[:-1]:
                    embedded_lemma_characters.append(tf.nn.embedding_lookup(lemma_char_embeddings, lemma_chars))

            def sampling_loop(prev_state, i):
                # Scheduled sampling: elements whose uniform random draw
                # is <= threshold take the ground-truth embedding, the
                # rest take the embedding produced by loop().
                threshold = scheduled_sampling / (scheduled_sampling + tf.exp(tf.to_float(global_step)))
                condition = tf.less_equal(tf.random_uniform(tf.shape(embedded_lemma_characters[0])), threshold)
                return tf.select(condition, embedded_lemma_characters[i], loop(prev_state, i))

            decoder_cell = rnn_cell.BasicLSTMCell(lemma_state_size)

            if scheduled_sampling:
                lf = sampling_loop
            else:
                lf = None

            # Training decoder: ground-truth inputs, or scheduled sampling.
            # NOTE(review): reshaped_ce_lookup exists only when
            # char_embedding_size is non-zero.
            if use_attention:
                lemma_outputs_train, _ = seq2seq.attention_decoder(
                    embedded_lemma_characters, output_dropped, reshaped_ce_lookup, decoder_cell, loop_function=lf
                )
            else:
                lemma_outputs_train, _ = seq2seq.rnn_decoder(
                    embedded_lemma_characters, output_dropped, decoder_cell, loop_function=lf
                )

            tf.get_variable_scope().reuse_variables()
            # regularize.append(tf.get_variable('attention_decoder/BasicLSTMCell/Linear/Matrix'))

            tf.get_variable_scope().reuse_variables()

            # Runtime decoder: built after reuse was enabled on the scope
            # and always fed through loop().
            if use_attention:
                lemma_outputs_runtime, _ = seq2seq.attention_decoder(
                    embedded_lemma_characters, output_dropped, reshaped_ce_lookup, decoder_cell, loop_function=loop
                )
            else:
                lemma_outputs_runtime, _ = seq2seq.rnn_decoder(
                    embedded_lemma_characters, output_dropped, decoder_cell, loop_function=loop
                )

            lemma_char_logits_train = [tf.matmul(o, lemma_w) + lemma_b for o in lemma_outputs_train]

            lemma_char_logits_runtime = [tf.matmul(o, lemma_w) + lemma_b for o in lemma_outputs_runtime]

            self.lemmas_decoded = tf.reshape(
                tf.transpose(tf.argmax(tf.pack(lemma_char_logits_runtime), 2)), [-1, num_steps, num_chars + 1]
            )

            # Weight 1.0 for every target character with a non-zero index.
            lemma_char_weights = []
            for lemma_chars in lemma_char_inputs[1:]:
                lemma_char_weights.append(tf.to_float(tf.not_equal(lemma_chars, 0)))

            # Targets are the lemma characters shifted by one position.
            lemmatizer_loss = seq2seq.sequence_loss(
                lemma_char_logits_train, lemma_char_inputs[1:], lemma_char_weights
            )

            lemmatizer_loss_runtime = seq2seq.sequence_loss(
                lemma_char_logits_runtime, lemma_char_inputs[1:], lemma_char_weights
            )

            tf.scalar_summary(
                "train_lemma_loss_with_gt_inputs", tf.reduce_mean(lemmatizer_loss), collections=["train"]
            )
            tf.scalar_summary("dev_lemma_loss_with_gt_inputs", tf.reduce_mean(lemmatizer_loss), collections=["dev"])

            tf.scalar_summary(
                "train_lemma_loss_with_decoded_inputs",
                tf.reduce_mean(lemmatizer_loss_runtime),
                collections=["train"],
            )
            tf.scalar_summary(
                "dev_lemma_loss_with_decoded_inputs", tf.reduce_mean(lemmatizer_loss_runtime), collections=["dev"]
            )

            # Both decoder losses are added to the tagging cost.
            self.cost += tf.reduce_mean(lemmatizer_loss) + tf.reduce_mean(lemmatizer_loss_runtime)

    # L2 regularization over the collected weight matrices.
    self.cost += l2 * sum([tf.nn.l2_loss(variable) for variable in regularize])

    tf.scalar_summary("train_optimization_cost", self.cost, collections=["train"])
    tf.scalar_summary("dev_optimization_cost", self.cost, collections=["dev"])

    # Not called directly in this method.
    # NOTE(review): presumably defined so that optimizer_desc can refer to
    # it inside the eval() below -- confirm.
    def decay(learning_rate, exponent, iteration_steps):
        return tf.train.exponential_decay(learning_rate, global_step, iteration_steps, exponent, staircase=True)

    # NOTE(review): eval() executes optimizer_desc as Python code; it must
    # never come from an untrusted source.
    optimizer = eval("tf.train." + optimizer_desc)
    self.train = optimizer.minimize(self.cost, global_step=global_step)

    if threads > 0:
        self.session = tf.Session(
            config=tf.ConfigProto(inter_op_parallelism_threads=threads, intra_op_parallelism_threads=threads)
        )
    else:
        self.session = tf.Session()
    self.session.run(tf.initialize_all_variables())

    if write_summaries:
        self.summary_train = tf.merge_summary(tf.get_collection("train"))
        self.summary_dev = tf.merge_summary(tf.get_collection("dev"))
        # The log directory is named by creation time and experiment name.
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
        self.summary_writer = tf.train.SummaryWriter("logs/" + timestamp + "_" + experiment_name)

    self.steps = 0
def _init_neural_network(self):
    """Initialize the NN (build the TensorFlow graph, create session and saver).

    The method creates the encoder/decoder input placeholders, the recurrent
    cell, two networks sharing the same variables (one for training, one for
    decoding with ``feed_previous=True``), their sequence losses, the
    optimizer with its training op, and finally the session and the saver.

    The optimizer is selected by ``self.optimizer_type``: ``'sgd'`` gives
    gradient descent, ``'adagrad'`` gives Adagrad and any other value gives
    Adam.
    """
    # set TensorFlow random seed
    tf.set_random_seed(rnd.randint(-sys.maxint, sys.maxint))

    # create placeholders for input & output (always batch-size * 1, list of up to num. steps)
    self.enc_inputs = []
    self.enc_inputs_drop = []
    for i in xrange(self.max_da_len):
        enc_input = tf.placeholder(tf.int32, [None], name=('enc_inp-%d' % i))
        self.enc_inputs.append(enc_input)

        if self.dropout_keep_prob < 1:
            # NOTE(review): dropout is applied directly to an int32 placeholder
            # here -- confirm this is intended and supported by tf.nn.dropout.
            enc_input_drop = tf.nn.dropout(enc_input, self.dropout_keep_prob,
                                           name=('enc_inp-drop-%d' % i))
            self.enc_inputs_drop.append(enc_input_drop)

    self.dec_inputs = [tf.placeholder(tf.int32, [None], name=('dec_inp-%d' % i))
                       for i in xrange(self.max_tree_len)]

    # targets are just decoder inputs shifted by one (+pad with one empty spot)
    self.targets = self.dec_inputs[1:]
    self.targets.append(tf.placeholder(tf.int32, [None], name=('target-pad')))

    # prepare cells
    # (initial_state is only created here; it is not used again in this method)
    self.initial_state = tf.placeholder(tf.float32, [None, self.emb_size])
    if self.cell_type.startswith('gru'):
        self.cell = rnn_cell.GRUCell(self.emb_size)
    else:
        self.cell = rnn_cell.BasicLSTMCell(self.emb_size)

    # a '/2' suffix of the cell type stacks the chosen cell twice
    if self.cell_type.endswith('/2'):
        self.cell = rnn_cell.MultiRNNCell([self.cell] * 2)

    # build the actual LSTM Seq2Seq network (for training and decoding)
    with tf.variable_scope(self.scope_name) as scope:

        rnn_func = embedding_rnn_seq2seq
        if self.nn_type == 'emb_attention_seq2seq':
            rnn_func = embedding_attention_seq2seq
        elif self.nn_type == 'emb_attention2_seq2seq':
            rnn_func = partial(embedding_attention_seq2seq, num_heads=2)
        elif self.nn_type == 'emb_attention_seq2seq_context':
            rnn_func = embedding_attention_seq2seq_context
        elif self.nn_type == 'emb_attention2_seq2seq_context':
            rnn_func = partial(embedding_attention_seq2seq_context, num_heads=2)

        # for training: feed_previous == False, using dropout if available
        # outputs = batch_size * num_decoder_symbols ~ i.e. output logits at each steps
        # states = cell states at each steps
        self.outputs, self.states = rnn_func(
            self.enc_inputs_drop if self.enc_inputs_drop else self.enc_inputs,
            self.dec_inputs, self.cell,
            self.da_dict_size, self.tree_dict_size,
            scope=scope)

        # the decoding network below must share the variables created above
        scope.reuse_variables()

        # for decoding: feed_previous == True
        self.dec_outputs, self.dec_states = rnn_func(
            self.enc_inputs, self.dec_inputs, self.cell,
            self.da_dict_size, self.tree_dict_size,
            feed_previous=True, scope=scope)

    # TODO use output projection ???

    # target weights
    # TODO change to actual weights, zero after the end of tree ???
    self.cost_weights = [tf.ones_like(trg, tf.float32, name='cost_weights')
                         for trg in self.targets]

    # cost
    self.tf_cost = sequence_loss(self.outputs, self.targets,
                                 self.cost_weights, self.tree_dict_size)
    self.dec_cost = sequence_loss(self.dec_outputs, self.targets,
                                  self.cost_weights, self.tree_dict_size)
    if self.use_dec_cost:
        self.cost = 0.5 * (self.tf_cost + self.dec_cost)
    else:
        self.cost = self.tf_cost

    self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")

    # optimizer (default to Adam); this must be a single if/elif/else chain,
    # otherwise the final `else` would replace an 'sgd' choice with Adam
    if self.optimizer_type == 'sgd':
        self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
    elif self.optimizer_type == 'adagrad':
        self.optimizer = tf.train.AdagradOptimizer(self.learning_rate)
    else:
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
    self.train_func = self.optimizer.minimize(self.cost)

    # initialize session
    session_config = None
    if self.max_cores:
        session_config = tf.ConfigProto(inter_op_parallelism_threads=self.max_cores,
                                        intra_op_parallelism_threads=self.max_cores)
    self.session = tf.Session(config=session_config)

    # this helps us load/save the model
    self.saver = tf.train.Saver(tf.all_variables())
def __init__(self):
    """Build the seq2seq graph, its loss and training op, and start a session.

    The sequence lengths and vocabulary sizes come from the module-level
    names SOURCE_LEN, TARGET_LEN, SOURCE_VOCAB and TARGET_VOCAB. After
    construction the instance exposes the placeholders (``encoder_inputs``,
    ``labels``), the derived ``weights`` and ``decoder_inputs``, the network
    ``outputs``/``states``, the ``loss``, the ``optimizer`` with its
    ``train_op``, and a session ``sess`` with all variables initialized.
    """
    # Set up hyperparameters
    self.num_layers = 3
    self.layer_size = 256

    # Set up the core RNN cells of the tensor network
    single_cell = rnn_cell.BasicLSTMCell(self.layer_size)
    self.cell = rnn_cell.MultiRNNCell([single_cell] * self.num_layers)

    # Set up placeholders for the inputs and outputs.
    # Leave batch size unspecified as a None shape.

    # The input problem
    self.encoder_inputs = [
        tf.placeholder(tf.int32, shape=[None], name='encoder{0}'.format(i))
        for i in range(SOURCE_LEN)
    ]
    # The correct answers
    self.labels = [
        tf.placeholder(tf.int32, shape=[None], name='labels{0}'.format(i))
        for i in range(TARGET_LEN)
    ]
    # Each item is equal, so weights are ones
    self.weights = [tf.ones_like(label, dtype=tf.float32)
                    for label in self.labels]

    # decoder_inputs has the correct output from the previous timestep,
    # with an all-zero "GO" token on the first one
    go_token = tf.zeros_like(self.labels[0], dtype=np.int32, name="GO")
    self.decoder_inputs = [go_token] + self.labels[:-1]

    # Construct the guts of the model.
    # This same model will be used for training and testing, so we
    # don't feed_previous.
    self.outputs, self.states = seq2seq.embedding_rnn_seq2seq(
        self.encoder_inputs, self.decoder_inputs, self.cell,
        len(SOURCE_VOCAB), len(TARGET_VOCAB), feed_previous=False)

    self.loss = seq2seq.sequence_loss(self.outputs, self.labels, self.weights)

    # Set up the ops we need for training. Momentum SGD is the only
    # optimizer wired up in this constructor.
    learning_rate = 0.05
    momentum = 0.9
    self.optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
    self.train_op = self.optimizer.minimize(self.loss)

    self.sess = tf.Session()
    self.sess.run(tf.initialize_all_variables())
def __init__(self, vocab_size, sequence_length, num_units, max_gradient_norm,
             batch_size, learning_rate, learning_rate_decay_factor):
    """Build a basic RNN seq2seq graph with a clipped-gradient SGD update.

    Args:
      vocab_size: width of every encoder/decoder input and of the logits.
      sequence_length: number of encoder steps; the decoder gets one more.
      num_units: number of units of the LSTM cell.
      max_gradient_norm: clip norm handed to ``tf.clip_by_global_norm``.
      batch_size: fixed batch dimension of all placeholders.
      learning_rate: initial value of the learning-rate variable.
      learning_rate_decay_factor: multiplier applied to the learning rate
        each time ``learning_rate_decay_op`` is run.
    """
    self.vocab_size = vocab_size
    self.sequence_length = sequence_length
    self.batch_size = batch_size

    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    decayed_rate = self.learning_rate * learning_rate_decay_factor
    self.learning_rate_decay_op = self.learning_rate.assign(decayed_rate)
    self.global_step = tf.Variable(0, trainable=False)

    # Output projection from cell outputs to vocabulary-sized logits.
    proj_w = training.utils.gaussian_weights_variable(
        [num_units, self.vocab_size])
    proj_b = tf.Variable(tf.zeros([self.vocab_size]))

    lstm_cell = rnn_cell.LSTMCell(num_units, vocab_size)

    symbol_shape = (batch_size, self.vocab_size)
    weight_shape = (batch_size,)

    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    # Placeholders are created step by step (encoder, decoder, weight) so
    # that their auto-generated op names keep the same numbering.
    for _ in range(sequence_length):
        self.encoder_inputs.append(
            tf.placeholder(tf.float32, shape=symbol_shape))
        self.decoder_inputs.append(
            tf.placeholder(tf.float32, shape=symbol_shape))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=weight_shape))

    # Decoder has one extra cell because it starts with the GO symbol,
    # and the targets are shifted by one.
    # Not sure this is actually useful, as it is always set to 0.
    # As this is inspired by TensorFlow seq2seq models, there might be
    # something dodgy in there.
    self.decoder_inputs.append(
        tf.placeholder(tf.float32, shape=symbol_shape))
    self.target_weights.append(np.ones((batch_size,)))

    # Targets used by the sequence loss must be integer indices, so take
    # the argmax of every decoder input after the first one.
    targets = [tf.cast(tf.argmax(step_input, 1), dtype=tf.int32)
               for step_input in self.decoder_inputs[1:]]

    outputs, self.state = seq2seq.basic_rnn_seq2seq(
        self.encoder_inputs, self.decoder_inputs, lstm_cell)
    self.logits = [tf.nn.xw_plus_b(step_output, proj_w, proj_b)
                   for step_output in outputs]

    # Only the first sequence_length steps contribute to the loss.
    loss_logits = self.logits[:self.sequence_length]
    loss_weights = self.target_weights[:self.sequence_length]
    self.loss = seq2seq.sequence_loss(
        loss_logits, targets, loss_weights, self.vocab_size)

    params = tf.trainable_variables()
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    gradients = tf.gradients(self.loss, params)
    clipped_gradients, self.gradient_norms = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.updates = opt.apply_gradients(
        zip(clipped_gradients, params), global_step=self.global_step)

    self.saver = tf.train.Saver(tf.all_variables())
cell = rnn_cell.GRUCell(memory_dim) #encoder_inputs, decoder_inputs, cell, num_encoder_symbols, num_decoder_symbols, embedding_size dec_outputs, dec_state, enc_state = seq2seq_new.embedding_rnn_seq2seq_new( enc_inp, dec_inp, cell, vocab_size, vocab_size, embedding_dim) #print dec_outputs[0], len(dec_outputs), tf.shape(dec_state) print '** enc memory ', enc_state.get_shape() print '** dec memory ', dec_state.get_shape() # Objective 1 # loss function - mean cross entropy across sequence loss = seq2seq.sequence_loss(dec_outputs, labels, weights, vocab_size) learning_rate = 0.05 momentum = 0.9 optimizer = tf.train.MomentumOptimizer(learning_rate, momentum) train_op = optimizer.minimize(loss) # Objective 2 # Set model weights W = tf.Variable(tf.random_normal([100, 1], stddev=0.35), name="weights") b = tf.Variable(tf.zeros([1]), name="biases") # Construct a linear model activation = tf.add(tf.matmul(enc_state, W), b)
def __init__(self):
    """Build the seq2seq graph, its loss and training op, and start a session.

    The sequence lengths and vocabulary sizes come from the module-level
    names SOURCE_LEN, TARGET_LEN, SOURCE_VOCAB and TARGET_VOCAB. After
    construction the instance exposes the placeholders (``encoder_inputs``,
    ``labels``), the derived ``weights`` and ``decoder_inputs``, the network
    ``outputs``/``states``, the ``loss``, the ``optimizer`` with its
    ``train_op``, and a session ``sess`` with all variables initialized.
    """
    # Set up hyperparameters
    self.num_layers = 3
    self.layer_size = 256

    # Set up the core RNN cells of the tensor network
    single_cell = rnn_cell.BasicLSTMCell(self.layer_size)
    self.cell = rnn_cell.MultiRNNCell([single_cell] * self.num_layers)

    # Set up placeholders for the inputs and outputs.
    # Leave batch size unspecified as a None shape.

    # The input problem
    self.encoder_inputs = [
        tf.placeholder(tf.int32, shape=[None], name='encoder{0}'.format(i))
        for i in range(SOURCE_LEN)
    ]
    # The correct answers
    self.labels = [
        tf.placeholder(tf.int32, shape=[None], name='labels{0}'.format(i))
        for i in range(TARGET_LEN)
    ]
    # Each item is equal, so weights are ones
    self.weights = [tf.ones_like(label, dtype=tf.float32)
                    for label in self.labels]

    # decoder_inputs has the correct output from the previous timestep,
    # with an all-zero "GO" token on the first one
    go_token = tf.zeros_like(self.labels[0], dtype=np.int32, name="GO")
    self.decoder_inputs = [go_token] + self.labels[:-1]

    # Construct the guts of the model.
    # This same model will be used for training and testing, so we
    # don't feed_previous.
    self.outputs, self.states = seq2seq.embedding_rnn_seq2seq(
        self.encoder_inputs, self.decoder_inputs, self.cell,
        len(SOURCE_VOCAB), len(TARGET_VOCAB), feed_previous=False)

    self.loss = seq2seq.sequence_loss(self.outputs, self.labels, self.weights)

    # Set up the ops we need for training. Momentum SGD is the only
    # optimizer wired up in this constructor.
    learning_rate = 0.05
    momentum = 0.9
    self.optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
    self.train_op = self.optimizer.minimize(self.loss)

    self.sess = tf.Session()
    self.sess.run(tf.initialize_all_variables())
def __init__(self, vocab_size, buckets_or_sentence_length, size, num_layers,
             max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, model_type, use_lstm=True,
             num_samples=512, forward_only=False):
    """Create the model.

    This constructor can be used to create an embedded or embedded-attention,
    bucketed or non-bucketed model made of single or multi-layer RNN cells.

    Args:
      vocab_size: Size of the vocabulary; used for both the encoder and the
        decoder symbols.
      buckets_or_sentence_length: If using buckets: A list of pairs (I, O),
        where I specifies maximum input length that will be processed in that
        bucket, and O specifies maximum output length. Training instances
        that have inputs longer than I or outputs longer than O will be
        pushed to the next bucket and padded accordingly. We assume that the
        list is sorted, e.g., [(2, 4), (8, 16)].
        Else: the maximum number of words per sentence.
      size: Number of units in each layer of the model.
      num_layers: Number of layers in the model.
      max_gradient_norm: Gradients will be clipped to maximally this norm.
      batch_size: The size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for
        decoding.
      learning_rate: Learning rate to start with.
      learning_rate_decay_factor: Decay learning rate by this much when
        needed.
      model_type: 'embedding_attention' builds the attention model; any other
        value builds the plain embedding model.
      use_lstm: If true, use BasicLSTMCell cells, otherwise GRUCell cells.
      num_samples: Number of samples for sampled softmax.
      forward_only: If set, we do not construct the backward pass in the
        model.
    """
    # Need to determine if we're using buckets or not. Exactly one of
    # self.buckets / self.max_sentence_length gets set; the local flag is
    # what the rest of this constructor branches on.
    bucketed = isinstance(buckets_or_sentence_length, list)
    if bucketed:
        self.buckets = buckets_or_sentence_length
    else:
        self.max_sentence_length = buckets_or_sentence_length

    self.vocab_size = vocab_size
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if 0 < num_samples < self.vocab_size:
        with tf.device("/cpu:0"):
            proj_w = tf.get_variable("proj_w", [size, self.vocab_size])
            proj_w_t = tf.transpose(proj_w)
            proj_b = tf.get_variable("proj_b", [self.vocab_size])
        output_projection = (proj_w, proj_b)

        def sampled_loss(inputs, labels):
            with tf.device("/cpu:0"):
                labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(proj_w_t, proj_b, inputs,
                                                  labels, num_samples,
                                                  self.vocab_size)
        softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    if use_lstm:
        single_cell = rnn_cell.BasicLSTMCell(size)
    else:
        single_cell = rnn_cell.GRUCell(size)
    cell = single_cell
    if num_layers > 1:
        cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

    # The seq2seq function: we use embedding for the input and attention
    # (if applicable). The model type is compared by value, not identity.
    if model_type == 'embedding_attention':
        seq2seq_builder = seq2seq.embedding_attention_seq2seq
    else:
        # just build embedding model, I should probably change this to throw an error
        seq2seq_builder = seq2seq.embedding_rnn_seq2seq

    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return seq2seq_builder(
            encoder_inputs, decoder_inputs, cell, vocab_size, vocab_size,
            output_projection=output_projection, feed_previous=do_decode)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    if bucketed:
        # Last bucket is the biggest one.
        encoder_range = self.buckets[-1][0]
        decoder_range = self.buckets[-1][1]
    else:
        encoder_range = decoder_range = self.max_sentence_length

    for i in range(encoder_range):
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
    for i in range(decoder_range + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = self.decoder_inputs[1:]

    # Training outputs and losses. When decoding (forward_only), the decoder
    # feeds its own previous output back in.
    do_decode = bool(forward_only)
    if bucketed:
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, self.buckets, self.vocab_size,
            lambda x, y: seq2seq_f(x, y, do_decode),
            softmax_loss_function=softmax_loss_function)
        # If we use output projection, we need to project outputs for decoding.
        if forward_only and output_projection is not None:
            for bucket_id in range(len(self.buckets)):
                self.outputs[bucket_id] = [
                    tf.nn.xw_plus_b(output, output_projection[0],
                                    output_projection[1])
                    for output in self.outputs[bucket_id]
                ]
    else:
        self.outputs, self.states = seq2seq_f(
            self.encoder_inputs, self.decoder_inputs[:-1], do_decode)
        # The loss is built on the unprojected outputs, before any
        # projection for decoding is applied below.
        self.losses = seq2seq.sequence_loss(
            self.outputs, targets, self.target_weights[:-1], self.vocab_size,
            softmax_loss_function=softmax_loss_function)
        # Project outputs for decoding
        if forward_only and output_projection is not None:
            self.outputs = [
                tf.nn.xw_plus_b(output, output_projection[0],
                                output_projection[1])
                for output in self.outputs
            ]

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    self.params = params  # Hold onto this for Woz
    if not forward_only:
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        if bucketed:
            # One gradient norm and one update op per bucket.
            self.gradient_norms = []
            self.updates = []
            for bucket_id in range(len(self.buckets)):
                gradients = tf.gradients(self.losses[bucket_id], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(opt.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step))
        else:
            # Non-bucketed: a single norm tensor and a single update op.
            gradients = tf.gradients(self.losses, params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms = norm
            self.updates = opt.apply_gradients(
                zip(clipped_gradients, params), global_step=self.global_step)

    self.saver = tf.train.Saver(tf.all_variables())
def _init_neural_network(self):
    """Initialize the NN (build the TensorFlow graph, create session and saver).

    The method creates the encoder/decoder input placeholders, the recurrent
    cell, two networks sharing the same variables (one for training, one for
    decoding with ``feed_previous=True``), their sequence losses, the
    optimizer with its training op, and finally the session and the saver.

    The optimizer is selected by ``self.optimizer_type``: ``'sgd'`` gives
    gradient descent, ``'adagrad'`` gives Adagrad and any other value gives
    Adam.
    """
    # set TensorFlow random seed
    tf.set_random_seed(rnd.randint(-sys.maxint, sys.maxint))

    # create placeholders for input & output (always batch-size * 1, list of up to num. steps)
    self.enc_inputs = []
    self.enc_inputs_drop = []
    for i in xrange(self.max_da_len):
        enc_input = tf.placeholder(tf.int32, [None], name=('enc_inp-%d' % i))
        self.enc_inputs.append(enc_input)

        if self.dropout_keep_prob < 1:
            # NOTE(review): dropout is applied directly to an int32 placeholder
            # here -- confirm this is intended and supported by tf.nn.dropout.
            enc_input_drop = tf.nn.dropout(enc_input, self.dropout_keep_prob,
                                           name=('enc_inp-drop-%d' % i))
            self.enc_inputs_drop.append(enc_input_drop)

    self.dec_inputs = [tf.placeholder(tf.int32, [None], name=('dec_inp-%d' % i))
                       for i in xrange(self.max_tree_len)]

    # targets are just decoder inputs shifted by one (+pad with one empty spot)
    self.targets = self.dec_inputs[1:]
    self.targets.append(tf.placeholder(tf.int32, [None], name=('target-pad')))

    # prepare cells
    # (initial_state is only created here; it is not used again in this method)
    self.initial_state = tf.placeholder(tf.float32, [None, self.emb_size])
    if self.cell_type.startswith('gru'):
        self.cell = rnn_cell.GRUCell(self.emb_size)
    else:
        self.cell = rnn_cell.BasicLSTMCell(self.emb_size)

    # a '/2' suffix of the cell type stacks the chosen cell twice
    if self.cell_type.endswith('/2'):
        self.cell = rnn_cell.MultiRNNCell([self.cell] * 2)

    # build the actual LSTM Seq2Seq network (for training and decoding)
    with tf.variable_scope(self.scope_name) as scope:

        rnn_func = embedding_rnn_seq2seq
        if self.nn_type == 'emb_attention_seq2seq':
            rnn_func = embedding_attention_seq2seq
        elif self.nn_type == 'emb_attention2_seq2seq':
            rnn_func = partial(embedding_attention_seq2seq, num_heads=2)
        elif self.nn_type == 'emb_attention_seq2seq_context':
            rnn_func = embedding_attention_seq2seq_context
        elif self.nn_type == 'emb_attention2_seq2seq_context':
            rnn_func = partial(embedding_attention_seq2seq_context, num_heads=2)

        # for training: feed_previous == False, using dropout if available
        # outputs = batch_size * num_decoder_symbols ~ i.e. output logits at each steps
        # states = cell states at each steps
        self.outputs, self.states = rnn_func(
            self.enc_inputs_drop if self.enc_inputs_drop else self.enc_inputs,
            self.dec_inputs, self.cell,
            self.da_dict_size, self.tree_dict_size,
            scope=scope)

        # the decoding network below must share the variables created above
        scope.reuse_variables()

        # for decoding: feed_previous == True
        self.dec_outputs, self.dec_states = rnn_func(
            self.enc_inputs, self.dec_inputs, self.cell,
            self.da_dict_size, self.tree_dict_size,
            feed_previous=True, scope=scope)

    # TODO use output projection ???

    # target weights
    # TODO change to actual weights, zero after the end of tree ???
    self.cost_weights = [tf.ones_like(trg, tf.float32, name='cost_weights')
                         for trg in self.targets]

    # cost
    self.tf_cost = sequence_loss(self.outputs, self.targets,
                                 self.cost_weights, self.tree_dict_size)
    self.dec_cost = sequence_loss(self.dec_outputs, self.targets,
                                  self.cost_weights, self.tree_dict_size)
    if self.use_dec_cost:
        self.cost = 0.5 * (self.tf_cost + self.dec_cost)
    else:
        self.cost = self.tf_cost

    self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")

    # optimizer (default to Adam); this must be a single if/elif/else chain,
    # otherwise the final `else` would replace an 'sgd' choice with Adam
    if self.optimizer_type == 'sgd':
        self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
    elif self.optimizer_type == 'adagrad':
        self.optimizer = tf.train.AdagradOptimizer(self.learning_rate)
    else:
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
    self.train_func = self.optimizer.minimize(self.cost)

    # initialize session
    session_config = None
    if self.max_cores:
        session_config = tf.ConfigProto(inter_op_parallelism_threads=self.max_cores,
                                        intra_op_parallelism_threads=self.max_cores)
    self.session = tf.Session(config=session_config)

    # this helps us load/save the model
    self.saver = tf.train.Saver(tf.all_variables())
def __init__(self, vocab_size, buckets_or_sentence_length, size, num_layers,
             max_gradient_norm, batch_size, learning_rate,
             learning_rate_decay_factor, model_type, use_lstm=True,
             num_samples=512, forward_only=False):
    """Create the model.

    This constructor can be used to create an embedded or embedded-attention,
    bucketed or non-bucketed model made of single or multi-layer RNN cells.

    Args:
      vocab_size: Size of the vocabulary; used for both the encoder and the
        decoder symbols.
      buckets_or_sentence_length: If using buckets: A list of pairs (I, O),
        where I specifies maximum input length that will be processed in that
        bucket, and O specifies maximum output length. Training instances
        that have inputs longer than I or outputs longer than O will be
        pushed to the next bucket and padded accordingly. We assume that the
        list is sorted, e.g., [(2, 4), (8, 16)].
        Else: the maximum number of words per sentence.
      size: Number of units in each layer of the model.
      num_layers: Number of layers in the model.
      max_gradient_norm: Gradients will be clipped to maximally this norm.
      batch_size: The size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for
        decoding.
      learning_rate: Learning rate to start with.
      learning_rate_decay_factor: Decay learning rate by this much when
        needed.
      model_type: 'embedding_attention' builds the attention model; any other
        value builds the plain embedding model.
      use_lstm: If true, use BasicLSTMCell cells, otherwise GRUCell cells.
      num_samples: Number of samples for sampled softmax.
      forward_only: If set, we do not construct the backward pass in the
        model.
    """
    # Need to determine if we're using buckets or not. Exactly one of
    # self.buckets / self.max_sentence_length gets set; the local flag is
    # what the rest of this constructor branches on.
    bucketed = isinstance(buckets_or_sentence_length, list)
    if bucketed:
        self.buckets = buckets_or_sentence_length
    else:
        self.max_sentence_length = buckets_or_sentence_length

    self.vocab_size = vocab_size
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if 0 < num_samples < self.vocab_size:
        with tf.device("/cpu:0"):
            proj_w = tf.get_variable("proj_w", [size, self.vocab_size])
            proj_w_t = tf.transpose(proj_w)
            proj_b = tf.get_variable("proj_b", [self.vocab_size])
        output_projection = (proj_w, proj_b)

        def sampled_loss(inputs, labels):
            with tf.device("/cpu:0"):
                labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(proj_w_t, proj_b, inputs,
                                                  labels, num_samples,
                                                  self.vocab_size)
        softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    if use_lstm:
        single_cell = rnn_cell.BasicLSTMCell(size)
    else:
        single_cell = rnn_cell.GRUCell(size)
    cell = single_cell
    if num_layers > 1:
        cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

    # The seq2seq function: we use embedding for the input and attention
    # (if applicable). The model type is compared by value, not identity.
    if model_type == 'embedding_attention':
        seq2seq_builder = seq2seq.embedding_attention_seq2seq
    else:
        # just build embedding model, I should probably change this to throw an error
        seq2seq_builder = seq2seq.embedding_rnn_seq2seq

    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return seq2seq_builder(
            encoder_inputs, decoder_inputs, cell, vocab_size, vocab_size,
            output_projection=output_projection, feed_previous=do_decode)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    if bucketed:
        # Last bucket is the biggest one.
        encoder_range = self.buckets[-1][0]
        decoder_range = self.buckets[-1][1]
    else:
        encoder_range = decoder_range = self.max_sentence_length

    for i in range(encoder_range):
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
    for i in range(decoder_range + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = self.decoder_inputs[1:]

    # Training outputs and losses. When decoding (forward_only), the decoder
    # feeds its own previous output back in.
    do_decode = bool(forward_only)
    if bucketed:
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, self.buckets, self.vocab_size,
            lambda x, y: seq2seq_f(x, y, do_decode),
            softmax_loss_function=softmax_loss_function)
        # If we use output projection, we need to project outputs for decoding.
        if forward_only and output_projection is not None:
            for bucket_id in range(len(self.buckets)):
                self.outputs[bucket_id] = [
                    tf.nn.xw_plus_b(output, output_projection[0],
                                    output_projection[1])
                    for output in self.outputs[bucket_id]
                ]
    else:
        self.outputs, self.states = seq2seq_f(
            self.encoder_inputs, self.decoder_inputs[:-1], do_decode)
        # The loss is built on the unprojected outputs, before any
        # projection for decoding is applied below.
        self.losses = seq2seq.sequence_loss(
            self.outputs, targets, self.target_weights[:-1], self.vocab_size,
            softmax_loss_function=softmax_loss_function)
        # Project outputs for decoding
        if forward_only and output_projection is not None:
            self.outputs = [
                tf.nn.xw_plus_b(output, output_projection[0],
                                output_projection[1])
                for output in self.outputs
            ]

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    self.params = params  # Hold onto this for Woz
    if not forward_only:
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        if bucketed:
            # One gradient norm and one update op per bucket.
            self.gradient_norms = []
            self.updates = []
            for bucket_id in range(len(self.buckets)):
                gradients = tf.gradients(self.losses[bucket_id], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(opt.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step))
        else:
            # Non-bucketed: a single norm tensor and a single update op.
            gradients = tf.gradients(self.losses, params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms = norm
            self.updates = opt.apply_gradients(
                zip(clipped_gradients, params), global_step=self.global_step)

    self.saver = tf.train.Saver(tf.all_variables())
dec_inp = ([tf.zeros_like(enc_inp[0], dtype=np.float32, name="GO")] + enc_inp[:-1]) # Initial memory value for recurrence. #prev_mem = tf.zeros((batch_size, memory_dim)) print("shapes", np.array(enc_inp).shape, np.array(dec_inp).shape, np.array(labels).shape) cell = rnn_cell.GRUCell(memory_dim) dec_outputs, dec_memory = seq2seq.basic_rnn_seq2seq( enc_inp, dec_inp, cell) labels_t = tf.reshape(labels, [5,100]) print(labels_t) print(dec_outputs) loss = seq2seq.sequence_loss(dec_outputs, labels_t, weights, vocab_size) tf.scalar_summary("loss", loss) #magnitude = tf.sqrt(tf.reduce_sum(tf.square(dec_memory[1]))) #tf.scalar_summary("magnitude at t=1", magnitude) summary_op = tf.merge_all_summaries() learning_rate = 0.05 momentum = 0.9 optimizer = tf.train.MomentumOptimizer(learning_rate, momentum) train_op = optimizer.minimize(loss) logdir = tempfile.mkdtemp() print(logdir) summary_writer = tf.train.SummaryWriter(logdir, sess.graph_def) sess.run(tf.initialize_all_variables())
# Script fragment: builds an embedding seq2seq model over integer token
# placeholders, its loss, summaries and a momentum training op. It relies on
# names defined outside this fragment (seq_length, batch_size, vocab_size).
memory_dim = 100
# One int32 placeholder per time step for the inputs (x) and the targets (t).
x_seq = [tf.placeholder(tf.int32, shape=(None,), name="x%i" % t) for t in range(seq_length)]
t_seq = [tf.placeholder(tf.int32, shape=(None,), name="t%i" % t) for t in range(seq_length)]
# Every target position gets weight 1.
weights = [tf.ones_like(t_i, dtype=tf.float32) for t_i in t_seq]
# Decoder input: prepend some "GO" token and drop the final token of the encoder input
dec_inp = ([tf.zeros_like(x_seq[0], dtype=np.int32, name="GO")] + x_seq[:-1])
# Initial memory value for recurrence.
# (prev_mem is not referenced again within this fragment)
prev_mem = tf.zeros((batch_size, memory_dim))
# GRU
cell = rnn_cell.GRUCell(memory_dim)
dec_outputs, dec_memory = seq2seq.embedding_rnn_seq2seq(x_seq, dec_inp, cell, vocab_size, vocab_size)
loss = seq2seq.sequence_loss(dec_outputs, t_seq, weights, vocab_size)
tf.scalar_summary("loss", loss)
# L2 norm of dec_memory[1], reported as a scalar summary.
magnitude = tf.sqrt(tf.reduce_sum(tf.square(dec_memory[1])))
tf.scalar_summary("magnitude at t=1", magnitude)
summary_op = tf.merge_all_summaries()
learning_rate = 0.05
momentum = 0.9
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
train_op = optimizer.minimize(loss)
# Fresh temporary directory whose path is printed.
logdir = tempfile.mkdtemp()
print logdir
# NOTE(review): only the header of generate_data is present in this excerpt;
# its body is not visible here.
def generate_data():
def model_with_buckets(encoder_inputs, decoder_inputs, targets, weights,
                       buckets, seq2seq_f, softmax_loss_function=None,
                       per_example_loss=False, name=None):
    """Create a sequence-to-sequence model with support for bucketing.

    The seq2seq_f argument is a function that defines a sequence-to-sequence
    model, e.g.,
      seq2seq_f = lambda x, y: basic_rnn_seq2seq(x, y, rnn_cell.GRUCell(24))

    Args:
      encoder_inputs: A list of Tensors to feed the encoder; first seq2seq
        input.
      decoder_inputs: A list of Tensors to feed the decoder; second seq2seq
        input.
      targets: A list of 1D batch-sized int32 Tensors (desired output
        sequence).
      weights: List of 1D batch-sized float-Tensors to weight the targets.
      buckets: A list of pairs of (input size, output size) for each bucket.
        The last bucket is treated as the largest one.
      seq2seq_f: A sequence-to-sequence model function; it takes 2 inputs that
        agree with encoder_inputs and decoder_inputs, and returns a pair
        consisting of outputs and states (as, e.g., basic_rnn_seq2seq).
      softmax_loss_function: Function (inputs-batch, labels-batch) ->
        loss-batch to be used instead of the standard softmax (the default if
        this is None).
      per_example_loss: Boolean. If set, the returned loss will be a
        batch-sized tensor of losses for each sequence in the batch. If unset,
        it will be a scalar with the averaged loss from all examples.
      name: Optional name for this operation, defaults to
        "model_with_buckets".

    Returns:
      A tuple of the form (outputs, losses), where:
        outputs: The outputs for each bucket. Its j'th element consists of a
          list of 2D Tensors of shape [batch_size x num_decoder_symbols] (jth
          outputs).
        losses: List of scalar Tensors, representing losses for each bucket,
          or, if per_example_loss is set, a list of 1D batch-sized float
          Tensors.

    Raises:
      ValueError: If length of encoder_inputs, targets, or weights is smaller
        than the largest (last) bucket.
    """
    max_encoder_size, max_decoder_size = buckets[-1][0], buckets[-1][1]
    if len(encoder_inputs) < max_encoder_size:
        raise ValueError("Length of encoder_inputs (%d) must be at least that "
                         "of last bucket (%d)."
                         % (len(encoder_inputs), max_encoder_size))
    if len(targets) < max_decoder_size:
        raise ValueError("Length of targets (%d) must be at least that of "
                         "last bucket (%d)."
                         % (len(targets), max_decoder_size))
    if len(weights) < max_decoder_size:
        raise ValueError("Length of weights (%d) must be at least that of "
                         "last bucket (%d)."
                         % (len(weights), max_decoder_size))

    all_inputs = encoder_inputs + decoder_inputs + targets + weights
    losses = []
    outputs = []
    with ops.op_scope(all_inputs, name, "model_with_buckets"):
        for j, bucket in enumerate(buckets):
            # The first bucket creates the model variables; every later
            # bucket reuses them so all buckets share one set of weights.
            with variable_scope.variable_scope(
                    variable_scope.get_variable_scope(),
                    reuse=True if j > 0 else None):
                bucket_outputs, _ = seq2seq_f(encoder_inputs[:bucket[0]],
                                              decoder_inputs[:bucket[1]])
                outputs.append(bucket_outputs)
                if per_example_loss:
                    losses.append(seq2seq.sequence_loss_by_example(
                        outputs[-1], targets[:bucket[1]], weights[:bucket[1]],
                        average_across_timesteps=True,
                        softmax_loss_function=softmax_loss_function))
                else:
                    losses.append(seq2seq.sequence_loss(
                        outputs[-1], targets[:bucket[1]], weights[:bucket[1]],
                        average_across_timesteps=True,
                        softmax_loss_function=softmax_loss_function))

    return outputs, losses
# Build the training graph for a basic RNN sequence-to-sequence model with an
# LSTM cell.

# Initial memory value for recurrence.
prev_mem = tf.zeros((batch_size, memory_dim))

cell = rnn_cell.BasicLSTMCell(memory_dim)
#enc_inp = np.tile(enc_inp, 2).tolist()
logits, state = seq2seq.basic_rnn_seq2seq(
    enc_inp, dec_inp, cell)#, vocab_size, vocab_size)

# Debug output: every encoder input, then the decoder logits and the labels.
for i, inp in enumerate(enc_inp):
    print(i, inp)
print("logits", logits)
print('labels', labels)

loss = seq2seq.sequence_loss(logits, labels, weights)
# NOTE(review): summary_op holds only the loss summary; the magnitude summary
# created below is not merged into it -- confirm whether
# tf.merge_all_summaries() was intended.
summary_op = tf.scalar_summary("loss", loss)

# L2 norm of the final seq2seq state. The intermediate is named
# `sum_of_squares` so the builtin `sum` is not shadowed.
square = tf.square(state)
sum_of_squares = tf.reduce_sum(square)
magnitude = tf.sqrt(sum_of_squares)
tf.scalar_summary("magnitude at t=1", magnitude)

learning_rate = 0.05
momentum = 0.9
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
train_op = optimizer.minimize(loss)

# Summaries go to a fresh temporary directory; print its path so it can be
# located afterwards.
logdir = tempfile.mkdtemp()
print(logdir)