def build_sampler_with_beam_search(self, beam_size=10, max_len=20):
    """Build the beam-search caption sampling graph.

    Batch-normalizes ``self.features`` in test mode, projects them, and
    hands an LSTM cell plus two callbacks to ``beam_decoder``.

    Args:
        beam_size: Beam width forwarded to ``beam_decoder``.
        max_len: Currently NOT forwarded; ``beam_decoder`` is always called
            with ``max_len=35`` (see the NOTE at the call site).

    Returns:
        Tuple ``(alphas, betas, sampled_captions)`` taken from the values
        returned by ``beam_decoder``.
    """
    features = self.features

    # batch normalize feature vectors
    features = self._batch_norm(features, mode='test', name='conv_features')
    features_proj = self._project_features(features=features)

    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.H)

    def tokens_to_inputs_attention_fn(model, symbols, feats, feats_proj,
                                      hidden_state, beam_size):
        """Turn the previous symbols into the next LSTM input.

        Returns ``(next_input, context, alpha, beta)``; ``beta`` is ``None``
        when the selector is disabled.
        """
        embed_symbols = model._word_embedding(
            inputs=tf.reshape(symbols, [-1]), reuse=True)
        context, alpha = self._attention_layer(
            feats, feats_proj, hidden_state, reuse=True)

        # Bind beta up front: it used to be assigned only inside the branch
        # below, so returning it raised UnboundLocalError whenever
        # self.enable_selector was false.
        # NOTE(review): presumably beam_decoder ignores beta when it is
        # called with selector=False -- confirm against beam_decoder.
        beta = None
        if self.enable_selector:
            context, beta = self._selector(context, hidden_state, reuse=True)

        next_input = tf.concat([embed_symbols, context], 1)
        # Split the flattened leading dimension back into
        # (-1, beam_size, feature) form.
        next_input = tf.reshape(
            next_input, [-1, beam_size, next_input.shape[-1]])
        return next_input, context, alpha, beta

    def outputs_to_score_attention_fn(model, symbols, outputs, beam_context,
                                      beam_size):
        """Map LSTM outputs to log-probabilities over the last axis."""
        embed_symbols = model._word_embedding(inputs=symbols, reuse=True)
        outputs = tf.reshape(outputs, [-1, outputs.shape[-1]])
        logits = model._decode_lstm(embed_symbols, outputs, beam_context)
        logits = tf.reshape(logits, [-1, beam_size, logits.shape[-1]])
        return tf.nn.log_softmax(logits)

    # NOTE(review): max_len is hard-coded to 35 here, so the `max_len`
    # argument of this method has no effect. Kept as-is because switching to
    # the argument would change the default decode length from 35 to 20.
    sampled_captions, _logprobs, alphas, betas = beam_decoder(
        lstm_cell, beam_size, self._start, self._end,
        tokens_to_inputs_attention_fn, outputs_to_score_attention_fn,
        features=features, features_proj=features_proj, max_len=35,
        selector=self.enable_selector, output_dense=True, scope='lstm',
        model=self)
    return alphas, betas, sampled_captions
def test1(self):
    """Check that beam search decodes the expected sequence.

    Runs the decoder over a ``MarkovChainCell`` for every supported
    ``cell_transform`` and checks the decoded values and their probability.
    """
    with self.test_session() as sess:
        transition_table = np.array(
            [[[0.0, 0.6, 0.4], [0.0, 0.4, 0.6], [0.0, 0.0, 1.0]]] * 3)

        def expand_last(tokens):
            # Append a trailing axis so tokens can be fed back as inputs.
            return tf.expand_dims(tokens, -1)

        for cell_transform in ('default', 'flatten', 'replicate'):
            cell = MarkovChainCell(transition_table)
            start_state = cell.zero_state(1, tf.int32)
            start_input = start_state[0]

            with tf.variable_scope('test1_{}'.format(cell_transform)):
                best_sparse, best_logprobs = beam_decoder(
                    cell=cell,
                    beam_size=7,
                    stop_token=2,
                    initial_state=start_state,
                    initial_input=start_input,
                    tokens_to_inputs_fn=expand_last,
                    max_len=5,
                    cell_transform=cell_transform,
                    output_dense=False,
                )

            # Only the cell's table variable needs initializing.
            sess.run(tf.variables_initializer([cell.log_table_var]))

            decoded = best_sparse.eval()
            assert all(decoded.values == [2])

            best_prob = np.exp(best_logprobs.eval())[0]
            assert np.isclose(best_prob, 0.4)
def test3(self):
    """Check that decoding again under a reused scope gives equal results.

    For every ``cell_transform`` the decoder is built once in a fresh
    variable scope and once more in the same scope with ``reuse=True``; the
    decoded values and log-probabilities of both runs must agree.
    """
    with self.test_session() as sess:
        transition_table = np.array(
            [[[0.0, 0.6, 0.4], [0.0, 0.4, 0.6], [0.0, 0.0, 1.0]]] * 3)

        def expand_last(tokens):
            # Append a trailing axis so tokens can be fed back as inputs.
            return tf.expand_dims(tokens, -1)

        for cell_transform in ('default', 'flatten', 'replicate'):
            cell = MarkovChainCell(transition_table)
            start_state = cell.zero_state(1, tf.int32)

            # Arguments shared by both decoder constructions.
            decoder_kwargs = dict(
                cell=cell,
                beam_size=7,
                stop_token=2,
                initial_state=start_state,
                initial_input=start_state[0],
                tokens_to_inputs_fn=expand_last,
                max_len=5,
                cell_transform=cell_transform,
                output_dense=False,
            )

            with tf.variable_scope(
                    'test3_{}'.format(cell_transform)) as scope:
                best_sparse, best_logprobs = beam_decoder(
                    scope=scope, **decoder_kwargs)

            sess.run(tf.variables_initializer([cell.log_table_var]))

            with tf.variable_scope(scope, reuse=True) as varscope:
                best_sparse_2, best_logprobs_2 = beam_decoder(
                    scope=varscope, **decoder_kwargs)

            same_values = sess.run(
                tf.equal(best_sparse.values, best_sparse_2.values))
            assert all(same_values)

            first_logprobs, second_logprobs = sess.run(
                (best_logprobs, best_logprobs_2))
            assert np.isclose(first_logprobs, second_logprobs)
def add_decoder_test(self):
    """Build the test-time beam-search decoder.

    Re-enters the 'Decoder' variable scope with ``reuse=True``, fetches the
    existing output projection ``W``/``b``, and stores the first value
    returned by ``beam_decoder`` in ``self.decoded``. Afterwards every
    variable in the global-variables collection is printed.
    """
    # Single-argument print(...) calls emit the same text on Python 2 and 3;
    # the former `print x` statements were a SyntaxError on Python 3.
    print('Adding decoder test')
    scope = 'Decoder'
    with tf.variable_scope(scope, reuse=True):
        # Use the same cell and output projection as in the decoder train
        # case.
        cell = tf.nn.rnn_cell.GRUCell(
            num_units=self.config.decoder_hidden_size)
        W = tf.get_variable('W')
        b = tf.get_variable('b')

        def output_fn(inputs):
            """Project decoder outputs to log-probabilities over the vocab."""
            original_shape = tf.shape(inputs)
            # Flatten to 2-D for the matmul, then restore the two leading
            # dimensions.
            outputs_flat = tf.reshape(
                inputs, [-1, self.config.decoder_hidden_size])
            logits_flat = tf.matmul(outputs_flat, W) + b
            logits = tf.reshape(logits_flat, [
                original_shape[0], original_shape[1],
                self.config.vocab_size
            ])
            return tf.nn.log_softmax(logits)

        def emb_fn(tokens):
            """Look up embeddings in ``self.L`` for the given token ids."""
            original_shape = tf.shape(tokens)
            outputs = tf.nn.embedding_lookup(self.L, tokens)
            return tf.reshape(outputs, [
                original_shape[0], original_shape[1],
                self.config.embedding_dim
            ])

        # Embedding of column 0 of the labels placeholder is the first
        # decoder input.
        start_tokens = tf.nn.embedding_lookup(
            self.L, self.labels_placeholder[:, 0])
        print('Start tokens shape %s' % (start_tokens.get_shape(),))

        self.decoded, _ = beam_decoder(
            cell=cell,
            beam_size=self.config.num_beams,
            stop_token=self.config.vocab_size - 1,
            initial_state=self.encoded,
            initial_input=start_tokens,
            tokens_to_inputs_fn=emb_fn,
            max_len=self.config.max_out_len,
            scope=scope,
            outputs_to_score_fn=output_fn,
            output_dense=True,
            cell_transform='replicate',
            score_upper_bound=0.0)

    # Debug listing of every global variable in the graph.
    params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    for param in params:
        print(param)
def add_decoder_test(self):
    """Build the test-time beam-search and greedy decoders.

    Re-enters the 'Decoder' variable scope with ``reuse=True`` and fetches
    the existing output projection ``W``/``b``. Sets:

    * ``self.decoded`` -- first value returned by ``beam_decoder``.
    * ``self.test_scores`` -- projected outputs of the ``rnn_decoder`` pass.
    * ``self.greedy_decoded`` -- argmax of ``self.test_scores`` over axis 2.
    """
    # print(...) with one argument behaves the same on Python 2 and 3; the
    # former `print x` statement was a SyntaxError on Python 3.
    print('Adding decoder test')
    scope = 'Decoder'
    with tf.variable_scope(scope, reuse=True):
        # Use the same output projection as in the decoder train case
        W = tf.get_variable('W')
        b = tf.get_variable('b')

        def output_fn(inputs):
            """Project decoder outputs to log-probabilities over the vocab."""
            original_shape = tf.shape(inputs)
            outputs_flat = tf.reshape(
                inputs, [-1, self.config.decoder_hidden_size])
            logits_flat = tf.matmul(outputs_flat, W) + b
            logits = tf.reshape(logits_flat, [
                original_shape[0], original_shape[1],
                self.config.vocab_size
            ])
            return tf.nn.log_softmax(logits)

        def emb_fn(tokens):
            """Look up embeddings in ``self.L`` for the given token ids."""
            original_shape = tf.shape(tokens)
            outputs = tf.nn.embedding_lookup(self.L, tokens)
            return tf.reshape(outputs, [
                original_shape[0], original_shape[1],
                self.config.embedding_dim
            ])

        start_tokens = tf.nn.embedding_lookup(
            self.L, self.labels_placeholder[:, 0])

        # Initial state: the encoder output followed by one zero tensor of
        # the same shape per decoder layer.
        init_state = tuple(
            [self.encoded] +
            [tf.zeros_like(self.encoded, dtype=tf.float32)
             for _ in range(self.config.num_dec_layers)])

        self.decoded, _ = beam_decoder(
            cell=self.cell,
            beam_size=self.config.num_beams,
            stop_token=self.config.vocab_size - 1,
            initial_state=init_state,
            initial_input=start_tokens,
            tokens_to_inputs_fn=emb_fn,
            max_len=self.config.max_out_len,
            scope=scope,
            outputs_to_score_fn=output_fn,
            output_dense=True,
            cell_transform='replicate',
            score_upper_bound=0.0)

        # Greedy decoder
        def loop_fn(prev, i):
            """Embed the argmax token of the previous step's projection."""
            indices = tf.argmax(tf.matmul(prev, W) + b, axis=1)
            return tf.nn.embedding_lookup(self.L, indices)

        decoder_inputs = tf.nn.embedding_lookup(
            self.L, ids=self.labels_placeholder)
        # One tensor per time step, with the last step dropped.
        decoder_inputs = tf.unstack(decoder_inputs, axis=1)[:-1]
        outputs, _ = tf.nn.seq2seq.rnn_decoder(
            decoder_inputs=decoder_inputs,
            initial_state=init_state,
            cell=self.cell,
            loop_function=loop_fn,
            scope=scope)

        # Convert back to tensor
        tensor_preds = tf.stack(outputs, axis=1)

        # Compute output_projection
        original_shape = tf.shape(tensor_preds)
        outputs_flat = tf.reshape(
            tensor_preds, [-1, self.config.decoder_hidden_size])
        logits_flat = tf.matmul(outputs_flat, W) + b

        # Reshape back to original
        self.test_scores = tf.reshape(
            logits_flat,
            [original_shape[0], original_shape[1], self.config.vocab_size])
        self.greedy_decoded = tf.argmax(self.test_scores, axis=2)
# Beam width handed to beam_decoder below.
beamSz = 1

# Affine projection of the image features; the result is the first RNN input.
image_embedding = tf.matmul(img, img_embedding) + img_embedding_bias

# NOTE(review): the nesting of the two variable scopes below should be
# confirmed against the file's history -- it determines the variable names.
with tf.variable_scope("RNN"):
    # One RNN step on the image embedding produces the state the beam
    # decoder starts from.
    output, state = rnn(image_embedding, state)
    # Embedding of token id 0, used as the first decoder input; id 0 is also
    # passed as stop_token below.
    previous_word = tf.nn.embedding_lookup(E, [0])

with tf.variable_scope("RNN_beam") as scope:
    best_sparse, best_logprobs = beam_decoder(
        cell=rnn,
        beam_size=beamSz,
        stop_token=0,
        initial_state=state,
        initial_input=previous_word,
        tokens_to_inputs_fn=lambda x: tf.nn.embedding_lookup(E, x),
        outputs_to_score_fn=lambda x: outputs_to_score_fn(x),
        max_len=maxlen,
        cell_transform='default',
        output_dense=True,
        scope=scope)

# Earlier step-by-step argmax decoding loop, kept for reference:
# for i in range(maxlen):
#     tf.get_variable_scope().reuse_variables()
#     output, state = rnn(previous_word, state)
#     prob = tf.matmul(output, W) + b
#     best_word = tf.argmax(prob, 1)
#     previous_word = tf.nn.embedding_lookup(E, best_word)
#     all_words.append(best_word)
def add_decoder_test(self):
    """Build the test-time beam-search and greedy decoders (memory variant).

    Re-enters the 'Decoder' variable scope with ``reuse=True`` and fetches
    the existing variables ``W``, ``b`` and ``W_ini``. Sets:

    * ``self.decoded`` -- first value returned by ``beam_decoder``.
    * ``self.test_scores`` -- projected outputs of the ``rnn_decoder`` pass.
    * ``self.greedy_decoded`` -- argmax of ``self.test_scores`` over axis 2.
    """
    # Single-argument print(...) calls emit the same text on Python 2 and 3;
    # the former `print x` statements were a SyntaxError on Python 3.
    print('Adding decoder test')
    scope = 'Decoder'
    with tf.variable_scope(scope, reuse=True):
        # Use the same output projection as in the decoder train case
        W = tf.get_variable('W')
        b = tf.get_variable('b')
        W_ini = tf.get_variable('W_ini')

        def output_fn(inputs):
            """Project decoder outputs to log-probabilities over the vocab."""
            original_shape = tf.shape(inputs)
            outputs_flat = tf.reshape(
                inputs, [-1, self.config.decoder_hidden_size])
            logits_flat = tf.matmul(outputs_flat, W) + b
            logits = tf.reshape(
                logits_flat,
                [original_shape[0], original_shape[1],
                 self.config.vocab_size])
            return tf.nn.log_softmax(logits)

        def emb_fn(tokens):
            """Look up embeddings in ``self.L`` for the given token ids."""
            original_shape = tf.shape(tokens)
            outputs = tf.nn.embedding_lookup(self.L, tokens)
            return tf.reshape(
                outputs,
                [original_shape[0], original_shape[1],
                 self.config.embedding_dim])

        start_tokens = tf.nn.embedding_lookup(
            self.L, self.labels_placeholder[:, 0])

        # Initial state so far: the entries of self.encoded, one zero tensor
        # shaped like self.encoded[0], and the same
        # (first-dim-of-encoded[0], num_cells) zero tensor listed twice.
        init_state = list(self.encoded) + \
            [tf.zeros_like(self.encoded[0])] + \
            [tf.zeros(shape=(tf.shape(self.encoded[0])[0],
                             self.config.num_cells),
                      dtype=tf.float32)] * 2

        # Initial memory: sum self.memory over axis 1, project with W_ini,
        # squash with a sigmoid, and divide by the input sequence lengths.
        summed_memory_vec = tf.reduce_sum(self.memory, axis=1)
        numer = tf.sigmoid(tf.matmul(summed_memory_vec, W_ini))
        print('Numer is %s' % (numer,))
        init_memory = numer / tf.expand_dims(
            tf.cast(self.input_seq_lens, tf.float32), 1)
        # Repeat the vector num_cells times along a new axis 1.
        init_memory = tf.expand_dims(init_memory, 1)
        init_memory = tf.tile(init_memory, [1, self.config.num_cells, 1])
        # Add Gaussian noise with standard deviation sqrt(0.1).
        init_memory = init_memory + tf.random_normal(
            shape=tf.shape(init_memory), mean=0.0, stddev=np.sqrt(0.1))
        print('Init memory %s' % (init_memory,))
        init_state += [init_memory]
        init_state = tuple(init_state)

        self.decoded, _ = beam_decoder(
            cell=self.cell,
            beam_size=self.config.num_beams,
            stop_token=self.config.vocab_size - 1,
            initial_state=init_state,
            initial_input=start_tokens,
            tokens_to_inputs_fn=emb_fn,
            max_len=self.config.max_out_len,
            scope=scope,
            outputs_to_score_fn=output_fn,
            output_dense=True,
            cell_transform='replicate',
            score_upper_bound=self.config.beam_threshold)

        # Greedy decoder
        def loop_fn(prev, i):
            """Embed the argmax token of the previous step's projection."""
            indices = tf.argmax(tf.matmul(prev, W) + b, axis=1)
            return tf.nn.embedding_lookup(self.L, indices)

        decoder_inputs = tf.nn.embedding_lookup(
            self.L, ids=self.labels_placeholder)
        # One tensor per time step, with the last step dropped.
        decoder_inputs = tf.unstack(decoder_inputs, axis=1)[:-1]
        outputs, _ = tf.nn.seq2seq.rnn_decoder(
            decoder_inputs=decoder_inputs,
            initial_state=init_state,
            cell=self.cell,
            loop_function=loop_fn,
            scope=scope)

        # Convert back to tensor
        tensor_preds = tf.stack(outputs, axis=1)

        # Compute output_projection
        original_shape = tf.shape(tensor_preds)
        outputs_flat = tf.reshape(
            tensor_preds, [-1, self.config.decoder_hidden_size])
        logits_flat = tf.matmul(outputs_flat, W) + b

        # Reshape back to original
        self.test_scores = tf.reshape(
            logits_flat,
            [original_shape[0], original_shape[1], self.config.vocab_size])
        self.greedy_decoded = tf.argmax(self.test_scores, axis=2)