def build_predict_text_graph(self, image, decode_method='greedy', beam_size=5, convert_unk=True):
  attention_states, initial_state, image_emb = self.encoder.encode(self.process(image))
  if FLAGS.image_as_init_state:
    # for im2txt style: run one extra step at the start to feed the image
    with tf.variable_scope(self.decoder.scope) as scope:
      batch_size = melt.get_batch_size(image_emb)
      zero_state = self.decoder.cell.zero_state(batch_size, dtype=tf.float32)
      _, initial_state = self.decoder.cell(image_emb, zero_state)
      image_emb = self.decoder.get_start_embedding_input(batch_size)
  elif image_emb is None:  # TODO check
    batch_size = melt.get_batch_size(image)
    image_emb = self.decoder.get_start_embedding_input(batch_size)

  max_words = TEXT_MAX_WORDS
  if decode_method == SeqDecodeMethod.greedy:
    return self.decoder.generate_sequence_greedy(image_emb,
                                                 max_words=max_words,
                                                 initial_state=initial_state,
                                                 attention_states=attention_states,
                                                 convert_unk=convert_unk)
  elif decode_method == SeqDecodeMethod.multinomal:
    return self.decoder.generate_sequence_multinomial(image_emb,
                                                      max_words=max_words,
                                                      initial_state=initial_state,
                                                      attention_states=attention_states,
                                                      convert_unk=convert_unk)
  else:
    if decode_method == SeqDecodeMethod.ingraph_beam:
      decode_func = self.decoder.generate_sequence_ingraph_beam
    elif decode_method == SeqDecodeMethod.outgraph_beam:
      decode_func = self.decoder.generate_sequence_outgraph_beam
    else:
      raise ValueError('not supported decode_method: %s' % decode_method)
    return decode_func(image_emb,
                       max_words=max_words,
                       initial_state=initial_state,
                       beam_size=beam_size,
                       convert_unk=convert_unk,
                       attention_states=attention_states,
                       length_normalization_factor=FLAGS.length_normalization_factor)
def build_image_words_sim_graph(self):
  with tf.variable_scope(self.scope):
    # [1, atten_size, emb_dim]
    image_feature = self.forward_image_feature(self.get_image_feature_feed())
    # [vocab_size, 1, emb_dim]
    word_feature, words = self.forward_word_feature()
    word_feature_batch_size = melt.get_batch_size(word_feature)
    # [vocab_size, atten_size, emb_dim]
    image_feature = tf.contrib.seq2seq.tile_batch(image_feature, word_feature_batch_size)
    # [vocab_size, 1]
    score = self.compute_image_text_sim(image_feature, word_feature)
    # [1, vocab_size]
    score = tf.expand_dims(tf.squeeze(score), 0)
    return score
def encode(self, seq, seq_len=None, output_method='all'):
  with tf.variable_scope(self.scope):
    num_filters = self.num_units
    seqs = [seq]
    batch_size = melt.get_batch_size(seq)

    kernel_sizes = [3, 5, 7, 9, 11, 13]
    assert self.num_layers <= len(kernel_sizes)
    for layer in range(self.num_layers):
      # dropout the previous layer's output, then convolve it
      # (the original dropped `seq` but convolved `seqs[-1]`, silently
      # discarding the dropout)
      seq = melt.dropout(seqs[-1], self.keep_prob, self.is_train)
      seq = tf.layers.conv1d(seq, num_filters,
                             kernel_size=kernel_sizes[layer],
                             padding='same',
                             activation=tf.nn.relu)
      seqs.append(seq)

    outputs = tf.concat(seqs[1:], 2)
    # no dropout inside the convnet itself; apply dropout outside.
    # kept compatible with the rnn encoders, which also return state
    return melt.rnn.encode_outputs(outputs, seq_len, output_method)
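# A minimal numpy sketch (not the melt implementation) of the shape bookkeeping
# in the conv encoder above: each conv layer maps [batch, steps, channels] to
# [batch, steps, num_filters] with 'same' padding and ReLU, and the final output
# concatenates every layer into [batch, steps, num_layers * num_filters].
# toy_conv1d_same and its random weights are illustrative stand-ins.
import numpy as np

def toy_conv1d_same(x, num_filters, kernel_size, rng):
    batch, steps, channels = x.shape
    w = rng.standard_normal((kernel_size, channels, num_filters)) * 0.1
    pad = kernel_size // 2
    xp = np.pad(x, ((0, 0), (pad, pad), (0, 0)))
    out = np.zeros((batch, steps, num_filters))
    for t in range(steps):
        window = xp[:, t:t + kernel_size, :]  # [batch, k, channels]
        out[:, t, :] = np.maximum(np.einsum('bkc,kcf->bf', window, w), 0.)  # ReLU
    return out

rng = np.random.default_rng(0)
seq = rng.standard_normal((2, 7, 16))        # [batch, steps, emb_dim]
layers, kernel_sizes, num_filters = [], [3, 5, 7], 8
x = seq
for k in kernel_sizes:
    x = toy_conv1d_same(x, num_filters, k, rng)
    layers.append(x)
outputs = np.concatenate(layers, axis=2)     # per-layer outputs, concatenated
assert outputs.shape == (2, 7, 24)           # [2, 7, num_layers * num_filters]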
def __init__(self,
             input,
             max_steps,
             initial_state,
             beam_size=7,
             done_token=0,
             batch_size=None,
             num_classes=None,
             output_fn=None,
             length_normalization_factor=0.,
             topn=1,
             need_softmax=True,
             logprobs_history=False,
             alignment_history=False,
             fast_greedy=False):
  self.length_normalization_factor = length_normalization_factor
  self.topn = topn
  self.need_softmax = need_softmax
  self.beam_size = beam_size
  self.batch_size = batch_size
  if self.batch_size is None:
    self.batch_size = melt.get_batch_size(input)
  self.max_len = max_steps
  self.num_classes = num_classes
  self.done_token = done_token
  self.pad_token = 0
  self.output_fn = output_fn
  self.past_logprobs = None
  self.past_symbols = None
  self.past_step_logprobs = None
  self.fast_greedy = fast_greedy
  if self.fast_greedy:
    self.finished_beams = tf.zeros((self.batch_size, self.max_len), dtype=tf.int32)
    self.logprobs_finished_beams = tf.ones((self.batch_size,), dtype=tf.float32) * -float('inf')
  else:
    self.path_list = []
    self.logprobs_list = []
    self.step_logprobs_list = []
    self.alignments_path_list = []
  # the rnn_decoder function needs one extra loop step: step i == 0 emits no
  # word, so word steps start from i == 1
  self.decoder_inputs = [None] * (self.max_len + 1)
  self.decoder_inputs[0] = tf.contrib.seq2seq.tile_batch(input, beam_size)
  self.initial_state = initial_state
  self.final_state = None
  self.log_probs_history = None
  self.alignment_history = None
  self.need_logprobs_history = logprobs_history
  self.need_alignment_history = alignment_history
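# Hedged sketch: tf.contrib.seq2seq.tile_batch, used above to build
# decoder_inputs[0], repeats each batch entry beam_size times contiguously
# along axis 0, so every example carries beam_size copies. np.repeat
# reproduces the same layout.
import numpy as np

def tile_batch(x, multiplier):
    return np.repeat(x, multiplier, axis=0)  # [batch, ...] -> [batch * multiplier, ...]

inp = np.array([[1., 2.], [3., 4.]])         # [batch=2, emb=2]
tiled = tile_batch(inp, 3)                   # rows: e0,e0,e0,e1,e1,e1
assert tiled.shape == (6, 2) and (tiled[0] == tiled[2]).all()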
def beam_search_step(self, input, state, cell, beam_size,
                     attention_construct_fn=None, input_text=None):
  output, state = cell(input, state)

  if hasattr(state, 'alignments'):
    tf.add_to_collection('attention_alignments', state.alignments)
    tf.add_to_collection('beam_search_alignments',
                         tf.get_collection('attention_alignments')[-1])

  # TODO: attention decoding still needs input_text fed at each step after
  # initialization. Does that force attention_keys and attention_values (i.e.
  # the encoding) to be recomputed every step? Can we avoid it? No better
  # method found; if encoding is slow, consider feeding attention_keys and
  # attention_values at each step instead.
  if not FLAGS.decode_use_alignment:
    if FLAGS.gen_only:
      output_fn = self.output_fn
      logits = output_fn(output)
    else:
      indices = melt.batch_values_to_indices(tf.to_int32(input_text))
      batch_size = melt.get_batch_size(input)
      if FLAGS.copy_only:
        output_fn_ = self.copy_output_fn
      else:
        output_fn_ = self.gen_copy_output_fn
      output_fn = lambda cell_output, cell_state: output_fn_(
          indices, batch_size, cell_output, cell_state)
      logits = output_fn(output, state)

    if FLAGS.gen_copy_switch and FLAGS.switch_after_softmax:
      logprobs = tf.log(logits)
    else:
      logprobs = tf.nn.log_softmax(logits)

    if FLAGS.decode_copy:
      logprobs = melt.gather_cols(logprobs, tf.to_int32(input_text))
  else:
    logits = state.alignments
    # the original referenced an undefined name `scores` here; slicing the
    # alignment logits to the input_text length appears to be the intent
    logits = logits[:, :tf.shape(input_text)[-1]]
    logprobs = tf.nn.log_softmax(logits)

  top_logprobs, top_ids = tf.nn.top_k(logprobs, beam_size)
  # too slow: transferring large tensors between python and c++ is costly
  # top_logprobs, top_ids = tf.nn.top_k(logprobs, self.vocab_size)

  if input_text is not None and FLAGS.decode_copy:
    top_ids = tf.nn.embedding_lookup(input_text, top_ids)

  if hasattr(state, 'cell_state'):
    state = state.cell_state

  return output, state, top_logprobs, top_ids
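# Minimal numpy version (assuming no attention and no copy mode) of the core of
# beam_search_step: turn logits into log-probabilities, then keep the beam_size
# best extensions per row, mirroring tf.nn.top_k(logprobs, beam_size).
import numpy as np

def log_softmax(logits):
    z = logits - logits.max(axis=-1, keepdims=True)
    return z - np.log(np.exp(z).sum(axis=-1, keepdims=True))

def top_k(x, k):
    ids = np.argsort(-x, axis=-1)[:, :k]
    return np.take_along_axis(x, ids, axis=-1), ids

logits = np.array([[2.0, 0.5, 1.0, -1.0]])   # [batch=1, vocab=4]
top_logprobs, top_ids = top_k(log_softmax(logits), k=2)
assert top_ids[0].tolist() == [0, 2]         # two most likely extensions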
def _encode(self, image):
  attention_states, initial_state, image_emb = self.encoder.encode(self.process(image))
  if FLAGS.image_as_init_state:
    # for im2txt style: run one extra step at the start to feed the image
    with tf.variable_scope(self.decoder.scope):
      batch_size = melt.get_batch_size(image_emb)
      zero_state = self.decoder.cell.zero_state(batch_size, dtype=tf.float32)
      _, initial_state = self.decoder.cell(image_emb, zero_state)
      image_emb = self.decoder.get_start_embedding_input(batch_size)
  elif image_emb is None:  # TODO check
    batch_size = melt.get_batch_size(image)
    image_emb = self.decoder.get_start_embedding_input(batch_size)

  image_emb = self._post_deal_image_embedding(image_emb, image)
  # attention_states, initial_state, image_emb = self._post_deal(attention_states, initial_state, image_emb)
  return attention_states, initial_state, image_emb
def generate_sequence_greedy(self, input, max_words,
                             initial_state=None, attention_states=None,
                             convert_unk=True, input_text=None, emb=None):
  """
  Greedy search decoding.
  For beam search, use generate_sequence_by_beam_search, which takes
  additional params such as beam_size.
  """
  if emb is None:
    emb = self.emb

  batch_size = melt.get_batch_size(input)

  if attention_states is None:
    cell = self.cell
  else:
    cell = self.prepare_attention(attention_states,
                                  initial_state=initial_state,
                                  score_as_alignment=self.score_as_alignment)
    initial_state = None

  state = cell.zero_state(batch_size, tf.float32) if initial_state is None else initial_state

  helper = melt.seq2seq.GreedyEmbeddingHelper(embedding=emb,
                                              first_input=input,
                                              end_token=self.end_id)

  if FLAGS.gen_only:
    output_fn = self.output_fn
  else:
    indices = melt.batch_values_to_indices(tf.to_int32(input_text))
    if FLAGS.copy_only:
      output_fn_ = self.copy_output_fn
    else:
      output_fn_ = self.gen_copy_output_fn
    output_fn = lambda cell_output, cell_state: output_fn_(
        indices, batch_size, cell_output, cell_state)

  my_decoder = melt.seq2seq.BasicDecoder(cell=cell,
                                         helper=helper,
                                         initial_state=state,
                                         vocab_size=self.vocab_size,
                                         output_fn=output_fn)
  outputs, _, _ = melt.seq2seq.dynamic_decode(my_decoder,
                                              maximum_iterations=max_words,
                                              scope=self.scope)
  generated_sequence = outputs.sample_id
  # like beam search: return (sequence, score)
  return generated_sequence, tf.zeros([batch_size,])
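# Plain-Python sketch of what GreedyEmbeddingHelper + dynamic_decode do above.
# step_fn and embedding are hypothetical stand-ins (step_fn plays the role of
# cell plus output projection): at each step, feed the embedding of the
# previous argmax token and stop at end_token or max_words.
def greedy_decode(step_fn, embedding, first_input, end_token, max_words):
    inp, state, seq = first_input, None, []
    for _ in range(max_words):
        logits, state = step_fn(inp, state)               # one decoder step
        token = max(range(len(logits)), key=lambda i: logits[i])  # argmax
        if token == end_token:
            break
        seq.append(token)
        inp = embedding[token]                            # feed back the choice
    return seq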
def encode(self, image_feature):
  if FLAGS.scene_model:
    if not hasattr(self.encoder, 'scene_feature'):
      self.encoder.scene_feature = self.scene.feature_feed
  attention_states, initial_state, image_emb = self.encoder.encode(self.process(image_feature))
  if FLAGS.image_as_init_state:
    # for im2txt style: one extra step at the start; experimental, rarely used
    with tf.variable_scope(self.decoder.scope) as scope:
      # the original passed the undefined name `input` to get_batch_size; the
      # cell input below is image_emb, so use its batch size
      zero_state = self.decoder.cell.zero_state(batch_size=melt.get_batch_size(image_emb),
                                                dtype=tf.float32)
      _, initial_state = self.decoder.cell(image_emb, zero_state)
      image_emb = None
  self.image_emb = image_emb
  image_emb = self._post_deal_image_embedding(image_emb, image_feature)
  # attention_states, initial_state, image_emb = self._post_deal(attention_states, initial_state, image_emb)
  return attention_states, initial_state, image_emb
def build_predict_text_graph(self, input_text, decode_method=0, beam_size=5, convert_unk=True):
  with tf.variable_scope("encode"):
    encoder_output, state = self.encoder.encode(input_text)
    if not FLAGS.use_attention:
      encoder_output = None
  with tf.variable_scope("decode"):
    # try to use static shape if possible
    batch_size = melt.get_batch_size(input_text)
    decoder_input = self.decoder.get_start_embedding_input(batch_size)
    max_words = FLAGS.decode_max_words if FLAGS.decode_max_words else TEXT_MAX_WORDS
    if decode_method == SeqDecodeMethod.greedy:
      input_text = self.encoder.sequence
      return self.decoder.generate_sequence_greedy(decoder_input,
                                                   max_words=max_words,
                                                   initial_state=state,
                                                   attention_states=encoder_output,
                                                   convert_unk=convert_unk,
                                                   input_text=input_text)
    else:
      if decode_method == SeqDecodeMethod.beam:
        decode_func = self.decoder.generate_sequence_beam
      elif decode_method == SeqDecodeMethod.beam_search:
        decode_func = self.decoder.generate_sequence_beam_search
      else:
        raise ValueError('not supported decode_method: %d' % decode_method)
      input_text, input_text_length = melt.pad(input_text, end_id=self.encoder.end_id)
      return decode_func(decoder_input,
                         max_words=max_words,
                         initial_state=state,
                         attention_states=encoder_output,
                         beam_size=beam_size,
                         convert_unk=convert_unk,
                         length_normalization_factor=FLAGS.length_normalization_factor,
                         input_text=input_text,
                         input_text_length=input_text_length)
def build_graph(self, image_feature, text, neg_image_feature=None, neg_text=None,
                exact_prob=False, exact_loss=False):
  attention_states, initial_state, image_emb = self.encoder.encode(self.process(image_feature))

  if not FLAGS.image_as_init_state:
    # the common path
    scores = self.decoder.sequence_loss(text,
                                        input=image_emb,
                                        initial_state=initial_state,
                                        attention_states=attention_states,
                                        exact_prob=exact_prob,
                                        exact_loss=exact_loss)
  else:
    # for im2txt style: one extra step at the start; experimental, rarely used
    with tf.variable_scope(self.decoder.scope) as scope:
      # the original used the undefined name `input`; the image embedding is
      # what gets fed through the extra step
      zero_state = self.decoder.cell.zero_state(batch_size=melt.get_batch_size(image_emb),
                                                dtype=tf.float32)
      _, initial_state = self.decoder.cell(image_emb, zero_state)
      # <GO> will be padded in decoder.sequence_loss
      scores = self.decoder.sequence_loss(text,
                                          input=None,
                                          initial_state=initial_state,
                                          attention_states=attention_states,
                                          exact_prob=exact_prob,
                                          exact_loss=exact_loss)

  if not self.is_training and not self.is_predict:  # evaluate mode
    tf.add_to_collection('scores', scores)

  if not self.is_predict:
    loss = tf.reduce_mean(scores)
  else:
    loss = scores

  return loss
def build_graph(self, image_feature, text, neg_image_feature=None, neg_text=None, exact_loss=False):
  image_emb = self.build_image_embeddings(image_feature)
  attention_states = None
  if FLAGS.show_atten_tell:
    image_emb, attention_states = self.init_attention(image_emb)

  if not FLAGS.image_as_init_state:
    scores = self.decoder.sequence_loss(text,
                                        input=image_emb,
                                        attention_states=attention_states,
                                        exact_loss=exact_loss)
  else:
    # for im2txt style: one extra step at the start
    with tf.variable_scope(self.decoder.scope) as scope:
      zero_state = self.decoder.cell.zero_state(batch_size=melt.get_batch_size(image_emb),
                                                dtype=tf.float32)
      _, initial_state = self.decoder.cell(image_emb, zero_state)
      # <GO> will be padded in decoder.sequence_loss
      scores = self.decoder.sequence_loss(text,
                                          initial_state=initial_state,
                                          attention_states=attention_states,
                                          exact_loss=exact_loss)

  if not self.is_training and not self.is_predict:  # evaluate mode
    tf.add_to_collection('scores', scores)

  if not self.is_predict:
    loss = tf.reduce_mean(scores)
  else:
    loss = scores

  return loss
def encode(self, image_features):
  image_emb = features2feature(image_features, is_training=self.is_training)
  image_features = tf.concat([image_features, tf.expand_dims(image_emb, 1)], 1)
  image_embs = self.build_image_embeddings(image_features)
  # shapes e.g. image_embs [128, 64, 512], image_emb [128, 512]
  image_emb = image_embs[:, -1]
  image_embs = image_embs[:, :-1]
  image_embs = tf.concat(
      [image_embs,
       tf.tile(tf.expand_dims(self.pos_emb, 0), [melt.get_batch_size(image_embs), 1, 1])], 1)

  # project so the result looks like rnn encoder outputs
  with tf.variable_scope("attention_embedding") as scope:
    encoder_output = tf.contrib.layers.fully_connected(
        inputs=image_embs,
        num_outputs=FLAGS.rnn_hidden_size,
        activation_fn=None,
        weights_initializer=self.initializer,
        biases_initializer=None,
        scope=scope)
  state = None
  # image_emb = tf.reduce_mean(image_embs, 1)
  return encoder_output, state, image_emb
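# Shape sketch of the positional-embedding concat above: a pos_emb table of
# shape [n_pos, dim] is tiled once per batch entry and appended to the image
# region embeddings along axis 1 (the sizes here are illustrative).
import numpy as np

batch, n_regions, n_pos, dim = 4, 64, 8, 512
image_embs = np.zeros((batch, n_regions, dim))
pos_emb = np.zeros((n_pos, dim))
tiled = np.tile(pos_emb[None, :, :], (batch, 1, 1))        # [4, 8, 512]
image_embs = np.concatenate([image_embs, tiled], axis=1)   # [4, 72, 512]
assert image_embs.shape == (batch, n_regions + n_pos, dim)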
def build_graph(self, image_feature, text, neg_image_feature=None, neg_text=None,
                exact_prob=False, exact_loss=False, weights=None):
  scope = tf.get_variable_scope()
  if not FLAGS.showtell_noimage:
    with tf.variable_scope(FLAGS.showtell_encode_scope or scope):
      attention_states, initial_state, image_emb = self.encode(image_feature)
      if image_emb is not None:
        assert not FLAGS.add_text_start, 'if image emb is the input, must not pad start mark before sentence'
      else:
        assert FLAGS.add_text_start, 'if image emb is not the input, must pad start mark before sentence'
  else:
    print('Language only mode!', file=sys.stderr)
    image_emb = tf.zeros([melt.get_batch_size(text), self.emb_dim])
    initial_state = None
    attention_states = None

  with tf.variable_scope(FLAGS.showtell_decode_scope or scope):
    # <GO> will be padded in decoder.sequence_loss if FLAGS.image_as_init_state
    scores = self.decoder.sequence_loss(text,
                                        input=image_emb,
                                        initial_state=initial_state,
                                        attention_states=attention_states,
                                        exact_prob=exact_prob,
                                        exact_loss=exact_loss,
                                        vocab_weights=self.idf_weights if self.is_training else None,
                                        weights=weights if self.is_training else None)
    loss = scores

    if FLAGS.reinforcement_learning and self.is_training:
      assert not FLAGS.image_as_init_state, 'im2txt style not supported (untested) for reinforcement_learning'
      assert self.rl, 'need to set rl for reinforcement_learning'
      tf.get_variable_scope().reuse_variables()
      max_words = TEXT_MAX_WORDS
      convert_unk = True
      # borrowed from https://github.com/arieling/SelfCriticalSequenceTraining-tensorflow
      # scores is -(negative log loss)
      sampled_caption, sampled_loss = self.decoder.generate_sequence_multinomial(
          image_emb,
          max_words=max_words,
          initial_state=initial_state,
          attention_states=attention_states,
          convert_unk=convert_unk,
          need_logprobs=True)
      self.rl.sampled_caption = sampled_caption
      greedy_caption, _ = self.decoder.generate_sequence_greedy(
          image_emb,
          max_words=max_words,
          initial_state=initial_state,
          attention_states=attention_states,
          convert_unk=convert_unk,
          need_logprobs=False)
      self.rl.greedy_caption = greedy_caption
      ratio = FLAGS.reinforcement_ratio
      # loss and sampled_loss must have the same shape:
      # [batch_size] or [batch_size * text_length]
      loss = ratio * (self.rl.rewards_feed - self.rl.baseline_feed) * sampled_loss \
          + (1 - ratio) * loss
      # loss = -loss

  if not self.is_predict:
    loss = tf.reduce_mean(loss)

  # if not self.is_training and not self.is_predict:  # evaluate mode
  if self.is_training:
    tf.add_to_collection('train_scores', scores)
  elif not self.is_predict:
    tf.add_to_collection('eval_scores', scores)

  if FLAGS.discriminant_loss_ratio > 0 and self.is_training:
    assert neg_text is not None
    tf.get_variable_scope().reuse_variables()
    max_words = TEXT_MAX_WORDS
    convert_unk = True
    greedy_caption, _ = self.decoder.generate_sequence_greedy(
        image_emb,
        max_words=max_words,
        initial_state=initial_state,
        attention_states=attention_states,
        convert_unk=convert_unk,
        need_logprobs=False)
    text_feature = normalize(self.encoder2.encode(text, self.emb))
    caption_feature = normalize(self.encoder2.encode(greedy_caption, self.emb))
    pos_score = compute_sim(caption_feature, text_feature)
    tf.add_to_collection('pos_score', pos_score)
    # neg_text = neg_text[:, 0, :]
    # neg_text_feature = normalize(self.encoder2.encode(neg_text, self.emb))
    # neg_score = compute_sim(caption_feature, neg_text_feature)
    # tf.add_to_collection('neg_score', neg_score)
    # discriminant_loss = pairwise_loss(pos_score, neg_score)
    discriminant_loss = tf.reduce_mean((1. - pos_score) / 2.)
    # this is a mean loss, so the reduced loss can just add discriminant_loss * ratio
    tf.add_to_collection('discriminant_loss', discriminant_loss)
    ratio = FLAGS.discriminant_loss_ratio
    tf.add_to_collection('gen_loss', loss)
    loss += ratio * discriminant_loss

  if FLAGS.alignment_history and self.is_training:
    alignment_history = self.decoder.alignment_history
    tf.add_to_collection('alignment_history', alignment_history)
    if FLAGS.alignment_loss_ratio > 0:
      lengths = self.decoder.final_sequence_lengths
      alignment_loss = self.calc_alignment_loss(alignment_history, lengths)
      tf.add_to_collection('alignment_loss', alignment_loss)
      # alignment_loss can be around 4.1
      ratio = FLAGS.alignment_loss_ratio
      # loss = (1 - ratio) * loss + ratio * alignment_loss
      loss += ratio * alignment_loss

  self.main_loss = loss

  if self.is_predict:
    loss = tf.squeeze(loss)

  return loss
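# The self-critical weighting used in the reinforcement_learning branch above,
# as a numpy sketch: the sampled caption's loss is scaled by
# (reward - baseline), where the baseline is the greedy caption's reward, then
# mixed with the MLE loss. Rewards here are illustrative sentence scores
# (e.g. CIDEr); the rewards/baseline feeds are computed outside the graph.
import numpy as np

def scst_loss(mle_loss, sampled_loss, rewards, baseline, ratio):
    # all arguments are per-example arrays of shape [batch]
    return ratio * (rewards - baseline) * sampled_loss + (1. - ratio) * mle_loss

mle = np.array([2.3, 1.7]); sampled = np.array([2.9, 1.2])
rewards = np.array([0.8, 0.3]); baseline = np.array([0.5, 0.4])
print(scst_loss(mle, sampled, rewards, baseline, ratio=0.5))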
def dupimage_process(self, image_feature):
  processed_image_feature = self.image_process_fn(tf.slice(image_feature, [0], [1]))
  # the original assigned the tiled result to a misspelled name
  # (`preocessed_...`) and returned the untiled tensor, which is why the
  # tiling "seemed not to work"
  processed_image_feature = tf.contrib.seq2seq.tile_batch(processed_image_feature,
                                                          melt.get_batch_size(image_feature))
  return processed_image_feature
def encode(self, inputs, seq_len, emb=None, concat_layers=True, output_method=OutputMethod.all):
  if emb is not None:
    inputs = tf.nn.embedding_lookup(emb, inputs)

  outputs = [inputs]
  keep_prob = self.keep_prob
  num_units = self.num_units
  is_train = self.is_train

  with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
    for layer in range(self.num_layers):
      input_size_ = melt.get_shape(inputs, -1) if layer == 0 else 2 * num_units
      batch_size = melt.get_batch_size(inputs)

      with tf.variable_scope("fw_{}".format(layer)):
        gru_fw = tf.contrib.rnn.GRUCell(num_units)
        if not self.share_dropout:
          mask_fw = dropout(tf.ones([batch_size, 1, input_size_], dtype=tf.float32),
                            keep_prob=keep_prob, is_train=is_train, mode=self.dropout_mode)
        else:
          if self.dropout_mask_fw[layer] is None:
            mask_fw = dropout(tf.ones([batch_size, 1, input_size_], dtype=tf.float32),
                              keep_prob=keep_prob, is_train=is_train, mode=self.dropout_mode)
            self.dropout_mask_fw[layer] = mask_fw
          else:
            mask_fw = self.dropout_mask_fw[layer]
        if self.train_init_state:
          if self.init_fw[layer] is None:
            self.init_fw[layer] = tf.tile(
                tf.get_variable("init_state", [1, num_units], tf.float32, tf.zeros_initializer()),
                [batch_size, 1])
        out_fw, state = tf.nn.dynamic_rnn(gru_fw, outputs[-1] * mask_fw, seq_len,
                                          initial_state=self.init_fw[layer], dtype=tf.float32)

      with tf.variable_scope("bw_{}".format(layer)):
        gru_bw = tf.contrib.rnn.GRUCell(num_units)
        if not self.share_dropout:
          mask_bw = dropout(tf.ones([batch_size, 1, input_size_], dtype=tf.float32),
                            keep_prob=keep_prob, is_train=is_train, mode=self.dropout_mode)
        else:
          if self.dropout_mask_bw[layer] is None:
            mask_bw = dropout(tf.ones([batch_size, 1, input_size_], dtype=tf.float32),
                              keep_prob=keep_prob, is_train=is_train, mode=self.dropout_mode)
            self.dropout_mask_bw[layer] = mask_bw
          else:
            mask_bw = self.dropout_mask_bw[layer]
        if self.train_init_state:
          if self.init_bw[layer] is None:
            self.init_bw[layer] = tf.tile(
                tf.get_variable("init_state", [1, num_units], tf.float32, tf.zeros_initializer()),
                [batch_size, 1])
        inputs_bw = tf.reverse_sequence(outputs[-1] * mask_bw,
                                        seq_lengths=seq_len, seq_dim=1, batch_dim=0)
        out_bw, _ = tf.nn.dynamic_rnn(gru_bw, inputs_bw, seq_len,
                                      initial_state=self.init_bw[layer], dtype=tf.float32)
        out_bw = tf.reverse_sequence(out_bw, seq_lengths=seq_len, seq_dim=1, batch_dim=0)

      outputs.append(tf.concat([out_fw, out_bw], axis=2))

  res = tf.concat(outputs[1:], axis=2) if concat_layers else outputs[-1]
  res = encode_outputs(res, seq_len, output_method=output_method)
  return res
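# The mask_fw/mask_bw trick above is variational (recurrent) dropout: one
# [batch, 1, input_size] mask is sampled per sequence and broadcast over time,
# so the same units are dropped at every step. A numpy sketch of that broadcast:
import numpy as np

rng = np.random.default_rng(0)
keep_prob = 0.7
batch, steps, dim = 2, 5, 4
x = np.ones((batch, steps, dim))
mask = (rng.random((batch, 1, dim)) < keep_prob) / keep_prob  # inverted dropout
dropped = x * mask                     # same columns zeroed at every time step
assert (dropped[:, 0] == dropped[:, 1]).all()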
def encode(self, inputs, seq_len, emb=None, concat_layers=True, output_method=OutputMethod.all):
  if emb is not None:
    inputs = tf.nn.embedding_lookup(emb, inputs)

  # CudnnGRU is time-major: [steps, batch, dim]
  outputs = [tf.transpose(inputs, [1, 0, 2])]
  keep_prob = self.keep_prob
  num_units = self.num_units
  is_train = self.is_train

  with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
    for layer in range(self.num_layers):
      input_size_ = melt.get_shape(inputs, -1) if layer == 0 else 2 * num_units
      batch_size = melt.get_batch_size(inputs)

      with tf.variable_scope("fw_{}".format(layer)):
        gru_fw = tf.contrib.cudnn_rnn.CudnnGRU(num_layers=1, num_units=num_units)
        # mode is None: a mask defined this way is already recurrent dropout
        if not self.share_dropout:
          mask_fw = dropout(tf.ones([1, batch_size, input_size_], dtype=tf.float32),
                            keep_prob=keep_prob, is_train=is_train, mode=None)
        else:
          if self.dropout_mask_fw[layer] is None:
            mask_fw = dropout(tf.ones([1, batch_size, input_size_], dtype=tf.float32),
                              keep_prob=keep_prob, is_train=is_train, mode=None)
            self.dropout_mask_fw[layer] = mask_fw
          else:
            mask_fw = self.dropout_mask_fw[layer]
        if self.train_init_state:
          if self.init_fw[layer] is None:
            self.init_fw[layer] = (tf.tile(
                tf.get_variable("init_state", [1, 1, num_units], tf.float32, tf.zeros_initializer()),
                [1, batch_size, 1]),)
        out_fw, state_fw = gru_fw(outputs[-1] * mask_fw, self.init_fw[layer])

      with tf.variable_scope("bw_{}".format(layer)):
        gru_bw = tf.contrib.cudnn_rnn.CudnnGRU(num_layers=1, num_units=num_units)
        if not self.share_dropout:
          mask_bw = dropout(tf.ones([1, batch_size, input_size_], dtype=tf.float32),
                            keep_prob=keep_prob, is_train=is_train, mode=None)
        else:
          if self.dropout_mask_bw[layer] is None:
            mask_bw = dropout(tf.ones([1, batch_size, input_size_], dtype=tf.float32),
                              keep_prob=keep_prob, is_train=is_train, mode=None)
            self.dropout_mask_bw[layer] = mask_bw
          else:
            mask_bw = self.dropout_mask_bw[layer]
        inputs_bw = tf.reverse_sequence(outputs[-1] * mask_bw,
                                        seq_lengths=seq_len, seq_dim=0, batch_dim=1)
        if self.train_init_state:
          if self.init_bw[layer] is None:
            self.init_bw[layer] = (tf.tile(
                tf.get_variable("init_state", [1, 1, num_units], tf.float32, tf.zeros_initializer()),
                [1, batch_size, 1]),)
        out_bw, state_bw = gru_bw(inputs_bw, self.init_bw[layer])
        out_bw = tf.reverse_sequence(out_bw, seq_lengths=seq_len, seq_dim=0, batch_dim=1)

      outputs.append(tf.concat([out_fw, out_bw], axis=2))

  res = tf.concat(outputs[1:], axis=2) if concat_layers else outputs[-1]
  res = tf.transpose(res, [1, 0, 2])  # back to batch-major
  res = encode_outputs(res, output_method=output_method, sequence_length=seq_len)
  self.state = (state_fw, state_bw)
  return res
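# CudnnGRU is time-major, hence the transposes above and reverse_sequence with
# seq_dim=0, batch_dim=1. A numpy model of reverse_sequence: reverse only the
# first seq_len steps of each example, leaving the padding in place.
import numpy as np

def reverse_sequence(x_time_major, seq_lens):
    out = x_time_major.copy()
    for b, n in enumerate(seq_lens):
        out[:n, b] = x_time_major[:n, b][::-1]
    return out

x = np.arange(8).reshape(4, 2, 1)            # [steps=4, batch=2, dim=1]
r = reverse_sequence(x, [3, 4])
assert r[:, 0, 0].tolist() == [4, 2, 0, 6]   # first 3 steps reversed, pad kept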
def generate_sequence_beam_search(self, input, max_words=None,
                                  initial_state=None, attention_states=None,
                                  beam_size=10, convert_unk=True,
                                  length_normalization_factor=0.,
                                  input_text=None, input_text_length=None,
                                  emb=None):
  """
  Outgraph beam search; input should be a single instance (batch_size=1).
  max_words is not actually used here, since the length is determined outside
  the graph.
  Returns the top (path, score).
  TODO: this is hacky. At the first step attention_state, input and state all
  have size 1; afterwards attention_state stays at size 1 while input and
  state have size beam_size (and possibly fewer, if beam_size unfinished
  paths cannot be found).
  """
  if emb is None:
    emb = self.emb

  tf.add_to_collection('beam_search_beam_size', tf.constant(beam_size))

  if input_text is not None:
    if FLAGS.decode_copy:
      input_text = tf.squeeze(input_text)
      input_text_length = tf.to_int32(tf.squeeze(input_text_length))
      input_text = input_text[0:input_text_length]
      input_text, _ = tf.unique(input_text)
      input_text_length = tf.shape(input_text)[-1]
      # sort from small to large
      # input_text, _ = -tf.nn.top_k(-input_text, input_text_length)
      # TODO: maybe keep input_text_length, to allow more out-of-graph decode
      # restrictions, e.g. using a trie
      beam_size = tf.minimum(beam_size, input_text_length)
    elif FLAGS.decode_use_alignment:
      input_text = tf.squeeze(input_text)
      input_text_length = tf.to_int32(tf.squeeze(input_text_length))
      input_text = input_text[0:input_text_length]
      input_text_length = tf.shape(input_text)[-1]
      beam_size = tf.minimum(beam_size, input_text_length)
    else:
      if FLAGS.gen_only:
        input_text = None

  batch_size = melt.get_batch_size(input)

  if attention_states is None:
    cell = self.cell
  else:
    cell = self.prepare_attention(attention_states,
                                  initial_state=initial_state,
                                  score_as_alignment=self.score_as_alignment)
    initial_state = None

  state = cell.zero_state(batch_size, tf.float32) if initial_state is None else initial_state

  # TODO: hard to reuse/share here:
  # "ValueError: Variable seq2seq/main/decode/memory_layer/kernel already exists,
  #  disallowed. Did you mean to set reuse=True in VarScope?"
  # An alternative is to always use tile_batch-ed attention_states and state, so
  # the first step just chooses from the first beam. That still does not fully
  # solve the problem, because the fed data might be smaller than beam_size, so
  # keeping attention states at size 1 is the safe choice.
  # cell2 = self.prepare_attention(tf.contrib.seq2seq.tile_batch(attention_states, beam_size), reuse=True)

  first_state = state

  beam_search_step = functools.partial(self.beam_search_step, beam_size=beam_size)

  # Since the earlier hack uses generate_sequence_greedy, scope.reuse_variables
  # cannot be set before this point. NOTICE: to use an lstm that lives in the
  # .../rnn/ namespace, you must also add this scope so variables are shared.
  with tf.variable_scope(self.scope) as scope:
    inital_attention, initial_state, initial_logprobs, initial_ids = \
        beam_search_step(input, state, cell, input_text=input_text)

    if attention_states is not None:
      tf.add_to_collection('beam_search_initial_alignments',
                           tf.get_collection('attention_alignments')[-1])

    scope.reuse_variables()

    # In inference mode, use concatenated states for convenient feeding and
    # fetching.
    state_is_tuple = len(initial_state) == 2
    if state_is_tuple:
      initial_state = tf.concat(initial_state, 1, name="initial_state")
      state_size = sum(self.cell.state_size)
    else:
      state_size = self.cell.state_size

    # output is used only with attention
    if attention_states is not None:
      initial_state = tf.concat([initial_state, inital_attention], 1,
                                name="initial_attention_state")
      state_size += self.cell.output_size

    tf.add_to_collection('beam_search_initial_state', initial_state)
    tf.add_to_collection('beam_search_initial_logprobs', initial_logprobs)
    tf.add_to_collection('beam_search_initial_ids', initial_ids)

    input_feed = tf.placeholder(dtype=tf.int64,
                                shape=[None],  # batch_size
                                name="input_feed")
    tf.add_to_collection('beam_search_input_feed', input_feed)
    input = tf.nn.embedding_lookup(emb, input_feed)

    # Placeholder for feeding a batch of concatenated states.
    state_feed = tf.placeholder(dtype=tf.float32,
                                shape=[None, state_size],
                                name="state_feed")
    tf.add_to_collection('beam_search_state_feed', state_feed)

    if attention_states is not None:
      state, attention = tf.split(
          state_feed, [state_size - self.cell.output_size, self.cell.output_size], axis=1)
    else:
      state = state_feed

    if state_is_tuple:
      state = tf.split(state, num_or_size_splits=2, axis=1)

    if attention_states is not None:
      state_ = first_state.clone(cell_state=state, attention=attention)
    else:
      state_ = state

    # TODO: not safe if attention_wrapper changes; note the batch size of the
    # attention states is 1 while cell input and state have size beam_size
    # attention, state, top_logprobs, top_ids = beam_search_step(input, state_, cell2)
    if input_text is not None and not FLAGS.decode_copy:
      input_text = tf.contrib.seq2seq.tile_batch(input_text, melt.get_batch_size(input))

    attention, state, top_logprobs, top_ids = beam_search_step(
        input, state_, cell, input_text=input_text)

    if state_is_tuple:
      # Concatenate the resulting state.
      state = tf.concat(state, 1, name="state")
    if attention_states is not None:
      state = tf.concat([state, attention], 1, name="attention_state")

    tf.add_to_collection('beam_search_state', state)
    tf.add_to_collection('beam_search_logprobs', top_logprobs)
    tf.add_to_collection('beam_search_ids', top_ids)

    # same return signature as the in-graph version (path list, score list)
    return tf.no_op(), tf.no_op()
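# Hedged sketch of the out-of-graph driver this graph expects. The collection
# and feed names come from the code above, but the session/vocab wiring and
# the beam bookkeeping are assumed: fetch the initial ids/logprobs/state once,
# then repeatedly feed the chosen words and the concatenated state back in
# through input_feed/state_feed. A real driver would expand and prune beams
# between steps.
def outgraph_beam_drive(sess, steps):
    g = sess.graph
    fetch = lambda name: g.get_collection(name)[0]
    state, logprobs, ids = sess.run([fetch('beam_search_initial_state'),
                                     fetch('beam_search_initial_logprobs'),
                                     fetch('beam_search_initial_ids')])
    for _ in range(steps):  # beam expansion/pruning omitted for brevity
        state, logprobs, ids = sess.run(
            [fetch('beam_search_state'),
             fetch('beam_search_logprobs'),
             fetch('beam_search_ids')],
            feed_dict={fetch('beam_search_input_feed'): ids.reshape([-1]),
                       fetch('beam_search_state_feed'): state})
    return ids, logprobs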
def call(self, x, sequence_length=None, mask_fws=None, mask_bws=None,
         concat_layers=None, output_method=None, training=False):
  concat_layers = concat_layers or self.concat_layers
  # the original assigned to a misspelled `output_mehtod`, so the instance
  # default was never applied
  output_method = output_method or self.output_method

  if self.residual_connect:
    x = self.residual_linear(x)

  outputs = [x]
  keep_prob = self.keep_prob
  num_units = self.num_units
  batch_size = melt.get_batch_size(x)

  if sequence_length is None:
    len_ = melt.get_shape(x, 1)
    sequence_length = tf.ones([batch_size,], dtype=tf.int64) * len_

  for layer in range(self.num_layers):
    input_size_ = melt.get_shape(x, -1) if layer == 0 else 2 * num_units
    gru_fw, gru_bw = self.gru_fws[layer], self.gru_bws[layer]

    if self.train_init_state:
      init_fw = self.init_fw_layer(layer, batch_size)
      if self.cell == 'lstm':
        init_fw = (init_fw, self.init_fw2_layer(layer, batch_size))
    else:
      init_fw = None

    if self.recurrent_dropout:
      if mask_fws is not None:
        mask_fw = mask_fws[layer]
      else:
        if not self.share_dropout:
          mask_fw = dropout(tf.ones([batch_size, 1, input_size_], dtype=tf.float32),
                            keep_prob=keep_prob, training=training, mode=None)
        else:
          if self.dropout_mask_fw[layer] is None or (
              tf.executing_eagerly() and batch_size != self.dropout_mask_fw[layer].shape[0]):
            mask_fw = dropout(tf.ones([batch_size, 1, input_size_], dtype=tf.float32),
                              keep_prob=keep_prob, training=training, mode=None)
            self.dropout_mask_fw[layer] = mask_fw
          else:
            mask_fw = self.dropout_mask_fw[layer]
      inputs_fw = outputs[-1] * mask_fw
    else:
      inputs_fw = dropout(outputs[-1], keep_prob=keep_prob, training=training, mode=None)

    # https://stackoverflow.com/questions/48233400/lstm-initial-state-from-dense-layer
    # gru and lstm differ: lstm needs a tuple (c, h) as its input state
    if self.cell == 'gru':
      out_fw, state_fw = gru_fw(inputs_fw, init_fw)
    else:
      out_fw, state_fw1, state_fw2 = gru_fw(inputs_fw, init_fw)
      state_fw = (state_fw1, state_fw2)

    if self.train_init_state:
      init_bw = self.init_bw_layer(layer, batch_size)
      if self.cell == 'lstm':
        init_bw = (init_bw, self.init_bw2_layer(layer, batch_size))
    else:
      init_bw = None

    if mask_bws is not None:
      mask_bw = mask_bws[layer]
    else:
      if not self.share_dropout:
        mask_bw = dropout(tf.ones([batch_size, 1, input_size_], dtype=tf.float32),
                          keep_prob=keep_prob, training=training, mode=None)
      else:
        if self.dropout_mask_bw[layer] is None or (
            tf.executing_eagerly() and batch_size != self.dropout_mask_bw[layer].shape[0]):
          mask_bw = dropout(tf.ones([batch_size, 1, input_size_], dtype=tf.float32),
                            keep_prob=keep_prob, training=training, mode=None)
          self.dropout_mask_bw[layer] = mask_bw
        else:
          mask_bw = self.dropout_mask_bw[layer]

    if self.recurrent_dropout:
      inputs_bw = outputs[-1] * mask_bw
    else:
      if self.bw_dropout:
        inputs_bw = dropout(outputs[-1], keep_prob=keep_prob, training=training, mode=None)
      else:
        inputs_bw = inputs_fw

    inputs_bw = tf.reverse_sequence(inputs_bw, seq_lengths=sequence_length,
                                    seq_axis=1, batch_axis=0)

    if self.cell == 'gru':
      out_bw, state_bw = gru_bw(inputs_bw, init_bw)
    else:
      out_bw, state_bw1, state_bw2 = gru_bw(inputs_bw, init_bw)
      state_bw = (state_bw1, state_bw2)
    out_bw = tf.reverse_sequence(out_bw, seq_lengths=sequence_length,
                                 seq_axis=1, batch_axis=0)

    outputs.append(tf.concat([out_fw, out_bw], axis=2))
    if self.residual_connect:
      outputs[-1] = self.batch_norm(outputs[-2] + outputs[-1])

  res = tf.concat(outputs[1:], axis=2) if concat_layers else outputs[-1]
  res = encode_outputs(res, output_method=output_method, sequence_length=sequence_length)

  self.state = (state_fw, state_bw)
  return (res, self.state) if self.return_state else res
def sequence_loss(self, sequence, initial_state=None, attention_states=None,
                  input=None, input_text=None,
                  exact_prob=False, exact_loss=False, emb=None):
  """
  For general seq2seq, input is None: sequence will be padded with <GO>, and
  initial_state is the last state from the encoder.
  For img2text/showandtell, input is the image embedding and initial_state is
  None (zero state).
  TODO: exact_prob and exact_loss always carry the same value, so exact_prob
  may be removable.
  NOTICE: sequence is assumed to be zero-padded, and at least one instance
  must have full length (no zeros)!
  """
  if emb is None:
    emb = self.emb

  is_training = self.is_training
  batch_size = melt.get_batch_size(sequence)

  sequence, sequence_length = melt.pad(sequence,
                                       start_id=self.get_start_id(),
                                       end_id=self.get_end_id())

  # [batch_size, num_steps - 1, emb_dim], remove the last column
  inputs = tf.nn.embedding_lookup(emb, sequence[:, :-1])

  if is_training and FLAGS.keep_prob < 1:
    inputs = tf.nn.dropout(inputs, FLAGS.keep_prob)

  # inputs [batch_size, num_steps, emb_dim];
  # input [batch_size, emb_dim] -> [batch_size, 1, emb_dim] before concat
  if input is not None:
    # showandtell style: image_emb is the input, prepended to the sequence
    inputs = tf.concat([tf.expand_dims(input, 1), inputs], 1)
  else:
    # common usage: input is None, sequence is the input;
    # <GO> was already padded by melt.pad above
    sequence_length -= 1
    sequence = sequence[:, 1:]

  if self.is_predict:
    # only needed for predict, since train input is already dynamic length.
    # NOTICE this improves speed a lot.
    num_steps = tf.to_int32(tf.reduce_max(sequence_length))
    sequence = sequence[:, :num_steps]
    inputs = inputs[:, :num_steps, :]

  tf.add_to_collection('sequence', sequence)
  tf.add_to_collection('sequence_length', sequence_length)

  # [batch_size, num_steps]
  targets = sequence

  if attention_states is None:
    cell = self.cell
  else:
    cell = self.prepare_attention(attention_states,
                                  initial_state=initial_state,
                                  score_as_alignment=self.score_as_alignment)
    initial_state = None

  state = cell.zero_state(batch_size, tf.float32) if initial_state is None else initial_state

  if FLAGS.gen_only:
    # gen-only mode.
    # TODO: why can an attention wrapper not use dynamic_rnn when
    # alignments_history=True? pointer_network in application seems fine.
    outputs, state = tf.nn.dynamic_rnn(cell, inputs,
                                       initial_state=state,
                                       sequence_length=sequence_length,
                                       dtype=tf.float32,
                                       scope=self.scope)
    # The decoder below also works but is slower than dynamic_rnn
    # (3.4 batch/s -> 3.1 batch/s):
    # helper = melt.seq2seq.TrainingHelper(inputs, tf.to_int32(sequence_length))
    # my_decoder = melt.seq2seq.BasicTrainingDecoder(
    #     cell=cell, helper=helper, initial_state=state)
    # outputs, state, _ = melt.seq2seq.dynamic_decode(my_decoder, scope=self.scope)
    # outputs = outputs.rnn_output
  else:
    # copy-only or gen-copy
    helper = melt.seq2seq.TrainingHelper(inputs, tf.to_int32(sequence_length))
    indices = melt.batch_values_to_indices(tf.to_int32(input_text))
    if FLAGS.copy_only:
      output_fn = lambda cell_output, cell_state: self.copy_output_fn(
          indices, batch_size, cell_output, cell_state)
    else:
      # gen_copy for now; the switch is not used
      sampled_values = None
      if self.softmax_loss_function is not None:
        sampled_values = tf.nn.log_uniform_candidate_sampler(
            true_classes=tf.reshape(targets, [-1, 1]),
            num_true=1,
            num_sampled=self.num_sampled,
            unique=True,
            range_max=self.vocab_size)
        # TODO: perf of the sampled version is ok, so not changed now; besides
        # sampled_values, sampled_w/sampled_b could also be pre-looked-up
        # embeddings, though the improvement would be small
      output_fn = lambda time, cell_output, cell_state: self.gen_copy_output_train_fn(
          time, indices, targets, sampled_values, batch_size, cell_output, cell_state)

    my_decoder = melt.seq2seq.BasicTrainingDecoder(cell=cell,
                                                   helper=helper,
                                                   initial_state=state,
                                                   vocab_size=self.vocab_size,
                                                   output_fn=output_fn)
    outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(my_decoder, scope=self.scope)

  tf.add_to_collection('outputs', outputs)

  # TODO: hack, add FLAGS.predict_no_sample just for Seq2seqPredictor exact_predict
  softmax_loss_function = self.softmax_loss_function
  if self.is_predict and (exact_prob or exact_loss):
    softmax_loss_function = None

  if not FLAGS.gen_only:
    logits = outputs
    softmax_loss_function = None
  elif softmax_loss_function is not None:
    logits = outputs
  else:
    # [batch_size, num_steps, num_units] * [num_units, vocab_size]
    # -> logits [batch_size, num_steps, vocab_size] (if using exact_predict_loss)
    # or [batch_size * num_steps, vocab_size] by default (flatten=True)
    keep_dims = exact_prob or exact_loss
    logits = melt.batch_matmul_embedding(outputs, self.w, keep_dims=keep_dims) + self.v
    if not keep_dims:
      targets = tf.reshape(targets, [-1])

  tf.add_to_collection('logits', logits)

  mask = tf.cast(tf.sign(targets), dtype=tf.float32)

  if FLAGS.gen_copy_switch:
    # TODO: why does this need more gpu memory? do not save logits, or just
    # calc the loss in output_fn?
    # batch size 256:
    # File ".../melt/seq2seq/loss.py", line 154, in body
    #   step_logits = logits[:, i, :]
    # ResourceExhaustedError: OOM when allocating tensor with shape [256,21,33470]
    num_steps = tf.shape(targets)[1]
    loss = melt.seq2seq.exact_predict_loss(logits, targets, mask, num_steps,
                                           need_softmax=False,
                                           need_average=True,
                                           batch_size=batch_size)
  elif self.is_predict and exact_prob:
    # generate the real probability of the sequence
    # for a 100k-vocab textsum seq2seq: roughly 20 -> 4
    # (num_steps was undefined in this branch in the original; targets keeps
    # its 2-D shape here since keep_dims is True)
    num_steps = tf.shape(targets)[1]
    loss = melt.seq2seq.exact_predict_loss(logits, targets, mask, num_steps,
                                           batch_size=batch_size)
  elif self.is_predict and exact_loss:
    # force non-sampled softmax loss; the difference from exact_prob is that
    # this is just the cross-entropy error, not the real sequence probability.
    # NOTICE: slightly cheaper, 55 vs 57 (prob); same result as exact prob and
    # exact score, and a 256-vocab sample takes only about 10ms
    loss = melt.seq2seq.sequence_loss_by_example(logits, targets, weights=mask)
  else:
    # loss [batch_size,]
    loss = melt.seq2seq.sequence_loss_by_example(logits, targets,
                                                 weights=mask,
                                                 softmax_loss_function=softmax_loss_function)

  # mainly for compatibility with [batch_size, num_losses]
  loss = tf.reshape(loss, [-1, 1])
  if self.is_predict:
    loss = self.normalize_length(loss, sequence_length, exact_prob)
    # loss = tf.squeeze(loss)  TODO: uncomment later once all models are rerun

  return loss
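# Numpy version of the masked loss at the end of sequence_loss: per-step
# cross-entropy where mask = sign(targets) zeroes out the padding (id 0),
# matching sequence_loss_by_example with weights=mask (averaged over the real
# steps of each example).
import numpy as np

def masked_seq_loss(logits, targets):
    # logits: [batch, steps, vocab]; targets: [batch, steps] with 0 = pad
    z = logits - logits.max(-1, keepdims=True)
    logprobs = z - np.log(np.exp(z).sum(-1, keepdims=True))
    nll = -np.take_along_axis(logprobs, targets[..., None], axis=-1)[..., 0]
    mask = np.sign(targets).astype(float)
    return (nll * mask).sum(-1) / np.maximum(mask.sum(-1), 1.)  # [batch]

logits = np.zeros((1, 3, 5)); targets = np.array([[2, 4, 0]])  # last step is pad
print(masked_seq_loss(logits, targets))  # uniform model: log(5) per real step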
def build_predict_text_graph(self, image, decode_method='greedy', beam_size=5,
                             convert_unk=False, length_normalization_factor=None,
                             max_words=None, logprobs_history=False, alignment_history=False):
  scope = tf.get_variable_scope()
  if not FLAGS.showtell_noimage:
    with tf.variable_scope(FLAGS.showtell_encode_scope or scope):
      attention_states, initial_state, image_emb = self._encode(image)
  else:
    image_emb = tf.zeros([melt.get_batch_size(image), self.emb_dim])
    initial_state = None
    attention_states = None

  with tf.variable_scope(FLAGS.showtell_decode_scope or scope):
    # max_words = max_words or TEXT_MAX_WORDS
    max_words = max_words or FLAGS.decoder_max_words
    decode_func = None
    if decode_method == SeqDecodeMethod.greedy:
      decode_func = self.decoder.generate_sequence_greedy
    elif decode_method == SeqDecodeMethod.multinomal:
      decode_func = self.decoder.generate_sequence_multinomial

    if decode_func is not None:
      results = decode_func(image_emb,
                            max_words=max_words,
                            initial_state=initial_state,
                            attention_states=attention_states,
                            convert_unk=convert_unk,
                            need_logprobs=FLAGS.greedy_decode_with_logprobs)
    else:
      if decode_method == SeqDecodeMethod.ingraph_beam:
        decode_func = self.decoder.generate_sequence_ingraph_beam
      elif decode_method == SeqDecodeMethod.outgraph_beam:
        decode_func = self.decoder.generate_sequence_outgraph_beam
      else:
        raise ValueError('not supported decode_method: %s' % decode_method)
      results = decode_func(image_emb,
                            max_words=max_words,
                            initial_state=initial_state,
                            beam_size=beam_size,
                            convert_unk=convert_unk,
                            attention_states=attention_states,
                            length_normalization_factor=length_normalization_factor
                                or FLAGS.length_normalization_factor,
                            logprobs_history=logprobs_history,
                            alignment_history=alignment_history)

    if logprobs_history and self.decoder.log_probs_history is not None:
      tf.add_to_collection('decoder_logprobs_history', self.decoder.log_probs_history)
    if alignment_history and self.decoder.alignment_history is not None:
      tf.add_to_collection('decoder_alignment_history', self.decoder.alignment_history)

  return results
def sequence_loss(self, sequence, initial_state=None, attention_states=None,
                  input=None, input_text=None,
                  exact_prob=False, exact_loss=False, emb=None):
  """
  For general seq2seq, input is None: sequence will be padded with <GO>, and
  initial_state is the last state from the encoder.
  For showandtell, input is the image embedding and initial_state is None
  (zero state); to use im2txt mode set image_as_init_state=True, which does
  the above and needs <GO> padding!
  TODO: exact_prob and exact_loss always carry the same value, so exact_prob
  may be removable.
  NOTICE: sequence is assumed to be zero-padded, and at least one instance
  must have full length (no zeros)!
  """
  if emb is None:
    emb = self.emb

  is_training = self.is_training
  batch_size = melt.get_batch_size(sequence)

  sequence, sequence_length = melt.pad(sequence,
                                       start_id=self.get_start_id(),
                                       end_id=self.get_end_id())

  # [batch_size, num_steps - 1, emb_dim], remove the last column
  inputs = tf.nn.embedding_lookup(emb, sequence[:, :-1])

  if is_training and FLAGS.keep_prob < 1:
    inputs = tf.nn.dropout(inputs, FLAGS.keep_prob)

  # inputs [batch_size, num_steps, emb_dim];
  # input [batch_size, emb_dim] -> [batch_size, 1, emb_dim] before concat
  if input is not None:
    # showandtell style: image_emb is the input, prepended to the sequence
    inputs = tf.concat([tf.expand_dims(input, 1), inputs], 1)
  else:
    # common usage: input is None, sequence is the input;
    # <GO> was already padded by melt.pad above
    sequence_length -= 1
    sequence = sequence[:, 1:]

  if self.is_predict:
    # only needed for predict, since train input is already dynamic length.
    # NOTICE this improves speed a lot.
    num_steps = tf.to_int32(tf.reduce_max(sequence_length))
    sequence = sequence[:, :num_steps]
    inputs = inputs[:, :num_steps, :]

  tf.add_to_collection('sequence', sequence)
  tf.add_to_collection('sequence_length', sequence_length)

  # [batch_size, num_steps]
  targets = sequence

  if attention_states is None:
    cell = self.cell
  else:
    cell = self.prepare_attention(attention_states,
                                  initial_state=initial_state,
                                  score_as_alignment=self.score_as_alignment)
    initial_state = None

  state = cell.zero_state(batch_size, tf.float32) if initial_state is None else initial_state

  # TODO: hack, add FLAGS.predict_no_sample just for Seq2seqPredictor exact_predict
  softmax_loss_function = self.softmax_loss_function
  if self.is_predict and (exact_prob or exact_loss):
    softmax_loss_function = None

  scheduled_sampling_probability = FLAGS.scheduled_sampling_probability if self.is_training else 0.

  if FLAGS.gen_only:
    # gen-only mode.
    # TODO: why can an attention wrapper not use dynamic_rnn when
    # alignments_history=True? pointer_network in application seems fine.
    if scheduled_sampling_probability > 0.:
      helper = melt.seq2seq.ScheduledEmbeddingTrainingHelper(
          inputs, tf.to_int32(sequence_length), emb,
          tf.constant(FLAGS.scheduled_sampling_probability))
      my_decoder = melt.seq2seq.BasicDecoder(cell=cell, helper=helper, initial_state=state)
      outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(my_decoder, scope=self.scope)
      outputs = outputs.rnn_output
    else:
      outputs, state = tf.nn.dynamic_rnn(cell, inputs,
                                         initial_state=state,
                                         sequence_length=sequence_length,
                                         dtype=tf.float32,
                                         scope=self.scope)
      # The decoder below also works but is slower than dynamic_rnn
      # (3.4 batch/s -> 3.1 batch/s):
      # helper = melt.seq2seq.TrainingHelper(inputs, tf.to_int32(sequence_length))
      # my_decoder = melt.seq2seq.BasicTrainingDecoder(
      #     cell=cell, helper=helper, initial_state=state)
      # outputs, state, _ = melt.seq2seq.dynamic_decode(my_decoder, scope=self.scope)
      # outputs = outputs.rnn_output
  else:
    # copy-only or gen-copy
    if scheduled_sampling_probability > 0.:
      # not tested yet TODO
      helper = melt.seq2seq.ScheduledEmbeddingTrainingHelper(
          inputs, tf.to_int32(sequence_length), emb,
          tf.constant(FLAGS.scheduled_sampling_probability))
      Decoder_ = melt.seq2seq.BasicDecoder
    else:
      # as before
      helper = melt.seq2seq.TrainingHelper(inputs, tf.to_int32(sequence_length))
      Decoder_ = melt.seq2seq.BasicTrainingDecoder

    indices = melt.batch_values_to_indices(tf.to_int32(input_text))
    if FLAGS.copy_only:
      output_fn = lambda cell_output, cell_state: self.copy_output_fn(
          indices, batch_size, cell_output, cell_state)
    else:
      # gen_copy for now; the switch is not used. gen_copy plus switch?
      sampled_values = None
      # TODO CHECK: is this ok? why are train and predict unequal while
      # score/exact score are the same? FIXME: first debug why score and exact
      # score are the same; score should match train!
      # sh ./inference/infrence-score.sh to reproduce
      # For safety num_sampled is set to 0 for now; training here may also be
      # incorrect FIXME
      if softmax_loss_function is not None:
        sampled_values = tf.nn.log_uniform_candidate_sampler(
            true_classes=tf.reshape(targets, [-1, 1]),
            num_true=1,
            num_sampled=self.num_sampled,
            unique=True,
            range_max=self.vocab_size)
        # TODO: perf of the sampled version is ok, so not changed now; besides
        # sampled_values, sampled_w/sampled_b could also be pre-looked-up
        # embeddings, though the improvement would be small
      output_fn = lambda time, cell_output, cell_state: self.gen_copy_output_train_fn(
          time, indices, targets, sampled_values, batch_size, cell_output, cell_state)

    my_decoder = Decoder_(cell=cell,
                          helper=helper,
                          initial_state=state,
                          vocab_size=self.vocab_size,
                          output_fn=output_fn)
    outputs, state, _ = tf.contrib.seq2seq.dynamic_decode(my_decoder, scope=self.scope)
    if hasattr(outputs, 'rnn_output'):
      outputs = outputs.rnn_output

  tf.add_to_collection('outputs', outputs)

  if not FLAGS.gen_only:
    logits = outputs
    softmax_loss_function = None
  elif softmax_loss_function is not None:
    logits = outputs
  else:
    # softmax_loss_function is None means num_sampled == 0, exact_loss or exact_prob.
    # [batch_size, num_steps, num_units] * [num_units, vocab_size]
    # -> logits [batch_size, num_steps, vocab_size] (if using exact_predict_loss)
    # or [batch_size * num_steps, vocab_size] by default (flatten=True).
    # Flattening is fine for train [batch_size * num_steps], but bad for eval,
    # where per-instance scores are wanted, and also bad for predict.
    # Only training mode skips keep_dims, which is dangerous: a class that
    # calls rnn_decoder must manually set rnn_decoder.is_training=False,
    # otherwise eval mode shows incorrect scores (the final model is unaffected).
    keep_dims = exact_prob or exact_loss or (not self.is_training)
    logits = melt.batch_matmul_embedding(outputs, self.w, keep_dims=keep_dims) + self.v
    if not keep_dims:
      targets = tf.reshape(targets, [-1])

  tf.add_to_collection('logits', logits)

  mask = tf.cast(tf.sign(targets), dtype=tf.float32)

  if FLAGS.gen_copy_switch and FLAGS.switch_after_softmax:
    # TODO: why does this need more gpu memory? do not save logits, or just
    # calc the loss in output_fn?
    # batch size 256:
    # File ".../melt/seq2seq/loss.py", line 154, in body
    #   step_logits = logits[:, i, :]
    # ResourceExhaustedError: OOM when allocating tensor with shape [256,21,33470]
    num_steps = tf.shape(targets)[1]
    loss = melt.seq2seq.exact_predict_loss(logits, targets, mask, num_steps,
                                           need_softmax=False,
                                           average_across_timesteps=not self.is_predict,
                                           batch_size=batch_size)
  elif self.is_predict and exact_prob:
    # generate the real probability of the sequence
    # for a 100k-vocab textsum seq2seq: roughly 20 -> 4
    # (num_steps was undefined in this branch in the original; targets keeps
    # its 2-D shape here since keep_dims is True)
    num_steps = tf.shape(targets)[1]
    loss = melt.seq2seq.exact_predict_loss(logits, targets, mask, num_steps,
                                           batch_size=batch_size,
                                           average_across_timesteps=False)
  elif self.is_predict and exact_loss:
    # force non-sampled softmax loss; the difference from exact_prob is that
    # this is just the cross-entropy error, not the real sequence probability.
    # NOTICE: slightly cheaper, 55 vs 57 (prob); same result as exact prob and
    # exact score, and a 256-vocab sample takes only about 10ms
    loss = melt.seq2seq.sequence_loss_by_example(logits, targets,
                                                 weights=mask,
                                                 average_across_timesteps=False)
  else:
    # loss [batch_size,]
    loss = melt.seq2seq.sequence_loss_by_example(
        logits, targets,
        weights=mask,
        # train must average, otherwise long sentences get large losses
        average_across_timesteps=not self.is_predict,
        softmax_loss_function=softmax_loss_function)

  # mainly for compatibility with [batch_size, num_losses]; may be
  # [batch_size * num_steps,] when training without exact loss/prob
  loss = tf.reshape(loss, [-1, 1])
  self.ori_loss = loss

  if self.is_predict:
    # use avg_loss so the loss pointer is unchanged; avg_loss is the same as
    # average_across_timesteps=True, i.e. length_normalization_factor=1.0
    avg_loss = self.normalize_length(loss, sequence_length)
    return avg_loss

  # if not is_predict, the loss is already averaged per time step
  return loss
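# normalize_length above divides the summed log-loss by a power of the
# sequence length. A common form is shown below (assumption: melt follows this
# GNMT-style formula), where factor 0 means no normalization and factor 1
# means a per-token average:
import numpy as np

def normalize_length(total_loss, seq_len, factor):
    return total_loss / (np.asarray(seq_len, dtype=float) ** factor)

loss = np.array([10.0, 10.0]); lens = np.array([5, 20])
print(normalize_length(loss, lens, 0.0))  # [10., 10.]  raw sums
print(normalize_length(loss, lens, 1.0))  # [2., 0.5]   per-token average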
def generate_sequence(self, input, max_words,
                      initial_state=None, attention_states=None,
                      convert_unk=True, input_text=None, Helper=None, emb=None):
  """
  Greedy search decoding (or multinomial sampling when a Helper is passed).
  For beam search, use generate_sequence_by_beam_search, which takes
  additional params such as beam_size.
  """
  if emb is None:
    emb = self.emb

  batch_size = melt.get_batch_size(input)

  if attention_states is None:
    cell = self.cell
  else:
    cell = self.prepare_attention(attention_states,
                                  initial_state=initial_state,
                                  score_as_alignment=self.score_as_alignment)
    initial_state = None

  state = cell.zero_state(batch_size, tf.float32) if initial_state is None else initial_state

  need_logprobs = FLAGS.greedy_decode_with_logprobs
  if Helper is None:
    if not need_logprobs:
      helper = melt.seq2seq.GreedyEmbeddingHelper(embedding=emb,
                                                  first_input=input,
                                                  end_token=self.end_id)
    else:
      helper = melt.seq2seq.LogProbsGreedyEmbeddingHelper(embedding=emb,
                                                          first_input=input,
                                                          end_token=self.end_id,
                                                          need_softmax=self.need_softmax)
  else:
    helper = melt.seq2seq.MultinomialEmbeddingHelper(embedding=emb,
                                                     first_input=input,
                                                     end_token=self.end_id,
                                                     need_softmax=self.need_softmax)

  if FLAGS.gen_only:
    output_fn = self.output_fn
  else:
    indices = melt.batch_values_to_indices(tf.to_int32(input_text))
    if FLAGS.copy_only:
      output_fn_ = self.copy_output_fn
    else:
      output_fn_ = self.gen_copy_output_fn
    output_fn = lambda cell_output, cell_state: output_fn_(
        indices, batch_size, cell_output, cell_state)

  Decoder = melt.seq2seq.BasicDecoder if not need_logprobs else melt.seq2seq.LogProbsDecoder
  my_decoder = Decoder(cell=cell,
                       helper=helper,
                       initial_state=state,
                       vocab_size=self.vocab_size,
                       output_fn=output_fn)
  outputs, final_state, sequence_length = melt.seq2seq.dynamic_decode(
      my_decoder,
      maximum_iterations=max_words,
      # MUST be True, otherwise tokens past the done/end token are not zeroed
      # out and get summed up
      impute_finished=True,
      scope=self.scope)

  sequence = outputs.sample_id
  if not hasattr(final_state, 'log_probs'):
    score = tf.zeros([batch_size,])
  else:
    score = self.normalize_length(final_state.log_probs, sequence_length, reshape=False)
    # the below can be verified to give the same result:
    # num_steps = tf.to_int32(tf.reduce_max(sequence_length))
    # score2 = -melt.seq2seq.exact_predict_loss(
    #     outputs.rnn_output, sequence, tf.to_float(tf.sign(sequence)),
    #     num_steps, need_softmax=True, average_across_timesteps=False)
    # score2 = self.normalize_length(score2, sequence_length, reshape=False)
    # score -= score2
    # score = tf.concat([tf.expand_dims(score, 1), outputs.log_probs], 1)
    if FLAGS.predict_use_prob:
      score = tf.exp(score)
    tf.add_to_collection('greedy_log_probs_list', outputs.log_probs)

  # like beam search: return (sequence, score)
  return sequence, score
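# generate_sequence above swaps the greedy argmax for multinomial sampling
# when a Helper is passed; the per-step choice reduces to sampling from the
# softmax instead of taking its max, as this numpy sketch shows:
import numpy as np

rng = np.random.default_rng(0)
logits = np.array([2.0, 1.0, 0.1])
probs = np.exp(logits - logits.max()); probs /= probs.sum()
sampled_id = rng.choice(len(probs), p=probs)   # multinomial helper step
greedy_id = int(np.argmax(probs))              # greedy helper step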
def call(self, inputs, sequence_length, inputs2, sequence_length2,
         mask_fws, mask_bws, concat_layers=True,
         output_method=OutputMethod.all, training=False):
  outputs = [inputs]
  outputs2 = [inputs2]

  keep_prob = self.keep_prob
  num_units = self.num_units
  batch_size = melt.get_batch_size(inputs)

  for layer in range(self.num_layers):
    input_size_ = melt.get_shape(inputs, -1) if layer == 0 else 2 * num_units
    gru_fw, gru_bw = self.gru_fws[layer], self.gru_bws[layer]

    if self.train_init_state:
      init_fw = self.init_fw_layer(layer, batch_size)
    else:
      init_fw = None

    mask_fw = mask_fws[layer]
    out_fw, state_fw = gru_fw(outputs[-1] * mask_fw, init_fw)
    out_fw2, state_fw2 = gru_fw(outputs2[-1] * mask_fw, state_fw)

    mask_bw = mask_bws[layer]
    # the original passed the keyword `sequence_lengthgths`, which is not a
    # tf.reverse_sequence argument; it must be seq_lengths
    inputs_bw = tf.reverse_sequence(outputs[-1] * mask_bw,
                                    seq_lengths=sequence_length,
                                    seq_axis=1, batch_axis=0)
    inputs_bw2 = tf.reverse_sequence(outputs2[-1] * mask_bw,
                                     seq_lengths=sequence_length2,
                                     seq_axis=1, batch_axis=0)

    if self.train_init_state:
      init_bw = self.init_bw_layer(layer, batch_size)
    else:
      init_bw = None

    out_bw, state_bw = gru_bw(inputs_bw, init_bw)
    out_bw2, state_bw2 = gru_bw(inputs_bw2, state_bw)

    outputs.append(tf.concat([out_fw, out_bw], axis=2))
    outputs2.append(tf.concat([out_fw2, out_bw2], axis=2))

  if concat_layers:
    res = tf.concat(outputs[1:], axis=2)
    res2 = tf.concat(outputs2[1:], axis=2)
  else:
    res = outputs[-1]
    res2 = outputs2[-1]  # the original misspelled this as `outpus2`

  res = tf.concat([res, res2], axis=1)
  res = encode_outputs(res, output_method=output_method, sequence_length=sequence_length)

  self.state = (state_fw2, state_bw2)
  return res
def generate_sequence_beam(self, input, max_words,
                           initial_state=None, attention_states=None,
                           beam_size=5, convert_unk=True,
                           length_normalization_factor=0.,
                           input_text=None, input_text_length=None, emb=None):
  """
  Beam decode means in-graph beam search.
  Returns the top (path, score).
  """
  if emb is None:
    emb = self.emb

  def loop_function(i, prev, state, decoder):
    prev, state = decoder.take_step(i, prev, state)
    next_input = tf.nn.embedding_lookup(emb, prev)
    return next_input, state

  batch_size = melt.get_batch_size(input)

  if initial_state is not None:
    initial_state = nest.map_structure(
        lambda x: tf.contrib.seq2seq.tile_batch(x, beam_size), initial_state)

  if attention_states is None:
    cell = self.cell
  else:
    attention_states = tf.contrib.seq2seq.tile_batch(attention_states, beam_size)
    cell = self.prepare_attention(attention_states,
                                  initial_state=initial_state,
                                  score_as_alignment=self.score_as_alignment)
    initial_state = None

  state = cell.zero_state(batch_size * beam_size, tf.float32) \
      if initial_state is None else initial_state

  if FLAGS.gen_only:
    output_fn = self.output_fn
  else:
    input_text = tf.contrib.seq2seq.tile_batch(input_text, beam_size)
    batch_size = batch_size * beam_size
    indices = melt.batch_values_to_indices(tf.to_int32(input_text))
    if FLAGS.copy_only:
      output_fn_ = self.copy_output_fn
    else:
      output_fn_ = self.gen_copy_output_fn
    output_fn = lambda cell_output, cell_state: output_fn_(
        indices, batch_size, cell_output, cell_state)

  # TODO: to be safe, keep topn the same as beam_size
  return melt.seq2seq.beam_decode(input, max_words, state, cell, loop_function,
                                  scope=self.scope,
                                  beam_size=beam_size,
                                  done_token=vocabulary.vocab.end_id(),
                                  output_fn=output_fn,
                                  length_normalization_factor=length_normalization_factor,
                                  topn=beam_size)