def forward(self, ltext, rtext):
  """
  Args:
    ltext: batch left text [batch_size, max_text_len]
    rtext: batch right text [batch_size, max_text_len]
  """
  assert not FLAGS.rtext_bow
  text = tf.concat([ltext, rtext], 0)
  text_feature = self.encode(text)
  ltext_feature, rtext_feature = tf.split(text_feature, 2, 0)
  ltext_feature = self.mlp_layers(ltext_feature)
  ltext_feature = normalize(ltext_feature)
  rtext_feature = normalize(rtext_feature)
  return ltext_feature, rtext_feature

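# For reference, a minimal sketch of what the `normalize` helper used throughout this file
# is assumed to do: L2 normalization along the last axis, so that cosine similarity between
# two normalized features reduces to a dot product. The name `_l2_normalize_sketch` is
# illustrative only; the repo's own `normalize` may differ in signature and details.
def _l2_normalize_sketch(feature, axis=-1, epsilon=1e-12):
  # tf.nn.l2_normalize divides each vector by sqrt(max(sum(x**2), epsilon)) along `axis`.
  return tf.nn.l2_normalize(feature, axis=axis, epsilon=epsilon)
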
def rforward(self, text):
  """
  Args:
    text: batch text [batch_size, max_text_len]
  """
  text_feature = self.encode(text)
  text_feature = normalize(text_feature)
  return text_feature

def forward_image_feature(self, image_feature):
  """
  Args:
    image_feature: batch image feature [batch_size, image_feature_len]
  """
  image_feature = self.forward_image_layers(image_feature)
  # For pointwise mode, comment out the normalize below.
  image_feature = normalize(image_feature)
  return image_feature

def rforward(self, text):
  """
  Args:
    text: batch text [batch_size, max_text_len]
  """
  if not FLAGS.rtext_bow:
    text_feature = self.encode(text)
  else:
    text_feature = bow_encoder.encode(text, self.emb)
  text_feature = normalize(text_feature)
  return text_feature

def lforward(self, text):
  """
  Args:
    text: batch text [batch_size, max_text_len]
  """
  text_feature = self.encode(text)
  text_feature = self.mlp_layers(text_feature)
  # Without normalization the features can take large values, and sigmoid then saturates
  # (e.g. 72 -> 1). Contrastive loss works whether or not the features are normalized:
  # L2-normalizing the features before the contrastive loss lets the margin stay constant
  # during training, because the distance between features is then bounded.
  # See https://www.quora.com/When-training-siamese-networks-how-does-one-determine-the-margin-for-contrastive-loss-How-do-you-convert-this-loss-to-accuracy
  text_feature = normalize(text_feature)
  return text_feature

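# The note above mentions contrastive loss over L2-normalized features with a constant
# margin. A minimal sketch of that formulation, for reference only: the helper name and
# margin value are illustrative, not the loss actually selected via FLAGS.loss.
def _contrastive_loss_sketch(lfeature, rfeature, labels, margin=0.5):
  # Features are assumed L2-normalized, so the Euclidean distance is bounded
  # (d^2 = 2 - 2 * cos_sim) and a fixed margin is meaningful.
  d = tf.sqrt(tf.reduce_sum(tf.square(lfeature - rfeature), axis=-1) + 1e-12)
  labels = tf.cast(labels, tf.float32)  # 1 for matching pairs, 0 for non-matching
  pos_term = labels * tf.square(d)
  neg_term = (1. - labels) * tf.square(tf.maximum(margin - d, 0.))
  return tf.reduce_mean(pos_term + neg_term) / 2.
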
def forward_text_feature(self, text_feature):
  text_feature = self.forward_text_layers(text_feature)
  # For pointwise mode, comment out the normalize below.
  # Must normalize along axis -1, not 1, since num_negs may be > 1 when looked up at once.
  text_feature = normalize(text_feature)
  return text_feature

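# `compute_sim`, used in build_graph below, is assumed to be a cosine similarity; because
# both inputs are already L2-normalized it reduces to a per-example dot product. A minimal
# sketch (the name `_cosine_sim_sketch` is illustrative, not the repo's actual helper):
def _cosine_sim_sketch(feature_a, feature_b):
  # [batch_size, dim] x [batch_size, dim] -> [batch_size]
  return tf.reduce_sum(feature_a * feature_b, axis=-1)
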
def build_graph(self, image_feature, text, neg_image_feature=None, neg_text=None,
                exact_prob=False, exact_loss=False, weights=None):
  """Build the show-and-tell training/eval graph and return the loss.

  Args:
    image_feature: batch image feature [batch_size, image_feature_len]
    text: batch text [batch_size, max_text_len]
  """
  scope = tf.get_variable_scope()
  if not FLAGS.showtell_noimage:
    with tf.variable_scope(FLAGS.showtell_encode_scope or scope):
      attention_states, initial_state, image_emb = self.encode(image_feature)
      if image_emb is not None:
        assert not FLAGS.add_text_start, 'if the image emb is used as input, must not pad a start mark before the sentence'
      else:
        assert FLAGS.add_text_start, 'if the image emb is not used as input, must pad a start mark before the sentence'
  else:
    print('Language only mode!', file=sys.stderr)
    image_emb = tf.zeros([melt.get_batch_size(text), self.emb_dim])
    initial_state = None
    attention_states = None

  with tf.variable_scope(FLAGS.showtell_decode_scope or scope):
    # decoder.sequence_loss will pad the start mark if FLAGS.image_as_init_state
    scores = self.decoder.sequence_loss(text,
                                        input=image_emb,
                                        initial_state=initial_state,
                                        attention_states=attention_states,
                                        exact_prob=exact_prob,
                                        exact_loss=exact_loss,
                                        vocab_weights=self.idf_weights if self.is_training else None,
                                        weights=weights if self.is_training else None)
    loss = scores

    if FLAGS.reinforcement_learning and self.is_training:
      assert not FLAGS.image_as_init_state, 'im2txt style is not supported for reinforcement_learning yet, not tested!'
      assert self.rl, 'need to set rl for reinforcement_learning'
      tf.get_variable_scope().reuse_variables()
      max_words = TEXT_MAX_WORDS
      convert_unk = True
      # code borrowed from https://github.com/arieling/SelfCriticalSequenceTraining-tensorflow
      # scores is -(negative log loss)
      sampled_caption, sampled_loss = self.decoder.generate_sequence_multinomial(
          image_emb,
          max_words=max_words,
          initial_state=initial_state,
          attention_states=attention_states,
          convert_unk=convert_unk,
          need_logprobs=True)
      self.rl.sampled_caption = sampled_caption

      greedy_caption, _ = self.decoder.generate_sequence_greedy(
          image_emb,
          max_words=max_words,
          initial_state=initial_state,
          attention_states=attention_states,
          convert_unk=convert_unk,
          need_logprobs=False)
      self.rl.greedy_caption = greedy_caption

      ratio = FLAGS.reinforcement_ratio
      # loss and sampled_loss must have the same shape: [batch_size] or [batch_size * text_length]
      loss = ratio * (self.rl.rewards_feed - self.rl.baseline_feed) * sampled_loss + (1. - ratio) * loss

    if not self.is_predict:
      loss = tf.reduce_mean(loss)

  if self.is_training:
    tf.add_to_collection('train_scores', scores)
  elif not self.is_predict:
    # evaluate mode
    tf.add_to_collection('eval_scores', scores)

  if FLAGS.discriminant_loss_ratio > 0 and self.is_training:
    assert neg_text is not None
    tf.get_variable_scope().reuse_variables()
    max_words = TEXT_MAX_WORDS
    convert_unk = True
    greedy_caption, _ = self.decoder.generate_sequence_greedy(
        image_emb,
        max_words=max_words,
        initial_state=initial_state,
        attention_states=attention_states,
        convert_unk=convert_unk,
        need_logprobs=False)
    text_feature = self.encoder2.encode(text, self.emb)
    text_feature = normalize(text_feature)
    # neg_text = neg_text[:, 0, :]
    # neg_text_feature = self.encoder2.encode(neg_text, self.emb)
    # neg_text_feature = normalize(neg_text_feature)
    caption_feature = self.encoder2.encode(greedy_caption, self.emb)
    caption_feature = normalize(caption_feature)
    pos_score = compute_sim(caption_feature, text_feature)
    # neg_score = compute_sim(caption_feature, neg_text_feature)
    tf.add_to_collection('pos_score', pos_score)
    # tf.add_to_collection('neg_score', neg_score)
    # discriminant_loss = pairwise_loss(pos_score, neg_score)
    discriminant_loss = tf.reduce_mean((1. - pos_score) / 2.)
    # TODO: discriminant_loss is already a mean, so it can be added to the reduced loss as discriminant_loss * ratio
    tf.add_to_collection('discriminant_loss', discriminant_loss)
    ratio = FLAGS.discriminant_loss_ratio
    tf.add_to_collection('gen_loss', loss)
    loss += ratio * discriminant_loss

  if FLAGS.alignment_history and self.is_training:
    alignment_history = self.decoder.alignment_history
    tf.add_to_collection('alignment_history', alignment_history)

    if FLAGS.alignment_loss_ratio > 0:
      lengths = self.decoder.final_sequence_lengths
      alignment_loss = self.calc_alignment_loss(alignment_history, lengths)
      tf.add_to_collection('alignment_loss', alignment_loss)
      # alignment_loss might be around 4.1
      ratio = FLAGS.alignment_loss_ratio
      # loss = (1 - ratio) * loss + ratio * alignment_loss
      loss += ratio * alignment_loss

  self.main_loss = loss

  if self.is_predict:
    loss = tf.squeeze(loss)

  return loss
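
# In the reinforcement_learning branch of build_graph, `self.rl.rewards_feed` and
# `self.rl.baseline_feed` are fed from outside the graph. A minimal sketch of the
# self-critical setup that the borrowed SCST code implies, under the assumptions that the
# two feeds are placeholders and that `sentence_score_fn` (e.g. CIDEr against the
# ground-truth captions) is provided by the caller; none of the names below are part of
# this file.
def _self_critical_feeds_sketch(sess, model, feed_dict, ground_truths, sentence_score_fn):
  # Run the graph once to obtain the sampled and greedy captions for this batch.
  sampled, greedy = sess.run([model.rl.sampled_caption, model.rl.greedy_caption],
                             feed_dict=feed_dict)
  # Reward = score of the sampled caption; baseline = score of the greedy caption,
  # so the advantage (rewards - baseline) is the self-critical signal.
  rewards = [sentence_score_fn(s, gt) for s, gt in zip(sampled, ground_truths)]
  baseline = [sentence_score_fn(g, gt) for g, gt in zip(greedy, ground_truths)]
  feed_dict[model.rl.rewards_feed] = rewards
  feed_dict[model.rl.baseline_feed] = baseline
  return feed_dict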