Code Example #1
File: dual_textsim.py  Project: tangqiqi123/hasky
def forword(self, ltext, rtext):
    assert not FLAGS.rtext_bow
    # encode both sides with a shared encoder: concatenate, encode once, then split
    text = tf.concat([ltext, rtext], 0)
    text_feature = self.encode(text)
    ltext_feature, rtext_feature = tf.split(text_feature, 2, 0)
    # only the left text goes through the MLP layers before normalization
    ltext_feature = self.mlp_layers(ltext_feature)
    ltext_feature = normalize(ltext_feature)
    rtext_feature = normalize(rtext_feature)
    return ltext_feature, rtext_feature
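All of the examples in this file call a normalize helper whose definition is not shown. Judging from the comments in Code Example #5 ("You can just normalize features using L2 ..."), it is presumably a plain L2 normalization over the feature axis; a minimal sketch under that assumption:

import tensorflow as tf

def normalize(feature):
    # Assumed sketch of the normalize() helper used in these examples: scale each
    # feature vector to unit L2 norm so that cosine similarity becomes a dot product.
    # axis=-1 handles both [batch_size, dim] and [batch_size, num_negs, dim] inputs.
    return tf.nn.l2_normalize(feature, axis=-1)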
Code Example #2
def rforward(self, text):
    """
    Args:
      text: batch text [batch_size, max_text_len]
    """
    text_feature = self.encode(text)
    text_feature = normalize(text_feature)
    return text_feature
Code Example #3
def forward_image_feature(self, image_feature):
    """
    Args:
      image_feature: batch image feature [batch_size, image_feature_len]
    """
    image_feature = self.forward_image_layers(image_feature)

    # for pointwise loss, comment out the normalization below
    image_feature = normalize(image_feature)

    return image_feature
Code Example #4
File: dual_textsim.py  Project: tangqiqi123/hasky
def rforward(self, text):
    """
    Args:
      text: batch text [batch_size, max_text_len]
    """
    if not FLAGS.rtext_bow:
        text_feature = self.encode(text)
    else:
        # bag-of-words encoding of the right text when FLAGS.rtext_bow is set
        text_feature = bow_encoder.encode(text, self.emb)
    text_feature = normalize(text_feature)
    return text_feature
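When FLAGS.rtext_bow is set, the right text is encoded with bow_encoder.encode(text, self.emb) instead of the recurrent encoder. The bow_encoder module is not shown here; assuming it is a standard bag-of-words encoder (embedding lookup followed by masked average pooling), a sketch might look like:

import tensorflow as tf

def bow_encode(text_ids, embedding, pad_id=0):
    # Hypothetical sketch of a bag-of-words encoder; bow_encoder's real implementation may differ.
    emb = tf.nn.embedding_lookup(embedding, text_ids)               # [batch, max_len, emb_dim]
    mask = tf.cast(tf.not_equal(text_ids, pad_id), emb.dtype)       # [batch, max_len], 0 for padding
    summed = tf.reduce_sum(emb * tf.expand_dims(mask, -1), axis=1)  # sum of word embeddings
    count = tf.maximum(tf.reduce_sum(mask, axis=1, keepdims=True), 1.0)
    return summed / count                                           # mean over non-padding words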
Code Example #5
File: dual_textsim.py  Project: tangqiqi123/hasky
def lforward(self, text):
    """
    Args:
      text: batch text [batch_size, max_text_len]
    """
    text_feature = self.encode(text)
    text_feature = self.mlp_layers(text_feature)
    # Without normalization the feature values can get large, so the sigmoid saturates (e.g. 72 -> 1).
    # if not FLAGS.loss == 'cross':
    # Contrastive loss works with or without normalization; for simplicity it could be skipped here. See
    # https://www.quora.com/When-training-siamese-networks-how-does-one-determine-the-margin-for-contrastive-loss-How-do-you-convert-this-loss-to-accuracy
    # You can just L2-normalize the features before using contrastive loss; the margin can then
    # stay constant during training because the distance between features is bounded.
    # if not FLAGS.loss == 'contrastive':
    text_feature = normalize(text_feature)
    return text_feature
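The comments in Code Example #5 point out that L2-normalizing the features lets a contrastive loss use a constant margin, because the distance between unit-norm vectors is bounded. For illustration only (the project's actual loss code is not shown here), a standard contrastive loss over normalized pairs could be written as:

import tensorflow as tf

def contrastive_loss(left_feature, right_feature, labels, margin=1.0):
    # left_feature / right_feature: [batch_size, dim], already L2-normalized, so the
    # pairwise Euclidean distance is at most 2 and a fixed margin stays meaningful.
    # labels: 1.0 for matching (ltext, rtext) pairs, 0.0 for mismatched pairs.
    distance = tf.sqrt(tf.reduce_sum(tf.square(left_feature - right_feature), axis=-1) + 1e-12)
    positive_term = labels * tf.square(distance)                                    # pull matches together
    negative_term = (1.0 - labels) * tf.square(tf.maximum(margin - distance, 0.0))  # push non-matches apart
    return tf.reduce_mean(positive_term + negative_term)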
Code Example #6
def forward_text_feature(self, text_feature):
    text_feature = self.forward_text_layers(text_feature)
    # for pointwise loss, comment out the normalization below
    # must be -1, not 1, since num_negs might be > 1 if everything is looked up once
    text_feature = normalize(text_feature)
    return text_feature
Code Example #7
  def build_graph(self, image_feature, text, 
                  neg_image_feature=None, neg_text=None, 
                  exact_prob=False, exact_loss=False,
                  weights=None):
    
    scope = tf.get_variable_scope()
    if not FLAGS.showtell_noimage:
      with tf.variable_scope(FLAGS.showtell_encode_scope or scope):
        attention_states, initial_state, image_emb = self.encode(image_feature)
        if image_emb is not None:
          assert not FLAGS.add_text_start, 'if the image embedding is used as input, do not pad the start mark before the sentence'
        else:
          assert FLAGS.add_text_start, 'if the image embedding is not used as input, the start mark must be padded before the sentence'
    else:
      print('Language only mode!', file=sys.stderr)
      image_emb = tf.zeros([melt.get_batch_size(text), self.emb_dim])
      initial_state = None
      attention_states = None

    with tf.variable_scope(FLAGS.showtell_decode_scope or scope):
      # the start mark will be padded in decoder.sequence_loss if FLAGS.image_as_init_state is set
      scores = self.decoder.sequence_loss(text,
                                          input=image_emb, 
                                          initial_state=initial_state, 
                                          attention_states=attention_states, 
                                          exact_prob=exact_prob,
                                          exact_loss=exact_loss,
                                          vocab_weights=self.idf_weights if self.is_training else None,
                                          weights=weights if self.is_training else None) 

      loss = scores 

      if FLAGS.reinforcement_learning and self.is_training:
        assert not FLAGS.image_as_init_state, 'im2txt-style image_as_init_state is not supported with reinforcement_learning yet (not tested)'
        assert self.rl, 'need to set rl for reinforcement_learning'
        tf.get_variable_scope().reuse_variables()
        max_words = TEXT_MAX_WORDS 
        convert_unk = True
        # code borrowed from https://github.com/arieling/SelfCriticalSequenceTraining-tensorflow
        # scores is the sequence loss (negative log probability)
        sampled_caption, sampled_loss = self.decoder.generate_sequence_multinomial(image_emb, 
                                          max_words=max_words, 
                                          #max_words=16,
                                          initial_state=initial_state,
                                          attention_states=attention_states,
                                          convert_unk=convert_unk,
                                          #length_normalization_factor=0.,
                                          need_logprobs=True)  

        self.rl.sampled_caption = sampled_caption

        greedy_caption, _ = self.decoder.generate_sequence_greedy(image_emb, 
                                          max_words=max_words,
                                          #max_words=20, 
                                          initial_state=initial_state,
                                          attention_states=attention_states,
                                          convert_unk=convert_unk,
                                          need_logprobs=False)

        self.rl.greedy_caption = greedy_caption

        ratio = FLAGS.reinforcement_ratio
        
        # loss and sampled_loss must have the same shape for this: [batch_size] or [batch_size * text_length]
        loss = ratio * (self.rl.rewards_feed - self.rl.baseline_feed) * sampled_loss + (1 - ratio) * loss

        #loss = -loss

      if not self.is_predict:
        loss = tf.reduce_mean(loss)

      #if not self.is_training and not self.is_predict: #evaluate mode
      if self.is_training:
        tf.add_to_collection('train_scores', scores)
      elif not self.is_predict:
        tf.add_to_collection('eval_scores', scores)

      if FLAGS.discriminant_loss_ratio > 0 and self.is_training:
        assert neg_text is not None
        tf.get_variable_scope().reuse_variables()
        max_words = TEXT_MAX_WORDS 
        convert_unk = True
        greedy_caption, _ = self.decoder.generate_sequence_greedy(image_emb, 
                                  max_words=max_words,
                                  #max_words=20, 
                                  initial_state=initial_state,
                                  attention_states=attention_states,
                                  convert_unk=convert_unk,
                                  need_logprobs=False)
        text_feature = self.encoder2.encode(text, self.emb)
        text_feature = normalize(text_feature)
        # neg_text = neg_text[:, 0, :]
        # neg_text_feature = self.encoder2.encode(neg_text, self.emb)
        # neg_text_feature = normalize(neg_text_feature)
        caption_feature = self.encoder2.encode(greedy_caption, self.emb)
        caption_feature = normalize(caption_feature)
        pos_score = compute_sim(caption_feature, text_feature)
        # neg_score = compute_sim(caption_feature, neg_text_feature)
        tf.add_to_collection('pos_score', pos_score)
        # tf.add_to_collection('neg_score', neg_score)
        # discriminant_loss = pairwise_loss(pos_score, neg_score)
        discriminant_loss = tf.reduce_mean((1. - pos_score) / 2.)
        # TODO: this is a mean loss, so the already-reduced loss can be used and then discriminant_loss * ratio added
        tf.add_to_collection('discriminant_loss', discriminant_loss)
        ratio = FLAGS.discriminant_loss_ratio
        tf.add_to_collection('gen_loss', loss)
        loss += ratio * discriminant_loss 

      if FLAGS.alignment_history and self.is_training:
        alignment_history = self.decoder.alignment_history
        tf.add_to_collection('alignment_history', alignment_history)

        if FLAGS.alignment_loss_ratio > 0: 
          lengths = self.decoder.final_sequence_lengths
          alignment_loss = self.calc_alignment_loss(alignment_history, lengths)
          tf.add_to_collection('alignment_loss', alignment_loss)
          # alignment_loss might be around 4.1
          ratio = FLAGS.alignment_loss_ratio
          #loss = (1 - ratio) * loss + ratio * alignment_loss
          loss += ratio * alignment_loss 

    self.main_loss = loss

    if self.is_predict:
      loss = tf.squeeze(loss)

    return loss
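Code Example #7 blends the regular sequence loss with a self-critical REINFORCE term: the sampled caption's log-probability loss is weighted by how much its reward beats the greedy-decoded baseline, then mixed with the original loss via FLAGS.reinforcement_ratio. A simplified sketch of that combination (the names below are illustrative, not the project's API; rewards_feed and baseline_feed are fed from outside the graph in the original):

def mix_self_critical_loss(xent_loss, sampled_logprob_loss, sampled_reward, greedy_reward, ratio):
    # sampled_logprob_loss is -log p(sampled caption). Captions that beat the greedy
    # baseline (sampled_reward > greedy_reward) have their probability pushed up,
    # worse ones are pushed down; the greedy caption acts as the reward baseline.
    rl_term = (sampled_reward - greedy_reward) * sampled_logprob_loss
    return ratio * rl_term + (1.0 - ratio) * xent_loss

The discriminant term operates on normalized encodings, so compute_sim is presumably cosine similarity, i.e. a row-wise dot product; (1. - pos_score) / 2. then maps similarity 1 to loss 0 and similarity -1 to loss 1, rewarding greedy captions whose encoding matches the ground-truth text. A minimal sketch under that assumption:

import tensorflow as tf

def compute_sim(a, b):
    # Assumed sketch: with L2-normalized features, cosine similarity is a row-wise dot product in [-1, 1].
    return tf.reduce_sum(a * b, axis=-1)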