Example #1
    def build_graph(self, image_feature, text, neg_text=None):
        image_emb = tf.nn.xw_plus_b(image_feature, self.encode_img_W,
                                    self.encode_img_b)
        pos_loss = self.compute_seq_loss(image_emb, text)

        if neg_text is not None and FLAGS.use_neg:
            neg_losses = []
            num_negs = neg_text.get_shape()[1]
            for i in xrange(num_negs):
                tf.get_variable_scope().reuse_variables()
                neg_text_i = neg_text[:, i, :]
                neg_loss = self.compute_seq_loss(image_emb, neg_text_i)
                neg_losses.append(neg_loss)

            neg_losses = tf.concat(neg_losses, 1)

            #[batch_size, num_neg_text]
            #NOTE: hinge_loss expects pos_score and neg_score, where larger is
            #better, hence the negated sequence losses.
            loss = melt.hinge_loss(-pos_loss, -neg_losses, FLAGS.margin)
            scores = tf.concat([pos_loss, neg_losses], 1)
        else:
            #loss = tf.reduce_mean(pos_loss)
            #use melt.reduce_mean when the input is [batch_size * num_steps, 1]
            #rather than [batch_size, 1]
            loss = melt.reduce_mean(pos_loss)
            scores = tf.concat([pos_loss, pos_loss], 1)

        self.scores = scores
        return loss
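melt is the project's own utility library, so melt.hinge_loss itself is not shown. A minimal sketch of a pairwise hinge rank loss with the same calling convention (an assumption about melt's behavior, not its actual code) could look like:

import tensorflow as tf

def hinge_loss(pos_score, neg_score, margin=0.1):
    # Hypothetical stand-in for melt.hinge_loss.
    # pos_score: [batch_size, 1], neg_score: [batch_size, num_negs];
    # both are larger-is-better, which is why the example above passes
    # the negated sequence losses.
    # Penalize every negative that comes within `margin` of the positive.
    pairwise = tf.maximum(0., margin - pos_score + neg_score)
    return tf.reduce_mean(pairwise)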
Example #2
    def build_graph(self,
                    image_feature,
                    text,
                    neg_text,
                    lookup_negs_once=False):
        """
    Args:
    image_feature: [batch_size, IMAGE_FEATURE_LEN]
    text: [batch_size, MAX_TEXT_LEN]
    neg_text: [batch_size, num_negs, MAXT_TEXT_LEN]
    """
        with tf.variable_scope("image_text_sim"):
            #-------------get image feature
            #[batch_size, hidden_size] <= [batch_size, IMAGE_FEATURE_LEN]
            normed_image_feature = self.forward_image_feature(image_feature)

            #--------------get image text sim as pos score
            #[batch_size, text_MAX_WORDS] -> [batch_size, text_MAX_WORDS, emb_dim] -> [batch_size, emb_dim]
            text_feature = self.gen_text_feature(text, self.emb)
            tf.add_to_collection('text_feature', text_feature)

            pos_score = self.compute_image_text_sim(normed_image_feature,
                                                    text_feature)

            #--------------get image neg texts sim as neg scores
            #[batch_size, num_negs, text_MAX_WORDS, emb_dim] -> [batch_size, num_negs, emb_dim]
            tf.get_variable_scope().reuse_variables()
            if lookup_negs_once:
                neg_text_feature = self.gen_text_feature(neg_text, self.emb)
            neg_scores_list = []

            num_negs = neg_text.get_shape()[1]
            for i in xrange(num_negs):
                if lookup_negs_once:
                    neg_text_feature_i = neg_text_feature[:, i, :]
                else:
                    neg_text_feature_i = self.gen_text_feature(
                        neg_text[:, i, :], self.emb)
                neg_scores_i = self.compute_image_text_sim(
                    normed_image_feature, neg_text_feature_i)
                neg_scores_list.append(neg_scores_i)

            #[batch_size, num_negs]
            neg_scores = tf.concat(neg_scores_list, 1)

            #---------------rank loss
            #[batch_size, 1 + num_negs]
            scores = tf.concat([pos_score, neg_scores], 1)
            #optionally expose probabilities for display:
            #probs = tf.sigmoid(scores)

            if FLAGS.rank_loss == 'hinge':
                loss = melt.hinge_loss(pos_score, neg_scores, FLAGS.margin)
            elif FLAGS.rank_loss == 'cross':
                loss = melt.cross_entropy_loss(scores, num_negs)
            else:
                loss = melt.hinge_cross_loss(pos_score, neg_scores)

            tf.add_to_collection('scores', scores)
        return loss
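melt.cross_entropy_loss is also project code. Since scores is laid out as [pos_score | neg_scores] with the positive in column 0, one plausible reading (an assumption, not melt's actual implementation) is softmax cross entropy against a constant label of 0:

import tensorflow as tf

def cross_entropy_loss(scores, num_negs):
    # Hypothetical stand-in for melt.cross_entropy_loss.
    # scores: [batch_size, 1 + num_negs], positive score in column 0.
    # num_negs is implied by the shape; kept only to mirror the call site.
    batch_size = tf.shape(scores)[0]
    targets = tf.zeros([batch_size], dtype=tf.int32)  # column 0 is "correct"
    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=targets, logits=scores)
    return tf.reduce_mean(losses)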
Example #3
    def build_graph(self,
                    image_feature,
                    text,
                    neg_text,
                    lookup_negs_once=True):
        """
    Args:
    image_feature: [batch_size, IMAGE_FEATURE_LEN]
    text: [batch_size, MAX_TEXT_LEN]
    neg_text: [batch_size, num_negs, MAXT_TEXT_LEN]
    """
        with tf.variable_scope("image_text_sim"):
            #-------------get image feature
            #[batch_size, hidden_size] <= [batch_size, IMAGE_FEATURE_LEN]
            normed_image_feature = self.forward_image_feature(image_feature)

            #--------------get image text sim as pos score
            #[batch_size, text_MAX_WORDS] -> [batch_size, text_MAX_WORDS, emb_dim] -> [batch_size, emb_dim]
            text_feature = self.gen_text_feature(text)
            pos_score = self.compute_image_text_sim(normed_image_feature,
                                                    text_feature)

            #--------------get image neg texts sim as neg scores
            #[batch_size, num_negs, text_MAX_WORDS, emb_dim] -> [batch_size, num_negs, emb_dim]
            tf.get_variable_scope().reuse_variables()
            if lookup_negs_once:
                neg_text_feature = self.gen_text_feature(neg_text)
            neg_scores_list = []

            num_negs = neg_text.get_shape()[1]
            for i in xrange(num_negs):
                if lookup_negs_once:
                    neg_text_feature_i = neg_text_feature[:, i, :]
                else:
                    neg_text_feature_i = self.gen_text_feature(neg_text[:, i, :])
                neg_scores_i = self.compute_image_text_sim(
                    normed_image_feature, neg_text_feature_i)
                neg_scores_list.append(neg_scores_i)

            #[batch_size, num_negs]
            neg_scores = tf.concat(neg_scores_list, 1)

            #---------------rank loss
            #[batch_size, 1 + num_negs]
            scores = tf.concat([pos_score, neg_scores], 1)
            loss = melt.hinge_loss(pos_score, neg_scores, FLAGS.margin)

            self.scores = scores
        return loss
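gen_text_feature is also project code and not shown. Assuming it amounts to an embedding lookup followed by pooling, the lookup_negs_once flag trades one lookup over the whole [batch_size, num_negs, MAX_TEXT_LEN] tensor against num_negs separate lookups. A sketch with made-up sizes showing the two shapes involved:

import tensorflow as tf

# Illustrative sizes only; the real values come from the project config.
vocab_size, emb_dim, num_negs, max_words = 10000, 256, 5, 20

emb = tf.get_variable('emb', [vocab_size, emb_dim])
neg_text = tf.placeholder(tf.int32, [None, num_negs, max_words])

# lookup_negs_once=True: one lookup over the full negative tensor.
all_neg_emb = tf.nn.embedding_lookup(emb, neg_text)  # [batch, num_negs, max_words, emb_dim]
neg_feature = tf.reduce_mean(all_neg_emb, 2)         # [batch, num_negs, emb_dim]

# lookup_negs_once=False: one lookup (and one graph sub-tree) per negative.
per_neg = [tf.reduce_mean(tf.nn.embedding_lookup(emb, neg_text[:, i, :]), 1)
           for i in range(num_negs)]                 # num_negs x [batch, emb_dim]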
Example #4
    def build_graph(self,
                    image_feature,
                    text,
                    neg_text=None,
                    exact_loss=False):
        image_emb = self.build_image_embeddings(image_feature)

        pos_loss = self.decoder.sequence_loss(text,
                                              input=image_emb,
                                              exact_loss=exact_loss)

        loss = None
        scores = None
        if neg_text is not None and (FLAGS.use_neg or FLAGS.show_neg):
            neg_losses = []
            num_negs = neg_text.get_shape()[1]
            for i in xrange(num_negs):
                tf.get_variable_scope().reuse_variables()
                neg_text_i = neg_text[:, i, :]
                neg_loss = self.decoder.sequence_loss(neg_text_i,
                                                      input=image_emb,
                                                      exact_loss=exact_loss)
                neg_losses.append(neg_loss)

            neg_losses = tf.concat(neg_losses, 1)

            if FLAGS.use_neg:
                #neg_losses [batch_size, num_neg_text]
                loss = melt.hinge_loss(-pos_loss, -neg_losses, FLAGS.margin)

            scores = tf.concat([pos_loss, neg_losses], 1)

        if loss is None:
            if not self.is_predict:
                loss = tf.reduce_mean(pos_loss)
            else:
                loss = pos_loss

            if scores is None:
                if neg_text is not None:
                    scores = tf.concat([pos_loss, pos_loss], 1)
                else:
                    scores = pos_loss

        if not self.is_training and not self.is_predict:  #evaluate mode
            tf.add_to_collection('scores', scores)
        return loss
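As in Example #1, sequence losses are smaller-is-better while the hinge loss wants larger-is-better scores, so the losses are negated before ranking. A tiny standalone check of that trick, under the same pairwise-hinge assumption sketched after Example #1 (made-up numbers):

import tensorflow as tf

pos_loss = tf.constant([[0.5], [1.2]])              # [batch_size, 1]
neg_losses = tf.constant([[0.9, 0.4], [2.0, 1.5]])  # [batch_size, num_negs]
margin = 0.1

# Negating turns losses into scores: margin - (-pos) + (-neg) = margin + pos - neg.
loss = tf.reduce_mean(tf.maximum(0., margin - (-pos_loss) + (-neg_losses)))

with tf.Session() as sess:
    # Only the pair (pos_loss 0.5 vs neg_loss 0.4) violates the margin:
    # 0.1 + 0.5 - 0.4 = 0.2, so the mean over the four pairs is 0.05.
    print(sess.run(loss))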