Example #1
import numpy as np
import torch
import torch.nn as nn

# `cfg`, `func_attention`, and `cosine_similarity` are assumed to come from
# the surrounding module (the snippet does not show its imports).
def words_loss(img_features, words_emb, labels, cap_lens, class_ids,
               batch_size):
    """
        words_emb(query): batch x nef x seq_len
        img_features(context): batch x nef x 17 x 17
    """
    masks = []
    att_maps = []
    similarities = []
    cap_lens = cap_lens.data.tolist()
    for i in range(batch_size):
        if class_ids is not None:
            # Mask samples that share the i-th sample's class so they are not
            # treated as mismatched (negative) pairs.
            mask = (class_ids == class_ids[i]).astype(np.uint8)
            mask[i] = 0
            masks.append(mask.reshape((1, -1)))
        # Get the i-th text description
        words_num = cap_lens[i]
        word = words_emb[i, :, :words_num].unsqueeze(0).contiguous()
        # -> batch_size x nef x words_num: pair the i-th caption with every image
        word = word.repeat(batch_size, 1, 1)
        context = img_features

        weiContext, attn = func_attention(word, context,
                                          cfg.TRAIN.SMOOTH.GAMMA1)
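        # weiContext: each word represented as an attention-weighted sum of the
        # 17 x 17 region features (batch x nef x words_num); attn holds the
        # corresponding word-to-region attention maps.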

        att_maps.append(attn[i].unsqueeze(0).contiguous())
        word = word.transpose(1, 2).contiguous()
        weiContext = weiContext.transpose(1, 2).contiguous()

        word = word.view(batch_size * words_num, -1)
        weiContext = weiContext.view(batch_size * words_num, -1)
        # cosine similarity between each word and its attention-weighted
        # region context (Eq. (10) in the AttnGAN paper)
        row_sim = cosine_similarity(word, weiContext)
        row_sim = row_sim.view(batch_size, words_num)

        # GAMMA2-sharpened log-sum-exp pooling of the per-word similarities
        # into a single image-caption score
        row_sim.mul_(cfg.TRAIN.SMOOTH.GAMMA2).exp_()
        row_sim = row_sim.sum(dim=1, keepdim=True)
        row_sim = torch.log(row_sim)

        similarities.append(row_sim)

    similarities = torch.cat(similarities, 1)
    if class_ids is not None:
        masks = np.concatenate(masks, 0)
        # masked_fill_ expects a bool mask in recent PyTorch versions
        masks = torch.from_numpy(masks).bool()
        if cfg.CUDA:
            masks = masks.cuda()

    similarities = similarities * cfg.TRAIN.SMOOTH.GAMMA3

    if class_ids is not None:
        # exclude same-class pairs; fill without going through .data so the
        # operation stays visible to autograd
        similarities = similarities.masked_fill(masks, -float('inf'))
    similarities1 = similarities.transpose(0, 1)

    if labels is not None:
        loss0 = nn.CrossEntropyLoss()(similarities, labels)
        loss1 = nn.CrossEntropyLoss()(similarities1, labels)
    else:
        loss0, loss1 = None, None
    return loss0, loss1, att_maps
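Each loop iteration pools the word-region cosine similarities into one score per image-caption pair, and the GAMMA3-scaled batch x batch score matrix is then used as logits for a symmetric cross-entropy. A minimal call sketch under assumed shapes (nef = 256, a 17 x 17 region grid, captions padded to 12 tokens; the real dimensions and the cfg values come from the repo's configuration, not from this snippet):

batch_size = 4
img_features = torch.randn(batch_size, 256, 17, 17)  # local image features
words_emb = torch.randn(batch_size, 256, 12)         # word embeddings
labels = torch.arange(batch_size)                    # diagonal pairs match
cap_lens = torch.tensor([12, 10, 9, 7])              # true caption lengths
class_ids = np.array([0, 1, 2, 3])                   # one class per sample
loss0, loss1, att_maps = words_loss(img_features, words_emb, labels,
                                    cap_lens, class_ids, batch_size)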
Example #2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# As in Example #1, `cfg`, `func_attention`, and `cosine_similarity` are
# assumed to be provided by the surrounding module.
def word_level_correlation(img_features, words_emb, cap_lens, batch_size,
                           class_ids, labels):
    # img_features: batch x nef x 17 x 17; words_emb: batch x nef x seq_len
    masks = []  # built below but never consumed in this variant
    cap_lens = cap_lens.data.tolist()
    similar_list = []
    for i in range(batch_size):
        if class_ids is not None:
            mask = (class_ids == class_ids[i]).astype(np.uint8)
            mask[i] = 0
            masks.append(mask.reshape((1, -1)))

        words_num = cap_lens[i]
        word = words_emb[i, :, :words_num].unsqueeze(0).contiguous()

        context = img_features[i, :, :, :].unsqueeze(0).contiguous()

        weiContext, attn = func_attention(word, context,
                                          cfg.TRAIN.SMOOTH.GAMMA1)

        # Self-attention over words: the mean word embedding scores each word,
        # and a softmax turns the scores into per-word weights
        aver = torch.mean(word, 2)         # 1 x nef
        averT = aver.unsqueeze(1)          # 1 x 1 x nef
        res_word = torch.bmm(averT, word)  # 1 x 1 x words_num
        res_softmax = F.softmax(res_word, 2)

        res_softmax = res_softmax.repeat(1, weiContext.size(1), 1)

        self_weiContext = weiContext * res_softmax

        word = word.transpose(1, 2).contiguous()
        self_weiContext = self_weiContext.transpose(1, 2).contiguous()
        word = word.view(words_num, -1)
        self_weiContext = self_weiContext.view(words_num, -1)

        row_sim = cosine_similarity(word, self_weiContext)
        row_sim = row_sim.view(1, words_num)

        row_sim.mul_(cfg.TRAIN.SMOOTH.GAMMA2).exp_()
        row_sim = row_sim.sum(dim=1, keepdim=True)
        row_sim = torch.log(row_sim)
        similar_list.append(torch.sigmoid(row_sim[0, 0]))  # F.sigmoid is deprecated

    # torch.stack keeps the per-sample scores attached to the autograd graph;
    # wrapping them in torch.tensor(..., requires_grad=False) would detach
    # them, so no gradient would reach the encoders through this loss.
    similar_list = torch.stack(similar_list)
    if cfg.CUDA:
        similar_list = similar_list.cuda()
    result = nn.BCELoss()(similar_list, labels)

    return result
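A matching sketch for this variant; here labels are per-pair targets for BCELoss, so they must be floats in [0, 1] (all values below are assumptions for illustration):

batch_size = 4
img_features = torch.randn(batch_size, 256, 17, 17)
words_emb = torch.randn(batch_size, 256, 12)
cap_lens = torch.tensor([12, 10, 9, 7])
class_ids = np.array([0, 1, 2, 3])
labels = torch.ones(batch_size)  # 1.0 = matched image-caption pair
loss = word_level_correlation(img_features, words_emb, cap_lens,
                              batch_size, class_ids, labels)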
Example #3
    # GAMMA_1/GAMMA_2/GAMMA_3, CAP_MAX_LEN, get_class_masks, func_attention
    # and cos_sim are assumed to be module-level definitions in this file.
    def words_loss(self, img_features, word_embs, cls_labels,
                   img_cap_pair_label):
        # img_features (local features of the image): BATCH x D_HIDDEN x 17 x 17
        # word_embs: BATCH x D_HIDDEN x CAP_LEN

        masks = get_class_masks(cls_labels)
        att_maps = []
        similarities = []

        batch_size = img_features.size(0)

        for i in range(batch_size):
            words = word_embs[i].unsqueeze(
                0).contiguous()  # -> 1 x D_HIDDEN x CAP_LEN
            words = words.repeat(batch_size, 1,
                                 1)  # -> BATCH x D_HIDDEN x CAP_LEN
            # region_context: word representation by image regions
            region_context, att_map = func_attention(words, img_features,
                                                     GAMMA_1)
            att_maps.append(att_map[i].unsqueeze(0).contiguous())
            # -> BATCH * CAP_MAX_LEN x D_HIDDEN (this variant flattens with the
            # padded max length rather than the true caption lengths)
            words = words.transpose(1, 2).contiguous().view(
                batch_size * CAP_MAX_LEN, -1)
            region_context = region_context.transpose(1, 2).contiguous().view(
                batch_size * CAP_MAX_LEN, -1)

            # Eq. (10)
            sim = cos_sim(words, region_context).view(batch_size, CAP_MAX_LEN)
            sim.mul_(GAMMA_2).exp_()
            sim = sim.sum(dim=1, keepdim=True)
            sim = torch.log(sim)
            # similarities(i, j): the similarity between the i-th image and the j-th text description
            similarities.append(sim)

        similarities = torch.cat(similarities, 1)  # -> BATCH x BATCH
        masks = masks.view(batch_size, batch_size).contiguous()
        # recent PyTorch requires a bool mask for masked_fill_
        masks = masks.to(self.device).bool()

        similarities = similarities * GAMMA_3
        similarities.masked_fill_(masks, -float('inf'))

        similarities2 = similarities.transpose(0, 1)
        loss1 = nn.CrossEntropyLoss()(similarities, img_cap_pair_label)
        loss2 = nn.CrossEntropyLoss()(similarities2, img_cap_pair_label)

        return loss1, loss2, att_maps
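Example #3 calls get_class_masks without defining it. A plausible implementation by analogy with the per-sample masks built in Examples #1 and #2 (an assumption, not the original helper):

def get_class_masks(cls_labels):
    # True where two different samples share a class: those pairs are removed
    # from the negatives, mirroring the mask logic of the first two examples.
    cls_labels = torch.as_tensor(cls_labels)
    masks = cls_labels.unsqueeze(0) == cls_labels.unsqueeze(1)  # BATCH x BATCH
    masks.fill_diagonal_(False)  # keep the matched (diagonal) pairs as targets
    return masks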