import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# cfg, func_attention, and cosine_similarity are assumed to be provided by the
# surrounding module (in AttnGAN: `from miscc.config import cfg` plus helpers
# defined in the same file).


def words_loss(img_features, words_emb, labels, cap_lens, class_ids, batch_size):
    """
    words_emb (query): batch x nef x seq_len
    img_features (context): batch x nef x 17 x 17
    """
    masks = []
    att_maps = []
    similarities = []
    cap_lens = cap_lens.data.tolist()
    for i in range(batch_size):
        if class_ids is not None:
            # Mask captions whose image shares a class with image i
            # (they are not true mismatches)
            mask = (class_ids == class_ids[i]).astype(np.uint8)
            mask[i] = 0
            masks.append(mask.reshape((1, -1)))
        # Get the i-th text description
        words_num = cap_lens[i]
        word = words_emb[i, :, :words_num].unsqueeze(0).contiguous()  # 1 x nef x words_num
        word = word.repeat(batch_size, 1, 1)  # batch_size x nef x words_num
        context = img_features  # batch_size x nef x 17 x 17
        # weiContext: per-word representation built from attended image regions
        weiContext, attn = func_attention(word, context, cfg.TRAIN.SMOOTH.GAMMA1)
        att_maps.append(attn[i].unsqueeze(0).contiguous())
        word = word.transpose(1, 2).contiguous()
        weiContext = weiContext.transpose(1, 2).contiguous()
        word = word.view(batch_size * words_num, -1)
        weiContext = weiContext.view(batch_size * words_num, -1)
        row_sim = cosine_similarity(word, weiContext)
        row_sim = row_sim.view(batch_size, words_num)
        # Log-sum-exp pooling over words (Eq. (10) of the DAMSM loss)
        row_sim.mul_(cfg.TRAIN.SMOOTH.GAMMA2).exp_()
        row_sim = row_sim.sum(dim=1, keepdim=True)
        row_sim = torch.log(row_sim)
        similarities.append(row_sim)
    similarities = torch.cat(similarities, 1)
    if class_ids is not None:
        masks = np.concatenate(masks, 0)
        masks = torch.ByteTensor(masks)
        if cfg.CUDA:
            masks = masks.cuda()
    similarities = similarities * cfg.TRAIN.SMOOTH.GAMMA3
    if class_ids is not None:
        similarities.data.masked_fill_(masks, -float('inf'))
    similarities1 = similarities.transpose(0, 1)
    if labels is not None:
        loss0 = nn.CrossEntropyLoss()(similarities, labels)
        loss1 = nn.CrossEntropyLoss()(similarities1, labels)
    else:
        loss0, loss1 = None, None
    return loss0, loss1, att_maps
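
# A minimal, self-contained sketch of the log-sum-exp aggregation that
# words_loss applies per image-caption pair (Eq. (10) of the DAMSM loss in
# AttnGAN). The shapes and gamma value are illustrative assumptions, and the
# attention step is replaced by a pre-attended random tensor so the example
# runs on its own.
def _demo_word_similarity_aggregation():
    batch_size, nef, words_num = 4, 256, 12
    gamma2 = 5.0  # assumed stand-in for cfg.TRAIN.SMOOTH.GAMMA2
    word = torch.randn(batch_size * words_num, nef)         # word embeddings
    wei_context = torch.randn(batch_size * words_num, nef)  # attended regions
    # Cosine similarity between each word and its attended region context
    row_sim = F.cosine_similarity(word, wei_context, dim=1)
    row_sim = row_sim.view(batch_size, words_num)
    # Log-sum-exp pooling over words gives one matching score per caption
    return torch.log(torch.exp(row_sim * gamma2).sum(dim=1, keepdim=True))
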
def word_level_correlation(img_features, words_emb, cap_lens, batch_size, class_ids, labels):
    masks = []
    cap_lens = cap_lens.data.tolist()
    similar_list = []
    for i in range(batch_size):
        if class_ids is not None:
            # Same-class masks (collected but not used in this variant)
            mask = (class_ids == class_ids[i]).astype(np.uint8)
            mask[i] = 0
            masks.append(mask.reshape((1, -1)))
        words_num = cap_lens[i]
        word = words_emb[i, :, :words_num].unsqueeze(0).contiguous()  # 1 x nef x words_num
        context = img_features[i, :, :, :].unsqueeze(0).contiguous()  # 1 x nef x 17 x 17
        weiContext, attn = func_attention(word, context, cfg.TRAIN.SMOOTH.GAMMA1)
        # Self-attention over words: weight each word by its similarity
        # to the caption's mean embedding
        aver = torch.mean(word, 2)         # 1 x nef
        averT = aver.unsqueeze(1)          # 1 x 1 x nef
        res_word = torch.bmm(averT, word)  # 1 x 1 x words_num
        res_softmax = F.softmax(res_word, 2)
        res_softmax = res_softmax.repeat(1, weiContext.size(1), 1)
        self_weiContext = weiContext * res_softmax
        word = word.transpose(1, 2).contiguous()
        self_weiContext = self_weiContext.transpose(1, 2).contiguous()
        word = word.view(words_num, -1)
        self_weiContext = self_weiContext.view(words_num, -1)
        row_sim = cosine_similarity(word, self_weiContext)
        row_sim = row_sim.view(1, words_num)
        row_sim.mul_(cfg.TRAIN.SMOOTH.GAMMA2).exp_()
        row_sim = row_sim.sum(dim=1, keepdim=True)
        row_sim = torch.log(row_sim)
        similar_list.append(torch.sigmoid(row_sim[0, 0]))
    # torch.stack (rather than torch.tensor with requires_grad=False) keeps the
    # graph intact so the BCE loss can backpropagate into the encoders
    similar_list = torch.stack(similar_list).cuda()
    result = nn.BCELoss()(similar_list, labels)
    return result
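
# A minimal sketch of the word self-attention step above: scores come from the
# dot product between each word and the caption's mean embedding, and the
# resulting softmax weights rescale the attended region features. Shapes are
# illustrative assumptions.
def _demo_word_self_attention():
    nef, words_num = 256, 12
    word = torch.randn(1, nef, words_num)          # 1 x nef x words_num
    wei_context = torch.randn(1, nef, words_num)   # attended region features
    aver = torch.mean(word, 2)                     # 1 x nef caption mean
    res_word = torch.bmm(aver.unsqueeze(1), word)  # 1 x 1 x words_num scores
    res_softmax = F.softmax(res_word, dim=2)       # normalize over words
    # Broadcast the per-word weights across the feature dimension
    self_wei_context = wei_context * res_softmax.repeat(1, nef, 1)
    return self_wei_context                        # 1 x nef x words_num
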
def words_loss(self, img_features, word_embs, cls_labels, img_cap_pair_label):
    # img_features (local features of the image): BATCH x D_HIDDEN x 17 x 17
    # word_embs: BATCH x D_HIDDEN x CAP_LEN
    masks = get_class_masks(cls_labels)
    att_maps = []
    similarities = []
    batch_size = img_features.size(0)
    for i in range(batch_size):
        # i-th caption, repeated for every image in the batch
        words = word_embs[i].unsqueeze(0).contiguous()  # -> 1 x D_HIDDEN x CAP_LEN
        words = words.repeat(batch_size, 1, 1)          # -> BATCH x D_HIDDEN x CAP_LEN
        # region_context: word representation by image regions
        region_context, att_map = func_attention(words, img_features, GAMMA_1)
        att_maps.append(att_map[i].unsqueeze(0).contiguous())
        # -> (BATCH * CAP_LEN) x D_HIDDEN
        words = words.transpose(1, 2).contiguous().view(
            batch_size * CAP_MAX_LEN, -1)
        region_context = region_context.transpose(1, 2).contiguous().view(
            batch_size * CAP_MAX_LEN, -1)
        # Eq. (10): log-sum-exp aggregation of word-level similarities
        sim = cos_sim(words, region_context).view(batch_size, CAP_MAX_LEN)
        sim.mul_(GAMMA_2).exp_()
        sim = sim.sum(dim=1, keepdim=True)
        sim = torch.log(sim)
        # similarities[j, i]: similarity between the j-th image and the i-th caption
        similarities.append(sim)
    similarities = torch.cat(similarities, 1)  # -> BATCH x BATCH
    masks = masks.view(batch_size, batch_size).contiguous().to(self.device)
    similarities = similarities * GAMMA_3
    similarities.data.masked_fill_(masks, -float('inf'))
    similarities2 = similarities.transpose(0, 1)
    loss1 = nn.CrossEntropyLoss()(similarities, img_cap_pair_label)
    loss2 = nn.CrossEntropyLoss()(similarities2, img_cap_pair_label)
    return loss1, loss2, att_maps
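
# A minimal sketch of the final matching step above: a BATCH x BATCH similarity
# matrix scored with CrossEntropyLoss in both directions, with same-class
# mismatched pairs masked to -inf so they drop out of the softmax. The random
# scores, class labels, and gamma value are illustrative assumptions.
def _demo_bidirectional_matching_loss():
    batch_size, gamma3 = 4, 10.0
    similarities = torch.randn(batch_size, batch_size) * gamma3
    cls = torch.tensor([0, 1, 1, 2])
    # Mask pairs that share a class but are not the ground-truth diagonal
    mask = cls.unsqueeze(0) == cls.unsqueeze(1)
    mask.fill_diagonal_(False)
    similarities = similarities.masked_fill(mask, -float('inf'))
    labels = torch.arange(batch_size)  # the i-th image matches the i-th caption
    loss_i2t = nn.CrossEntropyLoss()(similarities, labels)      # image -> text
    loss_t2i = nn.CrossEntropyLoss()(similarities.t(), labels)  # text -> image
    return loss_i2t + loss_t2i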