def word_similarity(img_features, words_emb, words_num):
    """Compute the attention-weighted word/image similarity for one caption.

    Args:
        img_features: image region features, 1 x nef x H x W (context).
        words_emb: word embeddings, 1 x nef x seq_len (query).
        words_num: number of valid words in the caption.

    Returns:
        row_sim: 1 x 1 tensor, negative log of the summed exp-scaled
            per-word similarities (Eq. (10) of the AttnGAN paper), so a
            lower value means a better match.
        att_maps: 1 x words_num x H x W attention map.
    """
    # -> 1 x nef x words_num
    word = words_emb[0, :, :words_num].unsqueeze(0).contiguous()
    # 1 x nef x H*W
    context = img_features
    """
        word(query): 1 x nef x words_num
        context: 1 x nef x H x W
        weiContext: 1 x nef x words_num
        attn: 1 x words_num x H x W
    """
    # NOTE(review): the sibling functions words_loss/words_similarity read
    # the smoothing factors from cfg.TRAIN.SMOOTH.*; the flat cfg.GAMMA1 /
    # cfg.GAMMA2 used here would raise AttributeError with the standard
    # AttnGAN config, so the nested path is used for consistency.
    weiContext, attn = func_attention(word, context, cfg.TRAIN.SMOOTH.GAMMA1)
    att_maps = attn[0].unsqueeze(0).contiguous()
    # --> batch_size x words_num x nef
    word = word.transpose(1, 2).contiguous()
    weiContext = weiContext.transpose(1, 2).contiguous()
    # --> batch_size*words_num x nef
    word = word.view(1 * words_num, -1)
    weiContext = weiContext.view(1 * words_num, -1)
    # --> batch_size*words_num
    row_sim = cosine_similarity(word, weiContext)
    # --> batch_size x words_num
    row_sim = row_sim.view(1, words_num)
    # Eq. (10): log-sum-exp over words, scaled by GAMMA2
    row_sim.mul_(cfg.TRAIN.SMOOTH.GAMMA2).exp_()
    row_sim = row_sim.sum(dim=1, keepdim=True)
    # Negated (unlike words_loss, which keeps +log for the batch softmax).
    row_sim = -torch.log(row_sim)
    return row_sim, att_maps
def words_loss(img_features, words_emb, labels, cap_lens, class_ids, batch_size):
    """DAMSM word-level attention loss (Eqs. (10)-(11) of the AttnGAN paper).

    Args:
        img_features: image region features, batch x nef x 17 x 17 (context).
        words_emb: word embeddings, batch x nef x seq_len (query).
        labels: class-index targets for CrossEntropyLoss, or None.
        cap_lens: per-caption word counts (tensor-like with .data).
        class_ids: numpy array of class ids, or None; other same-class
            pairs are masked out of the softmax (they are valid matches,
            so they must not act as negatives).
        batch_size: number of image/caption pairs.

    Returns:
        (loss0, loss1, att_maps): image-to-text and text-to-image losses
        (both None when labels is None) and the list of attention maps.
    """
    masks = []
    att_maps = []
    similarities = []
    cap_lens = cap_lens.data.tolist()
    for i in range(batch_size):
        if class_ids is not None:
            # Mask out other captions of the same class; keep pair i itself.
            mask = (class_ids == class_ids[i]).astype(np.uint8)
            mask[i] = 0
            masks.append(mask.reshape((1, -1)))
        # Get the i-th text description
        words_num = cap_lens[i]
        # -> 1 x nef x words_num
        word = words_emb[i, :, :words_num].unsqueeze(0).contiguous()
        # -> batch_size x nef x words_num (match caption i against every image)
        word = word.repeat(batch_size, 1, 1)
        # batch x nef x 17*17
        context = img_features
        """
            word(query): batch x nef x words_num
            context: batch x nef x 17 x 17
            weiContext: batch x nef x words_num
            attn: batch x words_num x 17 x 17
        """
        weiContext, attn = func_attention(word, context, cfg.TRAIN.SMOOTH.GAMMA1)
        att_maps.append(attn[i].unsqueeze(0).contiguous())
        # --> batch_size x words_num x nef
        word = word.transpose(1, 2).contiguous()
        weiContext = weiContext.transpose(1, 2).contiguous()
        # --> batch_size*words_num x nef
        word = word.view(batch_size * words_num, -1)
        weiContext = weiContext.view(batch_size * words_num, -1)
        # --> batch_size*words_num
        row_sim = cosine_similarity(word, weiContext)
        # --> batch_size x words_num
        row_sim = row_sim.view(batch_size, words_num)
        # Eq. (10): log-sum-exp over words, scaled by GAMMA2
        row_sim.mul_(cfg.TRAIN.SMOOTH.GAMMA2).exp_()
        row_sim = row_sim.sum(dim=1, keepdim=True)
        row_sim = torch.log(row_sim)
        # --> 1 x batch_size
        # similarities(i, j): the similarity between the i-th image and
        # the j-th text description
        similarities.append(row_sim)
    # batch_size x batch_size
    similarities = torch.cat(similarities, 1)
    if class_ids is not None:
        masks = np.concatenate(masks, 0)
        # masks: batch_size x batch_size
        # masked_fill_ requires a bool mask; ByteTensor masks are deprecated
        # and rejected by recent PyTorch releases.
        masks = torch.from_numpy(masks).bool()
        if cfg.CUDA:
            masks = masks.cuda()
    similarities = similarities * cfg.TRAIN.SMOOTH.GAMMA3
    if class_ids is not None:
        similarities.data.masked_fill_(masks, -float('inf'))
    similarities1 = similarities.transpose(0, 1)
    if labels is not None:
        loss0 = nn.CrossEntropyLoss()(similarities, labels)
        loss1 = nn.CrossEntropyLoss()(similarities1, labels)
    else:
        loss0, loss1 = None, None
    return loss0, loss1, att_maps
def words_similarity(img_features, words_emb, labels, cap_lens, class_ids, batch_size):
    """Compute the DAMSM word-level losses and report batch similarities.

    Same similarity computation as words_loss, but without class-id masking,
    returning scalar loss values for evaluation/logging.

    Args:
        img_features: image region features, batch x nef x 17 x 17 (context).
        words_emb: word embeddings, batch x nef x seq_len (query).
        labels: class-index targets for CrossEntropyLoss; must not be None.
        cap_lens: per-caption word counts (tensor-like with .data).
        class_ids: unused here; kept for signature parity with words_loss.
        batch_size: number of image/caption pairs.

    Returns:
        [w_loss, loss0, loss1] as Python floats, where
        w_loss = (loss0 + loss1) * cfg.TRAIN.SMOOTH.LAMBDA.

    Raises:
        ValueError: if labels is None (the original code crashed with an
            opaque TypeError on ``None + None`` in that case).
    """
    att_maps = []
    similarities = []
    cap_lens = cap_lens.data.tolist()
    for i in range(batch_size):
        # Get the i-th text description
        words_num = cap_lens[i]
        # -> 1 x nef x words_num
        word = words_emb[i, :, :words_num].unsqueeze(0).contiguous()
        # -> batch_size x nef x words_num (match caption i against every image)
        word = word.repeat(batch_size, 1, 1)
        # batch x nef x 17*17
        context = img_features
        """
            word(query): batch x nef x words_num
            context: batch x nef x 17 x 17
            weiContext: batch x nef x words_num
            attn: batch x words_num x 17 x 17
        """
        weiContext, attn = func_attention(word, context, cfg.TRAIN.SMOOTH.GAMMA1)
        att_maps.append(attn[i].unsqueeze(0).contiguous())
        # --> batch_size x words_num x nef
        word = word.transpose(1, 2).contiguous()
        weiContext = weiContext.transpose(1, 2).contiguous()
        # --> batch_size*words_num x nef
        word = word.view(batch_size * words_num, -1)
        weiContext = weiContext.view(batch_size * words_num, -1)
        # --> batch_size*words_num
        row_sim = cosine_similarity(word, weiContext)
        # --> batch_size x words_num
        row_sim = row_sim.view(batch_size, words_num)
        # Eq. (10): log-sum-exp over words, scaled by GAMMA2
        row_sim.mul_(cfg.TRAIN.SMOOTH.GAMMA2).exp_()
        row_sim = row_sim.sum(dim=1, keepdim=True)
        row_sim = torch.log(row_sim)
        # --> 1 x batch_size
        # similarities(i, j): the similarity between the i-th image and
        # the j-th text description
        similarities.append(row_sim)
    # batch_size x batch_size
    similarities = torch.cat(similarities, 1)
    similarities = similarities * cfg.TRAIN.SMOOTH.GAMMA3
    similarities1 = similarities.transpose(0, 1)
    if labels is None:
        # The original fell through to (None + None) here; fail loudly
        # with a clear message instead.
        raise ValueError('words_similarity requires labels to compute the loss')
    loss0 = nn.CrossEntropyLoss()(similarities, labels)
    loss1 = nn.CrossEntropyLoss()(similarities1, labels)
    w_loss = (loss0 + loss1) * cfg.TRAIN.SMOOTH.LAMBDA
    print('w_loss = ', w_loss.item(), loss0.item(), loss1.item())
    # Average similarity of each caption over the batch of images.
    words_sim = similarities.detach().cpu().numpy()
    avg_sim = np.mean(words_sim, axis=0)
    print('similarities average(batch): ', avg_sim)
    return [w_loss.item(), loss0.item(), loss1.item()]