def test(args, split, modelfn=None, decoder=None, encoder=None):
    """Run inference on ``split`` and score the generated captions.

    The models are either the ``encoder``/``decoder`` pair passed in, or are
    rebuilt and restored from the checkpoint file ``modelfn``. When no
    checkpoint is named and the model pair is incomplete, ``best_model.ckpt``
    inside ``args.model_path`` is loaded.

    Args:
        args: Run configuration. ``args.model_path``, ``args.numwords`` and
            ``args.vocab_len`` are updated in place.
        split: Name of the data split handed to ``get_loader`` and
            ``language_eval`` (e.g. ``val`` or ``test``).
        modelfn: Optional checkpoint path holding ``decoder_state_dict`` and
            ``encoder_state_dict``.
        decoder: Optional already-loaded caption decoder.
        encoder: Optional already-loaded image encoder.

    Returns:
        The value returned by ``language_eval`` for the predictions.
    """
    # Device configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Find model directory. The suffixes are added only while the caption
    # model name is still missing, so calling test() repeatedly with the
    # same ``args`` does not keep growing the path.
    if args.caption_model not in args.model_path:
        args.model_path += "_" + args.caption_model
        if args.finetune_cnn:
            args.model_path += "_finetune"

    # Fall back to the best checkpoint only when the caller supplied neither
    # a checkpoint file nor a complete encoder/decoder pair.
    if modelfn is None and (encoder is None or decoder is None):
        modelfn = os.path.join(args.model_path, 'best_model.ckpt')

    # Load vocabulary
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point args.vocab_path at trusted vocabulary files.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args, vocab, split, shuffle=False)

    max_tokens = args.max_tokens
    args.numwords = data_loader.dataset.numwords
    args.vocab_len = len(vocab)
    idx2word = vocab.idx2word

    num_batches = len(data_loader)
    print(('[DEBUG] Running inference on %s with %d batches' %
           (split.upper(), num_batches)))

    # Load model
    if modelfn is not None:
        print(('[INFO] Loading checkpoint %s' % modelfn))
        encoder = ResNetFeats(args)
        decoder = models.setup(args)
        encoder.to(device)
        decoder.to(device)
        checkpoint = torch.load(modelfn, map_location=device)
        decoder.load_state_dict(checkpoint['decoder_state_dict'])
        encoder.load_state_dict(checkpoint['encoder_state_dict'])

    encoder.eval()
    decoder.eval()

    pred_captions = []
    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for current_batch in tqdm(data_loader):
            images, captions, _, _, _, img_ids, _ = current_batch
            images = images.to(device)
            # The final batch may hold fewer samples than args.batch_size.
            batch_size = images.size(0)

            if args.caption_model in ("lstm", "transformer"):
                features = encoder(images)
                if args.caption_model == "lstm":
                    sentence_ids = decoder.sample(features)
                else:
                    sentence_ids = decoder.evaluate(features, args.max_tokens)
                sentence_ids = sentence_ids.cpu().numpy()

                # Convert word_ids to words
                for j in range(batch_size):
                    sampled_caption = []
                    for word_id in sentence_ids[j]:
                        word = idx2word[word_id]
                        if word == '<end>':
                            break
                        sampled_caption.append(word)
                    # The first generated token is dropped (presumably the
                    # start marker -- confirm against the decoder).
                    pred_captions.append({
                        'image_id': img_ids[j],
                        'caption': ' '.join(sampled_caption[1:]),
                        "gt_caption": captions[j]
                    })

            elif args.caption_model == "convcap":
                imgsfeats, imgsfc7 = encoder(images)

                # Greedy decoding: the feed starts with '<start>' and every
                # step writes its best word into the next position.
                wordclass_feed = np.zeros((batch_size, max_tokens),
                                          dtype='int64')
                wordclass_feed[:, 0] = vocab('<start>')
                outcaps = [[] for _ in range(batch_size)]

                for j in range(max_tokens - 1):
                    wordclass = torch.from_numpy(wordclass_feed).to(device)
                    wordact, _ = decoder(imgsfeats, imgsfc7, wordclass)
                    wordact = wordact[:, :, :-1]
                    # batch_size*max_token_len-1, vocab_len
                    wordact_t = wordact.permute(0, 2, 1).contiguous().view(
                        batch_size * (max_tokens - 1), -1)
                    wordprobs = F.softmax(wordact_t, dim=1).cpu().numpy()
                    wordids = np.argmax(wordprobs, axis=1)

                    for k in range(batch_size):
                        # Row of sample k at decoding step j.
                        word_id = wordids[j + k * (max_tokens - 1)]
                        outcaps[k].append(idx2word[word_id])
                        wordclass_feed[k, j + 1] = word_id

                for j in range(batch_size):
                    num_words = len(outcaps[j])
                    if '<end>' in outcaps[j]:
                        num_words = outcaps[j].index('<end>')
                    outcap = ' '.join(outcaps[j][:num_words])
                    pred_captions.append({
                        'image_id': img_ids[j],
                        'caption': outcap,
                        "gt_caption": captions[j]
                    })

    print(pred_captions[0:2])

    # Calculate scores
    scores = language_eval(args, pred_captions, args.model_path, split)

    if args.vis:
        print("[INFO] visualizing...")
        vis_folder = args.model_path.replace("models", "vis") + "_" + "_".join(
            os.path.basename(args.caption_path).split("_")[-2:])
        target = os.path.join(vis_folder, "imgs")
        if not os.path.exists(target):
            os.makedirs(target)
        data = data_loader.dataset.data

        # In order to save space the images are not copied; each prediction
        # records the original image location instead.
        for pred in pred_captions:
            filename = data.loadImgs(pred["image_id"])[0]['filename']
            path = filename.replace(".jpg", ".png")
            # need absolute path
            pred["img_path"] = os.path.join(args.image_root, split, path)

        with open(os.path.join(vis_folder, "vis.json"), "w") as f:
            json.dump(pred_captions, f)

    # Hand the models back in training mode.
    encoder.train()
    decoder.train()

    return scores
def test(args, split, modelfn=None, model_convcap=None, model_imgcnn=None):
    """Runs test on split=val/test with checkpoint file modelfn or loaded model_*

    Args:
        args: Run configuration; ``coco_root``, ``nthreads``, ``batchsize``,
            ``num_layers``, ``attention`` and ``model_dir`` are read.
        split: Data split passed to ``coco_loader`` and ``language_eval``.
        modelfn: Optional checkpoint path with ``state_dict`` and
            ``img_state_dict`` entries. When given, fresh models are built
            and restored from it.
        model_convcap: Already-loaded caption model, used when ``modelfn``
            is None.
        model_imgcnn: Already-loaded image model, used when ``modelfn`` is
            None.

    Returns:
        The value returned by ``language_eval`` for the predictions.

    Raises:
        ValueError: If neither a checkpoint nor both models are supplied.
    """
    t_start = time.time()
    data = coco_loader(args.coco_root, split=split, ncap_per_img=1)
    print('[DEBUG] Loading %s data ... %f secs' % (split, time.time() - t_start))

    data_loader = DataLoader(dataset=data, num_workers=args.nthreads,
                             batch_size=args.batchsize, shuffle=False,
                             drop_last=True)

    batchsize = args.batchsize
    max_tokens = data.max_tokens
    # drop_last=True: only complete batches are produced.
    num_batches = len(data.ids) // batchsize
    print('[DEBUG] Running inference on %s with %d batches' % (split, num_batches))

    if modelfn is not None:
        model_imgcnn = Resnet101Feats()
        model_imgcnn.cuda()

        model_convcap = convcap(data.numwords, args.num_layers,
                                is_attention=args.attention)
        model_convcap.cuda()

        print('[DEBUG] Loading checkpoint %s' % modelfn)
        checkpoint = torch.load(modelfn)
        model_convcap.load_state_dict(checkpoint['state_dict'])
        model_imgcnn.load_state_dict(checkpoint['img_state_dict'])
    elif model_convcap is None or model_imgcnn is None:
        raise ValueError('test() needs either a checkpoint file (modelfn) '
                         'or both model_convcap and model_imgcnn')

    model_imgcnn.train(False)
    model_convcap.train(False)

    pred_captions = []
    start_id = data.wordlist.index('<S>')

    # Test epoch; gradients are not needed for evaluation.
    with torch.no_grad():
        for imgs, _, _, _, img_ids in tqdm(data_loader, total=num_batches):
            imgs = imgs.view(batchsize, 3, 224, 224)
            imgsfeats, imgsfc7 = model_imgcnn(imgs.cuda())

            # Greedy decoding: the feed starts with '<S>' and each step
            # writes its best word into the next position.
            wordclass_feed = np.zeros((batchsize, max_tokens), dtype='int64')
            wordclass_feed[:, 0] = start_id
            outcaps = [[] for _ in range(batchsize)]

            for j in range(max_tokens - 1):
                wordclass = torch.from_numpy(wordclass_feed).cuda()

                wordact, _ = model_convcap(imgsfeats, imgsfc7, wordclass)
                wordact = wordact[:, :, :-1]
                wordact_t = wordact.permute(0, 2, 1).contiguous().view(
                    batchsize * (max_tokens - 1), -1)

                # dim is explicit: implicit-dim softmax is deprecated.
                wordprobs = F.softmax(wordact_t, dim=1).cpu().numpy()
                wordids = np.argmax(wordprobs, axis=1)

                for k in range(batchsize):
                    # Row of sample k at decoding step j.
                    word_id = wordids[j + k * (max_tokens - 1)]
                    outcaps[k].append(data.wordlist[word_id])
                    wordclass_feed[k, j + 1] = word_id

            for j in range(batchsize):
                num_words = len(outcaps[j])
                if 'EOS' in outcaps[j]:
                    num_words = outcaps[j].index('EOS')
                outcap = ' '.join(outcaps[j][:num_words])
                pred_captions.append({'image_id': img_ids[j], 'caption': outcap})

    scores = language_eval(pred_captions, args.model_dir, split)

    # Hand the models back in training mode.
    model_imgcnn.train(True)
    model_convcap.train(True)

    return scores
def test_beam(args, split, modelfn=None):
    """Sample generation with beam-search

    Args:
        args: Run configuration; ``coco_root``, ``nthreads``, ``batchsize``,
            ``num_layers``, ``attention``, ``beam_size`` and ``model_dir``
            are read.
        split: Data split passed to ``coco_loader`` and ``language_eval``.
        modelfn: Checkpoint path with ``state_dict`` and ``img_state_dict``
            entries. Required despite the ``None`` default.

    Returns:
        The value returned by ``language_eval`` for the predictions.

    Raises:
        ValueError: If ``modelfn`` is None.
    """
    if modelfn is None:
        raise ValueError('test_beam() requires a checkpoint file (modelfn)')

    t_start = time.time()
    data = coco_loader(args.coco_root, split=split, ncap_per_img=1)
    print('[DEBUG] Loading %s data ... %f secs' % (split, time.time() - t_start))

    data_loader = DataLoader(dataset=data, num_workers=args.nthreads,
                             batch_size=args.batchsize, shuffle=False,
                             drop_last=True)

    batchsize = args.batchsize
    max_tokens = data.max_tokens
    # drop_last=True: only complete batches are produced.
    num_batches = len(data.ids) // batchsize
    print('[DEBUG] Running test (w/ beam search) on %d batches' % num_batches)

    model_imgcnn = Vgg16Feats()
    model_imgcnn.cuda()

    model_convcap = convcap(data.numwords, args.num_layers,
                            is_attention=args.attention)
    model_convcap.cuda()

    print('[DEBUG] Loading checkpoint %s' % modelfn)
    checkpoint = torch.load(modelfn)
    model_convcap.load_state_dict(checkpoint['state_dict'])
    model_imgcnn.load_state_dict(checkpoint['img_state_dict'])

    model_imgcnn.train(False)
    model_convcap.train(False)

    pred_captions = []
    start_id = data.wordlist.index('<S>')

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for imgs, _, _, _, img_ids in tqdm(data_loader, total=num_batches):
            imgs = imgs.view(batchsize, 3, 224, 224)
            imgsfeats, imgsfc7 = model_imgcnn(imgs.cuda())

            # Repeat every image's feature map once per beam.
            b, f_dim, f_h, f_w = imgsfeats.size()
            imgsfeats = imgsfeats.unsqueeze(1).expand(
                b, args.beam_size, f_dim, f_h, f_w)
            imgsfeats = imgsfeats.contiguous().view(
                b * args.beam_size, f_dim, f_h, f_w)

            beam_searcher = beamsearch(args.beam_size, batchsize, max_tokens)

            wordclass_feed = np.zeros((args.beam_size * batchsize, max_tokens),
                                      dtype='int64')
            wordclass_feed[:, 0] = start_id
            imgsfc7 = repeat_img(args, imgsfc7)
            outcaps = [[] for _ in range(batchsize)]

            for j in range(max_tokens - 1):
                wordclass = torch.from_numpy(wordclass_feed).cuda()

                wordact, _ = model_convcap(imgsfeats, imgsfc7, wordclass)
                wordact = wordact[:, :, :-1]
                wordact_j = wordact[..., j]

                beam_indices, wordclass_indices = beam_searcher.expand_beam(
                    wordact_j)

                if len(beam_indices) == 0 or j == (max_tokens - 2):
                    # Beam search is over.
                    generated_captions = beam_searcher.get_results()
                    for k in range(batchsize):
                        g = generated_captions[:, k]
                        outcaps[k] = [data.wordlist[x] for x in g]
                    # Nothing is left to expand, so stop running the model.
                    break

                # Keep only the surviving beams and append their next word.
                wordclass_feed = wordclass_feed[beam_indices]
                keep = torch.cuda.LongTensor(beam_indices)
                imgsfc7 = imgsfc7.index_select(0, keep)
                imgsfeats = imgsfeats.index_select(0, keep)
                for i, wordclass_idx in enumerate(wordclass_indices):
                    wordclass_feed[i, j + 1] = wordclass_idx

            for j in range(batchsize):
                num_words = len(outcaps[j])
                if 'EOS' in outcaps[j]:
                    num_words = outcaps[j].index('EOS')
                outcap = ' '.join(outcaps[j][:num_words])
                pred_captions.append({'image_id': img_ids[j], 'caption': outcap})

    scores = language_eval(pred_captions, args.model_dir, split)

    model_imgcnn.train(True)
    model_convcap.train(True)

    return scores
def test_beam(args, split, modelfn=None):
    """Sample generation with beam-search

    Args:
        args: Run configuration; ``coco_root``, ``nthreads``, ``batchsize``,
            ``num_layers``, ``attention``, ``beam_size`` and ``model_dir``
            are read.
        split: Data split passed to ``coco_loader`` and ``language_eval``.
        modelfn: Checkpoint path with ``state_dict`` and ``img_state_dict``
            entries. Required despite the ``None`` default.

    Returns:
        The value returned by ``language_eval`` for the predictions.

    Raises:
        ValueError: If ``modelfn`` is None.
    """
    if modelfn is None:
        raise ValueError('test_beam() requires a checkpoint file (modelfn)')

    t_start = time.time()
    data = coco_loader(args.coco_root, split=split, ncap_per_img=1)
    print('[DEBUG] Loading %s data ... %f secs' % (split, time.time() - t_start))

    data_loader = DataLoader(dataset=data, num_workers=args.nthreads,
                             batch_size=args.batchsize, shuffle=False,
                             drop_last=True)

    batchsize = args.batchsize
    max_tokens = data.max_tokens
    # drop_last=True: only complete batches are produced.
    num_batches = len(data.ids) // batchsize
    print('[DEBUG] Running test (w/ beam search) on %d batches' % num_batches)

    # The image network built here receives the checkpoint's
    # 'img_state_dict', so it has to match the architecture that was saved.
    model_imgcnn = Resnet152Feats()
    model_imgcnn.cuda()

    model_convcap = convcap(data.numwords, args.num_layers,
                            is_attention=args.attention)
    model_convcap.cuda()

    print('[DEBUG] Loading checkpoint %s' % modelfn)
    checkpoint = torch.load(modelfn)
    model_convcap.load_state_dict(checkpoint['state_dict'])
    model_imgcnn.load_state_dict(checkpoint['img_state_dict'])

    model_imgcnn.train(False)
    model_convcap.train(False)

    pred_captions = []
    start_id = data.wordlist.index('<S>')

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for imgs, _, _, _, img_ids in tqdm(data_loader, total=num_batches):
            imgs = imgs.view(batchsize, 3, 224, 224)
            imgsfeats, imgsfc7 = model_imgcnn(imgs.cuda())

            # Repeat every image's feature map once per beam.
            b, f_dim, f_h, f_w = imgsfeats.size()
            imgsfeats = imgsfeats.unsqueeze(1).expand(
                b, args.beam_size, f_dim, f_h, f_w)
            imgsfeats = imgsfeats.contiguous().view(
                b * args.beam_size, f_dim, f_h, f_w)

            beam_searcher = beamsearch(args.beam_size, batchsize, max_tokens)

            wordclass_feed = np.zeros((args.beam_size * batchsize, max_tokens),
                                      dtype='int64')
            wordclass_feed[:, 0] = start_id
            imgsfc7 = repeat_img(args, imgsfc7)
            outcaps = [[] for _ in range(batchsize)]

            for j in range(max_tokens - 1):
                wordclass = torch.from_numpy(wordclass_feed).cuda()

                wordact, _ = model_convcap(imgsfeats, imgsfc7, wordclass)
                wordact = wordact[:, :, :-1]
                wordact_j = wordact[..., j]

                beam_indices, wordclass_indices = beam_searcher.expand_beam(
                    wordact_j)

                if len(beam_indices) == 0 or j == (max_tokens - 2):
                    # Beam search is over.
                    generated_captions = beam_searcher.get_results()
                    for k in range(batchsize):
                        g = generated_captions[:, k]
                        outcaps[k] = [data.wordlist[x] for x in g]
                    # Nothing is left to expand, so stop running the model.
                    break

                # Keep only the surviving beams and append their next word.
                wordclass_feed = wordclass_feed[beam_indices]
                keep = torch.cuda.LongTensor(beam_indices)
                imgsfc7 = imgsfc7.index_select(0, keep)
                imgsfeats = imgsfeats.index_select(0, keep)
                for i, wordclass_idx in enumerate(wordclass_indices):
                    wordclass_feed[i, j + 1] = wordclass_idx

            for j in range(batchsize):
                num_words = len(outcaps[j])
                if 'EOS' in outcaps[j]:
                    num_words = outcaps[j].index('EOS')
                outcap = ' '.join(outcaps[j][:num_words])
                pred_captions.append({'image_id': img_ids[j], 'caption': outcap})

    scores = language_eval(pred_captions, args.model_dir, split)

    model_imgcnn.train(True)
    model_convcap.train(True)

    return scores
def test(args, split, modelfn=None, model_convcap=None, model_imgcnn=None):
    """Runs test on split=val/test with checkpoint file modelfn or loaded model_*

    Args:
        args: Run configuration; ``coco_root``, ``nthreads``, ``batchsize``,
            ``num_layers``, ``attention`` and ``model_dir`` are read.
        split: Data split passed to ``coco_loader`` and ``language_eval``.
        modelfn: Optional checkpoint path with ``state_dict`` and
            ``img_state_dict`` entries. When given, fresh models are built
            and restored from it.
        model_convcap: Already-loaded caption model, used when ``modelfn``
            is None.
        model_imgcnn: Already-loaded image model, used when ``modelfn`` is
            None.

    Returns:
        The value returned by ``language_eval`` for the predictions.

    Raises:
        ValueError: If neither a checkpoint nor both models are supplied.
    """
    t_start = time.time()
    data = coco_loader(args.coco_root, split=split, ncap_per_img=1)
    print('[DEBUG] Loading %s data ... %f secs' % (split, time.time() - t_start))

    data_loader = DataLoader(dataset=data, num_workers=args.nthreads,
                             batch_size=args.batchsize, shuffle=False,
                             drop_last=True)

    batchsize = args.batchsize
    max_tokens = data.max_tokens
    # drop_last=True: only complete batches are produced.
    num_batches = len(data.ids) // batchsize
    print('[DEBUG] Running inference on %s with %d batches' % (split, num_batches))

    if modelfn is not None:
        model_imgcnn = Vgg16Feats()
        model_imgcnn.cuda()

        model_convcap = convcap(data.numwords, args.num_layers,
                                is_attention=args.attention)
        model_convcap.cuda()

        print('[DEBUG] Loading checkpoint %s' % modelfn)
        checkpoint = torch.load(modelfn)
        model_convcap.load_state_dict(checkpoint['state_dict'])
        model_imgcnn.load_state_dict(checkpoint['img_state_dict'])
    elif model_convcap is None or model_imgcnn is None:
        raise ValueError('test() needs either a checkpoint file (modelfn) '
                         'or both model_convcap and model_imgcnn')

    model_imgcnn.train(False)
    model_convcap.train(False)

    pred_captions = []
    start_id = data.wordlist.index('<S>')

    # Test epoch; gradients are not needed for evaluation.
    with torch.no_grad():
        for imgs, _, _, _, img_ids in tqdm(data_loader, total=num_batches):
            imgs = imgs.view(batchsize, 3, 224, 224)
            imgsfeats, imgsfc7 = model_imgcnn(imgs.cuda())

            # Greedy decoding: the feed starts with '<S>' and each step
            # writes its best word into the next position.
            wordclass_feed = np.zeros((batchsize, max_tokens), dtype='int64')
            wordclass_feed[:, 0] = start_id
            outcaps = [[] for _ in range(batchsize)]

            for j in range(max_tokens - 1):
                wordclass = torch.from_numpy(wordclass_feed).cuda()

                wordact, _ = model_convcap(imgsfeats, imgsfc7, wordclass)
                wordact = wordact[:, :, :-1]
                wordact_t = wordact.permute(0, 2, 1).contiguous().view(
                    batchsize * (max_tokens - 1), -1)

                # dim is explicit: implicit-dim softmax is deprecated.
                wordprobs = F.softmax(wordact_t, dim=1).cpu().numpy()
                wordids = np.argmax(wordprobs, axis=1)

                for k in range(batchsize):
                    # Row of sample k at decoding step j.
                    word_id = wordids[j + k * (max_tokens - 1)]
                    outcaps[k].append(data.wordlist[word_id])
                    wordclass_feed[k, j + 1] = word_id

            for j in range(batchsize):
                num_words = len(outcaps[j])
                if 'EOS' in outcaps[j]:
                    num_words = outcaps[j].index('EOS')
                outcap = ' '.join(outcaps[j][:num_words])
                pred_captions.append({'image_id': img_ids[j], 'caption': outcap})

    scores = language_eval(pred_captions, args.model_dir, split)

    # Hand the models back in training mode.
    model_imgcnn.train(True)
    model_convcap.train(True)

    return scores
def test(opt, EncoderRNN, DecoderCNN, Convcap, itow, wtoi, modelfn=None,
         output_dir='/home/sanjay/Documents/Video_convcap/output'):
    """Caption the test videos with a saved model and score the result.

    For every batch two captions are produced: a greedy decode, which is
    scored, and a teacher-forced decode driven by the ground-truth labels,
    which is only printed for inspection.

    Args:
        opt: Option dict; ``batch_size``, ``max_len``, ``dim_vid``,
            ``dim_hidden``, ``bidirectional`` and ``rnn_type`` are read.
        EncoderRNN: Object exposing the ``EncoderRNN`` class.
        DecoderCNN: Object exposing the ``DecoderCNN`` class.
        Convcap: Object exposing the ``Convcap`` class.
        itow: Mapping from word index to word.
        wtoi: Mapping from word to word index; must contain ``'<sos>'``.
        modelfn: Checkpoint path with a ``state_dict`` entry. Required
            despite the ``None`` default.
        output_dir: Directory handed to ``language_eval``.

    Returns:
        The value returned by ``language_eval`` for the predictions.

    Raises:
        ValueError: If ``modelfn`` is None.
    """
    if modelfn is None:
        raise ValueError('test() requires a checkpoint file (modelfn)')

    t_start = time.time()
    test_data = VideoDataset(opt, 'test')
    test_loader = DataLoader(test_data, batch_size=opt["batch_size"],
                             num_workers=30, shuffle=False)
    print('[DEBUG] Loading test data ... %f secs' % (time.time() - t_start))

    max_tokens = opt['max_len']

    encoder = EncoderRNN.EncoderRNN(opt['dim_vid'], opt['dim_hidden'],
                                    bidirectional=opt['bidirectional'],
                                    rnn_cell=opt['rnn_type']).cuda()
    decoder = DecoderCNN.DecoderCNN(test_data.get_vocab_size()).cuda()
    convcap = Convcap.Convcap(encoder, decoder).cuda()

    print('[DEBUG] Loading checkpoint %s' % modelfn)
    checkpoint = torch.load(modelfn)
    convcap.load_state_dict(checkpoint['state_dict'])
    convcap.train(False)

    pred_captions = []
    sos_id = wtoi['<sos>']

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for itr, data in enumerate(test_loader):
            print("iteration" + str(itr))
            print("\n")
            print("gt\n")
            for i in data['labels'].data[0]:
                print(itow[int(i)])

            vid_feat = data['c3d_feats'].cuda()
            labels = data['labels'].type(torch.LongTensor).cuda()
            word_embed = data['word_embed'].cuda()
            vid_id = data['video_ids']
            # The final batch may hold fewer samples than opt['batch_size'].
            batchsize = vid_feat.size(0)

            # Teacher-forced pass: it depends only on the ground-truth
            # labels, so it is computed once per batch instead of once per
            # decoding step.
            x = convcap(vid_feat, labels, word_embed, 'test')
            x = x[:, :, :-1]
            x_t = x.permute(0, 2, 1).contiguous().view(
                batchsize * (max_tokens - 1), -1)
            x_id = np.argmax(F.softmax(x_t, dim=1).cpu().numpy(), axis=1)
            x_outcaps = [
                [itow[x_id[j + k * (max_tokens - 1)]]
                 for j in range(max_tokens - 1)]
                for k in range(batchsize)
            ]

            # Greedy decoding: the feed starts with '<sos>' and each step
            # writes its best word into the next position.
            wordclass_feed = np.zeros((batchsize, max_tokens), dtype='int64')
            wordclass_feed[:, 0] = sos_id
            outcaps = [[] for _ in range(batchsize)]

            for j in range(max_tokens - 1):
                wordclass = torch.from_numpy(wordclass_feed).cuda()
                wordact = convcap(vid_feat, wordclass, word_embed, 'test')
                wordact = wordact[:, :, :-1]
                wordact_t = wordact.permute(0, 2, 1).contiguous().view(
                    batchsize * (max_tokens - 1), -1)
                wordprobs = F.softmax(wordact_t, dim=1).cpu().numpy()
                wordids = np.argmax(wordprobs, axis=1)

                for k in range(batchsize):
                    # Row of sample k at decoding step j.
                    word_id = wordids[j + k * (max_tokens - 1)]
                    outcaps[k].append(itow[word_id])
                    wordclass_feed[k, j + 1] = word_id

            x_captions = []
            for j in range(batchsize):
                # Both captions are cut at their own first 'eos'.
                x_n_words = len(x_outcaps[j])
                if 'eos' in x_outcaps[j]:
                    x_n_words = x_outcaps[j].index('eos')
                x_captions.append(' '.join(x_outcaps[j][:x_n_words]))

                num_words = len(outcaps[j])
                if 'eos' in outcaps[j]:
                    num_words = outcaps[j].index('eos')
                outcap = ' '.join(outcaps[j][:num_words])

                # Each sample is stored under its own id; the first five
                # characters of the id are stripped.
                pred_captions.append({'vid_id': vid_id[j][5:],
                                      'caption': outcap})

            print("------------------------------------------------------------------------------")
            print("videoID \t" + str(vid_id))
            print("caption \n")
            for x_outcap in x_captions:
                print(x_outcap)
            print("------------------------------------------------------------------------------")

    scores = language_eval(pred_captions, output_dir, 'test')

    return scores
def test(args, split, modelfn=None, model_convcap=None, model_imgcnn=None):
    """Runs test on split=val/test with checkpoint file modelfn or loaded model_*

    Besides the caption scores, the masked cross-entropy of the greedy
    decode against the ground-truth tokens is printed, and in ``test`` mode
    the predictions are also evaluated separately per emotion label.

    Args:
        args: Run configuration; ``data_root``, ``nthreads``, ``batchsize``,
            ``glove``, ``ge``, ``num_layers``, ``attention``, ``mode`` and
            ``model_dir`` are read.
        split: Data split passed to ``coco_loader`` and ``language_eval``.
        modelfn: Optional checkpoint path with ``state_dict`` and
            ``img_state_dict`` entries. When given, fresh models are built
            and restored from it.
        model_convcap: Already-loaded caption model, used when ``modelfn``
            is None.
        model_imgcnn: Already-loaded image model, used when ``modelfn`` is
            None.

    Returns:
        The value returned by ``language_eval`` for all predictions.

    Raises:
        ValueError: If neither a checkpoint nor both models are supplied.
    """
    t_start = time.time()
    data = coco_loader(args.data_root, split=split, ncap_per_img=1)
    print('[DEBUG] Loading %s data ... %f secs' % (split, time.time() - t_start))

    data_loader = DataLoader(dataset=data, num_workers=args.nthreads,
                             batch_size=args.batchsize, shuffle=False,
                             drop_last=False)

    max_tokens = data.max_tokens
    # drop_last=False: a trailing partial batch is also produced, so the
    # batch count is rounded up (and reported as at least one).
    num_batches = max(1, (len(data.ids) + args.batchsize - 1) // args.batchsize)
    print('[DEBUG] Running inference on %s with %d batches' % (split, num_batches))

    if modelfn is not None:
        word_embeddings = None
        if args.glove:
            word_embeddings = get_glove_vectors(args.ge, data.wordlist)

        model_imgcnn = resnet50().cuda()
        model_convcap = convcap(data.numwords, args.num_layers,
                                is_attention=args.attention,
                                embedding_weights=word_embeddings).cuda()

        print('[DEBUG] Loading checkpoint %s' % modelfn)
        # The checkpoint file is read once and reused for every load below.
        checkpoint = torch.load(modelfn)
        # The caption model consumes the untouched 'state_dict' first, in
        # case rename_keys modifies the dict it is given (not visible here).
        model_convcap.load_state_dict(checkpoint['state_dict'])
        # NOTE(review): the renamed 'state_dict' is loaded into the image
        # network and then replaced by 'img_state_dict'; confirm whether the
        # first load is still needed.
        model_imgcnn.load_state_dict(rename_keys(checkpoint['state_dict']))
        model_imgcnn.load_state_dict(checkpoint['img_state_dict'])
    elif model_convcap is None or model_imgcnn is None:
        raise ValueError('test() needs either a checkpoint file (modelfn) '
                         'or both model_convcap and model_imgcnn')

    # Remember the modes so caller-owned models are handed back unchanged.
    previous_modes = [(model_imgcnn, model_imgcnn.training),
                      (model_convcap, model_convcap.training)]
    model_imgcnn.eval()
    model_convcap.eval()

    pred_captions = []
    loss = 0.
    batches_seen = 0
    start_id = data.wordlist.index('<S>')

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for imgs, _, wordclass_t, mask, img_ids in tqdm(data_loader,
                                                        total=num_batches):
            batches_seen += 1
            # The final batch may hold fewer samples than args.batchsize.
            batchsize = len(imgs)
            steps = max_tokens - 1

            imgs = imgs.view(batchsize, 3, 128, 128)
            imgsfeats, imgsfc7 = model_imgcnn(imgs.cuda())

            # Greedy decoding: the feed starts with '<S>' and each step
            # writes its best word into the next position.
            wordclass_feed = np.zeros((batchsize, max_tokens), dtype='int64')
            wordclass_feed[:, 0] = start_id
            outcaps = [[] for _ in range(batchsize)]
            # Row j + k * steps keeps the activations of sample k as they
            # were at decoding step j.
            wordact_t_final = [None] * (batchsize * steps)

            for j in range(steps):
                wordclass = torch.from_numpy(wordclass_feed).cuda()

                wordact, _ = model_convcap(imgsfeats, imgsfc7, wordclass)
                wordact = wordact[:, :, :-1]
                wordact_t = wordact.permute(0, 2, 1).contiguous().view(
                    batchsize * steps, -1)

                # dim is explicit: implicit-dim softmax is deprecated.
                wordprobs = F.softmax(wordact_t, dim=1).cpu().numpy()
                wordids = np.argmax(wordprobs, axis=1)

                for k in range(batchsize):
                    pos = j + k * steps
                    outcaps[k].append(data.wordlist[wordids[pos]])
                    wordclass_feed[k, j + 1] = wordids[pos]
                    wordact_t_final[pos] = wordact_t[pos]

            # Targets and mask skip the first (start) position.
            wordclass_t = wordclass_t.view(batchsize, max_tokens)[:, 1:]
            mask = mask.view(batchsize, max_tokens)[:, 1:].contiguous()

            wordact_t_final = torch.stack(wordact_t_final).cpu()
            wordclass_t = wordclass_t.contiguous().view(batchsize * steps, 1)
            maskids = torch.nonzero(mask.view(-1)).numpy().reshape(-1)

            # Cross-entropy over the unmasked positions only.
            loss += F.cross_entropy(
                wordact_t_final[maskids, ...],
                wordclass_t[maskids, ...].contiguous().view(maskids.shape[0])
            ).item()

            for j in range(batchsize):
                num_words = len(outcaps[j])
                if 'EOS' in outcaps[j]:
                    num_words = outcaps[j].index('EOS')
                outcap = ' '.join(outcaps[j][:num_words])
                pred_captions.append({
                    'image_id': img_ids[j].item(),
                    'caption': outcap
                })

    for model, was_training in previous_modes:
        model.train(was_training)

    # max() guards the average against an empty loader.
    print('{} split testing loss is: {}'.format(
        split, (loss * 1.) / max(batches_seen, 1)))

    scores = language_eval(pred_captions, args.model_dir, split)

    if args.mode == 'test':
        labelnames = ['Happy', 'Surprise', 'Fear', 'Disgust', 'Angry', 'Sad']
        # Predictions grouped by every label that is set for their image.
        label_wise_caps = {
            name: [p for p in pred_captions if data.labels[p['image_id']][i]]
            for i, name in enumerate(labelnames)
        }
        # .items() exists on Python 2 and 3; .iteritems() is Python 2 only.
        for k, preds in label_wise_caps.items():
            print(k)
            language_eval(preds, args.model_dir, split)

    return scores