def language_eval(predictions, cocofmt_file, opt):
    logger.info('>>> Language evaluating ...')
    # write predictions to a uniquely named temp file, score it, then clean up
    tmp_checkpoint_json = opt.model_file + str(uuid.uuid4()) + '.json'
    json.dump(predictions, open(tmp_checkpoint_json, 'w'))
    lang_stats = utils.language_eval(cocofmt_file, tmp_checkpoint_json)
    os.remove(tmp_checkpoint_json)
    return lang_stats
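
# For reference, a minimal usage sketch of language_eval, wrapped in a helper
# so nothing runs at import time. The COCO-format path and the opt fields used
# here are hypothetical placeholders, not values this codebase guarantees.
def _language_eval_usage_sketch():
    from argparse import Namespace
    opt = Namespace(model_file='output/model.pth')  # assumed checkpoint path
    predictions = [{'image_id': 'video0', 'caption': 'a man is playing a guitar'},
                   {'image_id': 'video1', 'caption': 'a dog runs through a field'}]
    # scores the predictions against the ground-truth captions in the JSON,
    # returning a dict of metrics such as Bleu_4 / METEOR / CIDEr
    return language_eval(
        predictions, 'datasets/msvd/metadata/msvd_test_cocofmt.json', opt)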
def validate(model, criterion, loader, opt):
    model.eval()
    loader.reset()

    num_videos = loader.get_num_videos()
    batch_size = loader.get_batch_size()
    num_iters = int(math.ceil(num_videos * 1.0 / batch_size))
    last_batch_size = num_videos % batch_size
    seq_per_img = loader.get_seq_per_img()
    model.set_seq_per_img(seq_per_img)

    loss_sum = 0
    logger.info('#num_iters: %d, batch_size: %d, seq_per_img: %d',
                num_iters, batch_size, seq_per_img)
    predictions = []
    gt_avglogps = []
    test_avglogps = []

    for ii in range(num_iters):
        data = loader.get_batch()
        feats = [Variable(feat, volatile=True) for feat in data['feats']]
        if loader.has_label:
            labels = Variable(data['labels'], volatile=True)
            masks = Variable(data['masks'], volatile=True)

        # the last batch may be smaller than batch_size: trim the padding
        if ii == (num_iters - 1) and last_batch_size > 0:
            feats = [f[:last_batch_size] for f in feats]
            if loader.has_label:
                labels = labels[:last_batch_size * seq_per_img]  # labels shape is DxN
                masks = masks[:last_batch_size * seq_per_img]

        if torch.cuda.is_available():
            feats = [feat.cuda() for feat in feats]
            if loader.has_label:
                labels = labels.cuda()
                masks = masks.cuda()

        if loader.has_label:
            # teacher-forced pass to get the validation loss
            pred, gt_seq, gt_logseq = model(feats, labels)
            if opt.output_logp == 1:
                gt_avglogp = utils.compute_avglogp(gt_seq, gt_logseq.data)
                gt_avglogps.extend(gt_avglogp)
            loss = criterion(pred, labels[:, 1:], masks[:, 1:])
            loss_sum += loss.data[0]

        # free-running sampling for the actual predictions
        seq, logseq = model.sample(feats, {'beam_size': opt.beam_size})
        sents = utils.decode_sequence(opt.vocab, seq)
        if opt.output_logp == 1:
            test_avglogp = utils.compute_avglogp(seq, logseq)
            test_avglogps.extend(test_avglogp)

        for jj, sent in enumerate(sents):
            if opt.output_logp == 1:
                entry = {'image_id': data['ids'][jj],
                         'caption': sent,
                         'avglogp': test_avglogp[jj]}
            else:
                entry = {'image_id': data['ids'][jj], 'caption': sent}
            predictions.append(entry)
            logger.debug('[%d] video %s: %s' %
                         (jj, entry['image_id'], entry['caption']))

    loss = round(loss_sum / num_iters, 3)
    results = {}
    lang_stats = {}
    if opt.language_eval == 1 and loader.has_label:
        lang_stats = language_eval(predictions, loader.cocofmt_file, opt)

    results['predictions'] = predictions
    results['scores'] = {'Loss': -loss}
    results['scores'].update(lang_stats)

    if opt.output_logp == 1:
        avglogp = sum(test_avglogps) / float(len(test_avglogps))
        results['scores'].update({'avglogp': avglogp})
        gt_avglogps = np.array(gt_avglogps).reshape(-1, seq_per_img)
        assert num_videos == gt_avglogps.shape[0]
        gt_avglogps_file = opt.model_file.replace('.pth', '_gt_avglogps.pkl', 1)
        # binary mode for pickle output
        cPickle.dump(gt_avglogps, open(gt_avglogps_file, 'wb'),
                     protocol=cPickle.HIGHEST_PROTOCOL)
        logger.info('Wrote GT logp to: %s', gt_avglogps_file)

    return results
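
# validate leans on utils.compute_avglogp; the real implementation lives in
# utils, but a minimal sketch of the assumed behaviour is given here under a
# different name: average the per-token log-probabilities of each sampled
# sequence over its length up to (and including) the first EOS/padding token.
def _compute_avglogp_sketch(seq, logseq, eos_token=0):
    avglogps = []
    for i in range(seq.size(0)):
        total, n = 0.0, 0
        for t in range(seq.size(1)):
            total += float(logseq[i, t])
            n += 1
            if int(seq[i, t]) == eos_token:  # stop at EOS/padding
                break
        avglogps.append(total / max(n, 1))
    return avglogps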
def validate(model, criterion, loader, opt, max_iters=None, type='val'):
    model.eval()
    loader.reset()

    num_videos = loader.get_num_videos()
    batch_size = loader.get_batch_size()
    if max_iters is None:
        num_iters = int(math.ceil(num_videos * 1.0 / batch_size))
    else:
        num_iters = max_iters
    last_batch_size = num_videos % batch_size
    seq_per_img = loader.get_seq_per_img()
    model.set_seq_per_img(seq_per_img)

    loss_sum = 0
    logger.info('#num_iters: %d, batch_size: %d, seq_per_img: %d',
                num_iters, batch_size, seq_per_img)
    predictions = []
    gt_avglogps = []
    test_avglogps = []
    prec_recs = dict()

    for ii in range(num_iters):
        data = loader.get_batch()
        feats = data['feats']
        bfeats = data['bfeats']
        if loader.has_label:
            labels = data['labels']
            masks = data['masks']
            labels_svo = data['labels_svo']

        # the last batch may be smaller than batch_size: trim the padding
        if ii == (num_iters - 1) and last_batch_size > 0:
            feats = [f[:last_batch_size] for f in feats]
            bfeats = [f[:last_batch_size] for f in bfeats]
            if loader.has_label:
                labels = labels[:last_batch_size * seq_per_img]  # labels shape is DxN
                masks = masks[:last_batch_size * seq_per_img]
                labels_svo = labels_svo[:last_batch_size * seq_per_img]

        if torch.cuda.is_available():
            feats = [feat.cuda() for feat in feats]
            bfeats = [bfeat.cuda() for bfeat in bfeats]
            if loader.has_label:
                labels = labels.cuda()
                masks = masks.cuda()
                labels_svo = labels_svo.cuda()

        if loader.has_label and model.gt_concepts_while_testing == 0:
            # teacher-forced pass to get the validation loss
            pred, gt_seq, gt_logseq, _, _, _ = model(feats, bfeats, labels, labels_svo)
            if opt.output_logp == 1:
                gt_avglogp = utils.compute_avglogp(gt_seq, gt_logseq.data)
                gt_avglogps.extend(gt_avglogp)
            loss = criterion(pred, labels[:, 1:], masks[:, 1:])
            loss_sum += loss.item()
            del pred, gt_seq, gt_logseq
            torch.cuda.empty_cache()

        # free-running sampling for the actual predictions and concepts
        seq, logseq, _, concept_seq = model.sample(
            feats, bfeats, labels_svo, {'beam_size': opt.beam_size})
        sents = utils.decode_sequence(opt.vocab, seq)
        if opt.output_logp == 1:
            test_avglogp = utils.compute_avglogp(seq, logseq)
            test_avglogps.extend(test_avglogp)

        if concept_seq is not None:
            # keep one set of ground-truth concepts per video
            labels_svo = torch.reshape(
                labels_svo, (-1, opt.test_seq_per_img, opt.num_concepts))[:, 0]
            concept_seq_words = utils.decode_sequence(opt.vocab, concept_seq)

            # accumulate per-word [TP, FP, FN] counts for concept precision/recall
            if opt.grounder_type in ['niuc', 'nioc', 'iuc', 'ioc']:
                gt_concept_seq_words = utils.decode_sequence(opt.vocab, labels_svo)
                gt_concept_seq_words = [g.split(' ') for g in gt_concept_seq_words]
                for bi in range(len(gt_concept_seq_words)):
                    pr_words = list()
                    repeat = int(len(gt_concept_seq_words) / len(concept_seq_words))
                    for pr_word in concept_seq_words[
                            int(math.floor(float(bi) / repeat))].split(' '):
                        pr_words.append(pr_word)
                        if pr_word not in prec_recs:
                            prec_recs[pr_word] = [0, 0, 0]
                        if pr_word in gt_concept_seq_words[bi]:
                            prec_recs[pr_word][0] += 1  # TP
                        else:
                            prec_recs[pr_word][1] += 1  # FP
                    for gt in gt_concept_seq_words[bi]:
                        if gt not in prec_recs:
                            prec_recs[gt] = [0, 0, 0]
                        if gt not in pr_words:
                            prec_recs[gt][2] += 1  # FN

            try:
                for jj, (sent, sent_svo) in enumerate(zip(sents, concept_seq_words)):
                    if opt.output_logp == 1:
                        entry = {'image_id': data['ids'][jj],
                                 'caption': sent,
                                 'svo': sent_svo,
                                 'avglogp': test_avglogp[jj],
                                 'box_att': model.attention_record[jj].tolist()}
                    else:
                        # 'box_att': model.attention_record[jj].tolist()
                        # (todo: removed for the transformer model)
                        entry = {'image_id': data['ids'][jj],
                                 'caption': sent,
                                 'svo': sent_svo}
                    predictions.append(entry)
                    logger.debug('[%d] video %s: %s pr(%s) gt(%s)' %
                                 (jj, entry['image_id'], entry['caption'],
                                  entry['svo'], gt_concept_seq_words[jj]))
            except IndexError:
                logger.warning('caption/concept count mismatch in batch %d', ii)
        else:
            for jj, sent in enumerate(sents):
                if opt.output_logp == 1:
                    entry = {'image_id': data['ids'][jj],
                             'caption': sent,
                             'avglogp': test_avglogp[jj],
                             'box_att': model.attention_record[jj].tolist()}
                else:
                    entry = {'image_id': data['ids'][jj], 'caption': sent}
                predictions.append(entry)
                logger.debug('[%d] video %s: %s' %
                             (jj, entry['image_id'], entry['caption']))

        del feats, labels, masks, labels_svo, seq, logseq
        torch.cuda.empty_cache()

    loss = round(loss_sum / num_iters, 3)
    results = {}
    lang_stats = {}

    if opt.language_eval == 1 and loader.has_label:
        logger.info('>>> Language evaluating ...')
        tmp_checkpoint_json = opt.model_file.split('.')[0] + '_' + type + '.json'
        json.dump(predictions, open(tmp_checkpoint_json, 'w'))
        lang_stats = utils.language_eval(loader.cocofmt_file, tmp_checkpoint_json)
        # os.remove(tmp_checkpoint_json)  # keep the per-split predictions on disk

    results['predictions'] = predictions
    results['scores'] = {'Loss': -loss}
    results['scores'].update(lang_stats)

    if opt.output_logp == 1:
        avglogp = sum(test_avglogps) / float(len(test_avglogps))
        results['scores'].update({'avglogp': avglogp})
        gt_avglogps = np.array(gt_avglogps).reshape(-1, seq_per_img)
        assert num_videos == gt_avglogps.shape[0]
        gt_avglogps_file = opt.model_file.replace('.pth', '_gt_avglogps.pkl', 1)
        cPickle.dump(gt_avglogps, open(gt_avglogps_file, 'wb'),
                     protocol=cPickle.HIGHEST_PROTOCOL)
        logger.info('Wrote GT logp to: %s', gt_avglogps_file)

    if len(prec_recs.keys()) > 0:
        # macro-average precision/recall over all concept words seen
        prec = dict()
        rec = dict()
        for k, v in prec_recs.items():
            if v[0] + v[1] > 0:
                prec[k] = v[0] / float(v[0] + v[1])
            else:
                prec[k] = 0
            if v[0] + v[2] > 0:
                rec[k] = v[0] / float(v[0] + v[2])
            else:
                rec[k] = 0
        precv = sum(prec.values()) / len(prec_recs)
        recv = sum(rec.values()) / len(prec_recs)
        results['scores'].update({'prec': precv, 'rec': recv})
        print('prec: ', precv, ' .. rec: ', recv)
        logger.debug('prec: ' + str(prec))
        logger.debug('rec: ' + str(rec))

    return results
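
# To make the prec_recs bookkeeping in validate concrete: each concept word
# maps to [TP, FP, FN] counts, and the reported 'prec'/'rec' scores are
# macro-averages over every word seen. A small self-contained check with
# made-up counts (hypothetical, for illustration only):
def _macro_prec_rec_sketch():
    prec_recs = {'man': [3, 1, 0], 'guitar': [1, 0, 2], 'dog': [0, 2, 1]}
    prec = {k: v[0] / float(v[0] + v[1]) if v[0] + v[1] > 0 else 0.0
            for k, v in prec_recs.items()}
    rec = {k: v[0] / float(v[0] + v[2]) if v[0] + v[2] > 0 else 0.0
           for k, v in prec_recs.items()}
    # man: prec 0.75, rec 1.0; guitar: prec 1.0, rec 1/3; dog: prec 0.0, rec 0.0
    # => macro prec ~0.583, macro rec ~0.444
    return sum(prec.values()) / len(prec_recs), sum(rec.values()) / len(prec_recs)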
        # append the caption to either the predictions or the fake groundtruth
        cap_id = 0
        for index, cap in enumerate(caps[id]):
            if index == sample_index:
                # this is the 'predicted' caption
                predictions.append({'image_id': id, 'caption': cap})
            else:
                # this remains a groundtruth caption
                gte['annotations'].append(
                    {'caption': cap, 'image_id': id, 'id': cap_id})
                cap_id += 1

        # dump out the new groundtruth and prediction json files
        json.dump(gte, open(tmp_file_gt, 'w'))
        json.dump(predictions, open(tmp_file_pr, 'w'))

        # calculate the language stats
        lang_stats = utils.language_eval(tmp_file_gt, tmp_file_pr)
        for k, v in lang_stats.items():
            if k not in scores:
                scores[k] = list()
            scores[k].append(v)

        print('------------ scores after %d runs ------------' % runs)
        print(scores)
        for k, v in scores.items():
            print(k, statistics.mean(v), statistics.stdev(v))

    if 0:  ######################## Compare training scores with overfitting
        # setup paths
        cocofmt_file = os.path.join('datasets', 'msvd', 'metadata',
                                    'msvd_train_cocofmt.json')
        # cocofmt_file = os.path.join('datasets', 'msrvtt', 'metadata',
        #                             'msrvtt_train_cocofmt.json')
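
    # For context, the held-out-caption loop above assumes (not shown in this
    # fragment) that caps maps each video id to its ground-truth caption list
    # and gte is a COCO-format dict being rebuilt with one caption per video
    # treated as the "prediction", e.g. (hypothetical shapes):
    #   caps = {'video0': ['a man sings', 'a person is singing', ...], ...}
    #   gte = {'annotations': [], 'images': [{'id': 'video0'}, ...],
    #          'type': 'captions', 'info': {}, 'licenses': []}
    # Repeating this over several runs yields a mean/stdev human-caption
    # baseline for each language metric.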