def validate(opt, tb_logger, vid_data_loader, text_data_loader, model, measure='cosine'): # compute the encoding for all the validation video and captions video_embs, video_ids = evaluation.encode_text_or_vid(model.embed_vis, vid_data_loader) cap_embs, caption_ids = evaluation.encode_text_or_vid(model.embed_txt, text_data_loader) t2v_all_errors = evaluation.cal_error(video_embs, cap_embs, measure) v2t_gt, t2v_gt = metrics.get_gt(video_ids, caption_ids) (v2t_r1, v2t_r5, v2t_r10, v2t_medr, v2t_meanr, v2t_map_score), (t2v_r1, t2v_r5, t2v_r10, t2v_medr, t2v_meanr, t2v_map_score) = cal_perf(t2v_all_errors, v2t_gt, t2v_gt, tb_logger=tb_logger, model=model) currscore = 0 if opt.val_metric == "recall": if opt.direction == 'i2t' or opt.direction == 'all': currscore += (v2t_r1 + v2t_r5 + v2t_r10) if opt.direction == 't2i' or opt.direction == 'all': currscore += (t2v_r1 + t2v_r5 + t2v_r10) elif opt.val_metric == "map": if opt.direction == 'i2t' or opt.direction == 'all': currscore += v2t_map_score if opt.direction == 't2i' or opt.direction == 'all': currscore += t2v_map_score tb_logger.log_value('rsum', currscore, step=model.Eiters) return currscore
def evaluate(img_ids, img_embs, t_embs, measure='cosine', n_caption=2, val_metric='map', direction='t2i'): count = {} for iid in img_ids: if int(iid) not in count: count[int(iid)] = (1, 0) else: count[int(iid)] = (count[int(iid)][0] + 1, 0) img_mask, text_mask = [False for _ in img_ids], [True for _ in img_ids] for idx, iid in enumerate(img_ids): c, u = count[int(iid)] if c >= n_caption and u == 0: img_mask[idx] = True count[int(iid)] = (c, 1) elif c >= n_caption and u == 1: count[int(iid)] = (c, 2) else: text_mask[idx] = False img_ids = [x for idx, x in enumerate(img_ids) if img_mask[idx]] img_embs = img_embs[img_mask] t_embs = t_embs[text_mask] c2i_all_errors = evaluation.cal_error(img_embs, t_embs, measure) if val_metric == "recall": # meme retrieval (r1i, r5i, r10i, medri, meanri) = evaluation.t2i(c2i_all_errors, n_caption=n_caption) # caption retrieval (r1, r5, r10, medr, meanr) = evaluation.i2t(c2i_all_errors, n_caption=n_caption) elif val_metric == "map": # meme retrieval t2i_map_score = evaluation.t2i_map(c2i_all_errors, n_caption=n_caption) # caption retrieval i2t_map_score = evaluation.i2t_map(c2i_all_errors, n_caption=n_caption) currscore = 0 if val_metric == "recall": if direction == 'i2t' or direction == 'all': rsum = r1 + r5 + r10 currscore += rsum if direction == 't2i' or direction == 'all': rsumi = r1i + r5i + r10i currscore += rsumi elif val_metric == "map": if direction == 'i2t' or direction == 'all': currscore += i2t_map_score if direction == 't2i' or direction == 'all': currscore += t2i_map_score return currscore
def validate_hybrid(opt, tb_logger, vid_data_loader, text_data_loader, model, measure='cosine', measure_2='cosine'): # compute the encoding for all the validation video and captions model.val_start() video_embs, video_tag_probs, video_ids = evaluation.encode_text_or_vid_tag_hist_prob(model.embed_vis, vid_data_loader) cap_embs, cap_tag_probs, caption_ids = evaluation.encode_text_or_vid_tag_hist_prob(model.embed_txt, text_data_loader) v2t_gt, t2v_gt = metrics.get_gt(video_ids, caption_ids) if opt.space in ['latent', 'hybrid']: t2v_all_errors_1 = evaluation.cal_error(video_embs, cap_embs, measure) # cal_perf(t2v_all_errors_1, v2t_gt, t2v_gt, tb_logger=tb_logger, model=model) if opt.space in ['concept', 'hybrid']: t2v_all_errors_2 = evaluation.cal_error(video_tag_probs, cap_tag_probs, measure_2) t2v_all_errors_2 = t2v_all_errors_2.numpy() # cal_perf(t2v_all_errors_2, v2t_gt, t2v_gt, tb_logger=tb_logger, model=model) if opt.space in ['hybrid']: t2v_all_errors_1 = norm_score(t2v_all_errors_1) t2v_all_errors_2 = norm_score(t2v_all_errors_2) t2v_all_errors = 0.6 * t2v_all_errors_1 + 0.4 * t2v_all_errors_2 (v2t_r1, v2t_r5, v2t_r10, v2t_medr, v2t_meanr, v2t_map_score), (t2v_r1, t2v_r5, t2v_r10, t2v_medr, t2v_meanr, t2v_map_score) = cal_perf(t2v_all_errors, v2t_gt, t2v_gt, tb_logger=tb_logger, model=model) currscore = 0 if opt.val_metric == "recall": if opt.direction == 'i2t' or opt.direction == 'all': currscore += (v2t_r1 + v2t_r5 + v2t_r10) if opt.direction == 't2i' or opt.direction == 'all': currscore += (t2v_r1 + t2v_r5 + t2v_r10) elif opt.val_metric == "map": if opt.direction == 'i2t' or opt.direction == 'all': currscore += v2t_map_score if opt.direction == 't2i' or opt.direction == 'all': currscore += t2v_map_score tb_logger.log_value('rsum', currscore, step=model.Eiters) return currscore
def validate(opt, val_loader, model, measure='cosine'): # compute the encoding for all the validation video and captions video_embs, cap_embs, video_ids, caption_ids = evaluation.encode_data( model, val_loader, opt.log_step, logging.info) # we load data as video-sentence pairs # but we only need to forward each video once for evaluation # so we get the video set and mask out same videos with feature_mask feature_mask = [] evaluate_videos = set() for video_id in video_ids: feature_mask.append(video_id not in evaluate_videos) evaluate_videos.add(video_id) video_embs = video_embs[feature_mask] video_ids = [ x for idx, x in enumerate(video_ids) if feature_mask[idx] is True ] c2i_all_errors = evaluation.cal_error(video_embs, cap_embs, measure) if opt.val_metric == "recall": # video retrieval (r1i, r5i, r10i, medri, meanri) = evaluation.t2i(c2i_all_errors, n_caption=opt.n_caption) print(" * Text to video:") print(" * r_1_5_10: {}".format( [round(r1i, 3), round(r5i, 3), round(r10i, 3)])) print(" * medr, meanr: {}".format([round(medri, 3), round(meanri, 3)])) print(" * " + '-' * 10) # caption retrieval (r1, r5, r10, medr, meanr) = evaluation.i2t(c2i_all_errors, n_caption=opt.n_caption) print(" * Video to text:") print(" * r_1_5_10: {}".format( [round(r1, 3), round(r5, 3), round(r10, 3)])) print(" * medr, meanr: {}".format([round(medr, 3), round(meanr, 3)])) print(" * " + '-' * 10) # record metrics in tensorboard tb_logger.log_value('r1', r1, step=model.Eiters) tb_logger.log_value('r5', r5, step=model.Eiters) tb_logger.log_value('r10', r10, step=model.Eiters) tb_logger.log_value('medr', medr, step=model.Eiters) tb_logger.log_value('meanr', meanr, step=model.Eiters) tb_logger.log_value('r1i', r1i, step=model.Eiters) tb_logger.log_value('r5i', r5i, step=model.Eiters) tb_logger.log_value('r10i', r10i, step=model.Eiters) tb_logger.log_value('medri', medri, step=model.Eiters) tb_logger.log_value('meanri', meanri, step=model.Eiters) elif opt.val_metric == "map": i2t_map_score = evaluation.i2t_map(c2i_all_errors, n_caption=opt.n_caption) t2i_map_score = evaluation.t2i_map(c2i_all_errors, n_caption=opt.n_caption) tb_logger.log_value('i2t_map', i2t_map_score, step=model.Eiters) tb_logger.log_value('t2i_map', t2i_map_score, step=model.Eiters) print('i2t_map', i2t_map_score) print('t2i_map', t2i_map_score) currscore = 0 if opt.val_metric == "recall": if opt.direction == 'i2t' or opt.direction == 'all': currscore += (r1 + r5 + r10) if opt.direction == 't2i' or opt.direction == 'all': currscore += (r1i + r5i + r10i) elif opt.val_metric == "map": if opt.direction == 'i2t' or opt.direction == 'all': currscore += i2t_map_score if opt.direction == 't2i' or opt.direction == 'all': currscore += t2i_map_score tb_logger.log_value('rsum', currscore, step=model.Eiters) return currscore
def validate_split(opt, vid_data_loader, text_data_loader, model, measure='cosine'): # compute the encoding for all the validation video and captions model.val_start() video_embs, video_ids = evaluation.encode_text_or_vid( model.embed_vis, vid_data_loader) cap_embs, caption_ids = evaluation.encode_text_or_vid( model.embed_txt, text_data_loader) c2i_all_errors = evaluation.cal_error(video_embs, cap_embs, measure) if opt.val_metric == "recall": # video retrieval if opt.testCollection.startswith('msvd'): (r1i, r5i, r10i, medri, meanri, t2i_map_score) = evaluation.t2i_varied(c2i_all_errors, caption_ids, video_ids) else: (r1i, r5i, r10i, medri, meanri) = evaluation.t2i(c2i_all_errors, n_caption=opt.n_caption) print(" * Text to video:") print(" * r_1_5_10: {}".format( [round(r1i, 3), round(r5i, 3), round(r10i, 3)])) print(" * medr, meanr: {}".format([round(medri, 3), round(meanri, 3)])) print(" * " + '-' * 10) # caption retrieval if opt.testCollection.startswith('msvd'): (r1, r5, r10, medr, meanr, i2t_map_score) = evaluation.i2t_varied(c2i_all_errors, caption_ids, video_ids) else: (r1, r5, r10, medr, meanr) = evaluation.i2t(c2i_all_errors, n_caption=opt.n_caption) print(" * Video to text:") print(" * r_1_5_10: {}".format( [round(r1, 3), round(r5, 3), round(r10, 3)])) print(" * medr, meanr: {}".format([round(medr, 3), round(meanr, 3)])) print(" * " + '-' * 10) # record metrics in tensorboard tb_logger.log_value('r1', r1, step=model.Eiters) tb_logger.log_value('r5', r5, step=model.Eiters) tb_logger.log_value('r10', r10, step=model.Eiters) tb_logger.log_value('medr', medr, step=model.Eiters) tb_logger.log_value('meanr', meanr, step=model.Eiters) tb_logger.log_value('r1i', r1i, step=model.Eiters) tb_logger.log_value('r5i', r5i, step=model.Eiters) tb_logger.log_value('r10i', r10i, step=model.Eiters) tb_logger.log_value('medri', medri, step=model.Eiters) tb_logger.log_value('meanri', meanri, step=model.Eiters) elif opt.val_metric == "map": i2t_map_score = evaluation.i2t_map(c2i_all_errors, n_caption=opt.n_caption) t2i_map_score = evaluation.t2i_map(c2i_all_errors, n_caption=opt.n_caption) tb_logger.log_value('i2t_map', i2t_map_score, step=model.Eiters) tb_logger.log_value('t2i_map', t2i_map_score, step=model.Eiters) print('i2t_map', i2t_map_score) print('t2i_map', t2i_map_score) currscore = 0 if opt.val_metric == "recall": if opt.direction == 'i2t' or opt.direction == 'all': currscore += (r1 + r5 + r10) if opt.direction == 't2i' or opt.direction == 'all': currscore += (r1i + r5i + r10i) elif opt.val_metric == "map": if opt.direction == 'i2t' or opt.direction == 'all': currscore += i2t_map_score if opt.direction == 't2i' or opt.direction == 'all': currscore += t2i_map_score tb_logger.log_value('rsum', currscore, step=model.Eiters) return currscore
def main(): opt = parse_args() print(json.dumps(vars(opt), indent=2)) rootpath = opt.rootpath testCollection = opt.testCollection n_caption = opt.n_caption resume = os.path.join(opt.logger_name, opt.checkpoint_name) if not os.path.exists(resume): logging.info(resume + ' not exists.') sys.exit(0) checkpoint = torch.load(resume) start_epoch = checkpoint['epoch'] best_rsum = checkpoint['best_rsum'] print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format( resume, start_epoch, best_rsum)) options = checkpoint['opt'] if not hasattr(options, 'concate'): setattr(options, "concate", "full") trainCollection = options.trainCollection output_dir = resume.replace(trainCollection, testCollection) output_dir = output_dir.replace('/%s/' % options.cv_name, '/results/%s/' % trainCollection) result_pred_sents = os.path.join(output_dir, 'id.sent.score.txt') pred_error_matrix_file = os.path.join(output_dir, 'pred_errors_matrix.pth.tar') if checkToSkip(pred_error_matrix_file, opt.overwrite): sys.exit(0) makedirsforfile(pred_error_matrix_file) # data loader prepare caption_files = { 'test': os.path.join(rootpath, testCollection, 'TextData', '%s.caption.txt' % testCollection) } img_feat_path = os.path.join(rootpath, testCollection, 'FeatureData', options.visual_feature) visual_feats = {'test': BigFile(img_feat_path)} assert options.visual_feat_dim == visual_feats['test'].ndims video2frames = { 'test': read_dict( os.path.join(rootpath, testCollection, 'FeatureData', options.visual_feature, 'video2frames.txt')) } # set bow vocabulary and encoding bow_vocab_file = os.path.join(rootpath, options.trainCollection, 'TextData', 'vocabulary', 'bow', options.vocab + '.pkl') bow_vocab = pickle.load(open(bow_vocab_file, 'rb')) bow2vec = get_text_encoder('bow')(bow_vocab) options.bow_vocab_size = len(bow_vocab) # set rnn vocabulary rnn_vocab_file = os.path.join(rootpath, options.trainCollection, 'TextData', 'vocabulary', 'rnn', options.vocab + '.pkl') rnn_vocab = pickle.load(open(rnn_vocab_file, 'rb')) options.vocab_size = len(rnn_vocab) # Construct the model model = get_model(options.model)(options) model.load_state_dict(checkpoint['model']) model.Eiters = checkpoint['Eiters'] model.val_start() if testCollection.startswith( 'msvd'): # or testCollection.startswith('msrvtt'): # set data loader video_ids_list = data.read_video_ids(caption_files['test']) vid_data_loader = data.get_vis_data_loader(visual_feats['test'], opt.batch_size, opt.workers, video2frames['test'], video_ids=video_ids_list) text_data_loader = data.get_txt_data_loader(caption_files['test'], rnn_vocab, bow2vec, opt.batch_size, opt.workers) # mapping video_embs, video_ids = evaluation.encode_text_or_vid( model.embed_vis, vid_data_loader) cap_embs, caption_ids = evaluation.encode_text_or_vid( model.embed_txt, text_data_loader) else: # set data loader data_loader = data.get_test_data_loaders(caption_files, visual_feats, rnn_vocab, bow2vec, opt.batch_size, opt.workers, opt.n_caption, video2frames=video2frames) # mapping video_embs, cap_embs, video_ids, caption_ids = evaluation.encode_data( model, data_loader['test'], opt.log_step, logging.info) # remove duplicate videos idx = range(0, video_embs.shape[0], n_caption) video_embs = video_embs[idx, :] video_ids = video_ids[::opt.n_caption] c2i_all_errors = evaluation.cal_error(video_embs, cap_embs, options.measure) torch.save( { 'errors': c2i_all_errors, 'videos': video_ids, 'captions': caption_ids }, pred_error_matrix_file) print("write into: %s" % pred_error_matrix_file) if testCollection.startswith( 'msvd'): # or testCollection.startswith('msrvtt'): # caption retrieval (r1, r5, r10, medr, meanr, i2t_map_score) = evaluation.i2t_varied(c2i_all_errors, caption_ids, video_ids) # video retrieval (r1i, r5i, r10i, medri, meanri, t2i_map_score) = evaluation.t2i_varied(c2i_all_errors, caption_ids, video_ids) else: # caption retrieval (r1i, r5i, r10i, medri, meanri) = evaluation.t2i(c2i_all_errors, n_caption=n_caption) t2i_map_score = evaluation.t2i_map(c2i_all_errors, n_caption=n_caption) # video retrieval (r1, r5, r10, medr, meanr) = evaluation.i2t(c2i_all_errors, n_caption=n_caption) i2t_map_score = evaluation.i2t_map(c2i_all_errors, n_caption=n_caption) print(" * Text to Video:") print(" * r_1_5_10, medr, meanr: {}".format([ round(r1i, 1), round(r5i, 1), round(r10i, 1), round(medri, 1), round(meanri, 1) ])) print(" * recall sum: {}".format(round(r1i + r5i + r10i, 1))) print(" * mAP: {}".format(round(t2i_map_score, 3))) print(" * " + '-' * 10) # caption retrieval print(" * Video to text:") print(" * r_1_5_10, medr, meanr: {}".format([ round(r1, 1), round(r5, 1), round(r10, 1), round(medr, 1), round(meanr, 1) ])) print(" * recall sum: {}".format(round(r1 + r5 + r10, 1))) print(" * mAP: {}".format(round(i2t_map_score, 3))) print(" * " + '-' * 10)
def main(): opt = parse_args() print(json.dumps(vars(opt), indent=2)) rootpath = opt.rootpath collectionStrt = opt.collectionStrt resume = os.path.join(opt.logger_name, opt.checkpoint_name) if not os.path.exists(resume): logging.info(resume + ' not exists.') sys.exit(0) checkpoint = torch.load(resume) start_epoch = checkpoint['epoch'] best_rsum = checkpoint['best_rsum'] print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format( resume, start_epoch, best_rsum)) options = checkpoint['opt'] # collection setting testCollection = opt.testCollection collections_pathname = options.collections_pathname collections_pathname['test'] = testCollection trainCollection = options.trainCollection output_dir = resume.replace(trainCollection, testCollection) if 'checkpoints' in output_dir: output_dir = output_dir.replace('/checkpoints/', '/results/') else: output_dir = output_dir.replace( '/%s/' % options.cv_name, '/results/%s/%s/' % (options.cv_name, trainCollection)) result_pred_sents = os.path.join(output_dir, 'id.sent.score.txt') pred_error_matrix_file = os.path.join(output_dir, 'pred_errors_matrix.pth.tar') if checkToSkip(pred_error_matrix_file, opt.overwrite): sys.exit(0) makedirsforfile(pred_error_matrix_file) log_config(output_dir) logging.info(json.dumps(vars(opt), indent=2)) # data loader prepare test_cap = os.path.join(rootpath, collections_pathname['test'], 'TextData', '%s.caption.txt' % testCollection) if collectionStrt == 'single': test_cap = os.path.join( rootpath, collections_pathname['test'], 'TextData', '%s%s.caption.txt' % (testCollection, opt.split)) elif collectionStrt == 'multiple': test_cap = os.path.join(rootpath, collections_pathname['test'], 'TextData', '%s.caption.txt' % testCollection) else: raise NotImplementedError('collection structure %s not implemented' % collectionStrt) caption_files = {'test': test_cap} img_feat_path = os.path.join(rootpath, collections_pathname['test'], 'FeatureData', options.visual_feature) visual_feats = {'test': BigFile(img_feat_path)} assert options.visual_feat_dim == visual_feats['test'].ndims video2frames = { 'test': read_dict( os.path.join(rootpath, collections_pathname['test'], 'FeatureData', options.visual_feature, 'video2frames.txt')) } # set bow vocabulary and encoding bow_vocab_file = os.path.join(rootpath, collections_pathname['train'], 'TextData', 'vocabulary', 'bow', options.vocab + '.pkl') bow_vocab = pickle.load(open(bow_vocab_file, 'rb')) bow2vec = get_text_encoder('bow')(bow_vocab) options.bow_vocab_size = len(bow_vocab) # set rnn vocabulary rnn_vocab_file = os.path.join(rootpath, collections_pathname['train'], 'TextData', 'vocabulary', 'rnn', options.vocab + '.pkl') rnn_vocab = pickle.load(open(rnn_vocab_file, 'rb')) options.vocab_size = len(rnn_vocab) # Construct the model model = get_model(options.model)(options) model.load_state_dict(checkpoint['model']) model.Eiters = checkpoint['Eiters'] model.val_start() # set data loader video_ids_list = data.read_video_ids(caption_files['test']) vid_data_loader = data.get_vis_data_loader(visual_feats['test'], opt.batch_size, opt.workers, video2frames['test'], video_ids=video_ids_list) text_data_loader = data.get_txt_data_loader(caption_files['test'], rnn_vocab, bow2vec, opt.batch_size, opt.workers) # mapping if options.space == 'hybrid': video_embs, video_tag_probs, video_ids = evaluation.encode_text_or_vid_tag_hist_prob( model.embed_vis, vid_data_loader) cap_embs, cap_tag_probs, caption_ids = evaluation.encode_text_or_vid_tag_hist_prob( model.embed_txt, text_data_loader) else: video_embs, video_ids = evaluation.encode_text_or_vid( model.embed_vis, vid_data_loader) cap_embs, caption_ids = evaluation.encode_text_or_vid( model.embed_txt, text_data_loader) v2t_gt, t2v_gt = metrics.get_gt(video_ids, caption_ids) logging.info("write into: %s" % output_dir) if options.space != 'latent': tag_vocab_path = os.path.join( rootpath, collections_pathname['train'], 'TextData', 'tags', 'video_label_th_1', 'tag_vocab_%d.json' % options.tag_vocab_size) evaluation.pred_tag(video_tag_probs, video_ids, tag_vocab_path, os.path.join(output_dir, 'video')) evaluation.pred_tag(cap_tag_probs, caption_ids, tag_vocab_path, os.path.join(output_dir, 'text')) if options.space in ['latent', 'hybrid']: # logging.info("=======Latent Space=======") t2v_all_errors_1 = evaluation.cal_error(video_embs, cap_embs, options.measure) if options.space in ['concept', 'hybrid']: # logging.info("=======Concept Space=======") t2v_all_errors_2 = evaluation.cal_error_batch(video_tag_probs, cap_tag_probs, options.measure_2) if options.space in ['hybrid']: w = 0.6 t2v_all_errors_1 = norm_score(t2v_all_errors_1) t2v_all_errors_2 = norm_score(t2v_all_errors_2) t2v_tag_all_errors = w * t2v_all_errors_1 + (1 - w) * t2v_all_errors_2 cal_perf(t2v_tag_all_errors, v2t_gt, t2v_gt) torch.save( { 'errors': t2v_tag_all_errors, 'videos': video_ids, 'captions': caption_ids }, pred_error_matrix_file) logging.info("write into: %s" % pred_error_matrix_file) elif options.space in ['latent']: cal_perf(t2v_all_errors_1, v2t_gt, t2v_gt) torch.save( { 'errors': t2v_all_errors_1, 'videos': video_ids, 'captions': caption_ids }, pred_error_matrix_file) logging.info("write into: %s" % pred_error_matrix_file)