Esempio n. 1
0
def validate(opt, tb_logger, vid_data_loader, text_data_loader, model, measure='cosine'):
    """Score the model on the validation videos and captions.

    Videos and captions are embedded with ``model.embed_vis`` and
    ``model.embed_txt``, an error matrix is built from the two embedding sets
    with ``measure``, and ``cal_perf`` evaluates it against the ground truth
    returned by ``metrics.get_gt``.

    The returned score depends on ``opt.val_metric`` and ``opt.direction``:

    * ``"recall"``: sum of the r1, r5 and r10 values of each selected direction;
    * ``"map"``: sum of the mAP scores of each selected direction;
    * anything else: the score stays 0.

    ``opt.direction`` selects video-to-text (``'i2t'``), text-to-video
    (``'t2i'``) or both (``'all'``).  The score is logged to ``tb_logger``
    under ``'rsum'`` at step ``model.Eiters`` and then returned.
    """
    # Embed every validation video and caption.
    vid_vectors, vid_names = evaluation.encode_text_or_vid(model.embed_vis, vid_data_loader)
    txt_vectors, txt_names = evaluation.encode_text_or_vid(model.embed_txt, text_data_loader)

    error_matrix = evaluation.cal_error(vid_vectors, txt_vectors, measure)
    v2t_gt, t2v_gt = metrics.get_gt(vid_names, txt_names)

    v2t_perf, t2v_perf = cal_perf(error_matrix, v2t_gt, t2v_gt, tb_logger=tb_logger, model=model)
    # Median and mean rank are returned by cal_perf but do not enter the score.
    v2t_r1, v2t_r5, v2t_r10, _v2t_medr, _v2t_meanr, v2t_map_score = v2t_perf
    t2v_r1, t2v_r5, t2v_r10, _t2v_medr, _t2v_meanr, t2v_map_score = t2v_perf

    score = 0
    metric = opt.val_metric
    if metric == "recall":
        direction = opt.direction
        if direction in ('i2t', 'all'):
            score += (v2t_r1 + v2t_r5 + v2t_r10)
        if direction in ('t2i', 'all'):
            score += (t2v_r1 + t2v_r5 + t2v_r10)
    elif metric == "map":
        direction = opt.direction
        if direction in ('i2t', 'all'):
            score += v2t_map_score
        if direction in ('t2i', 'all'):
            score += t2v_map_score

    tb_logger.log_value('rsum', score, step=model.Eiters)

    return score
Esempio n. 2
0
def validate_split(opt,
                   vid_data_loader,
                   text_data_loader,
                   model,
                   measure='cosine'):
    """Evaluate the model on separately loaded videos and captions.

    The model is switched to validation mode, both modalities are embedded and
    an error matrix is computed with ``measure``.  Depending on
    ``opt.val_metric`` the function then:

    * ``"recall"``: computes text-to-video and video-to-text recall and rank
      statistics (the ``*_varied`` evaluators are used when
      ``opt.testCollection`` starts with ``'msvd'``), prints them and logs
      them to tensorboard;
    * ``"map"``: computes, logs and prints the mAP of both directions.

    The returned score sums the r1/r5/r10 values (or the mAP scores) of the
    directions selected by ``opt.direction`` (``'i2t'``, ``'t2i'`` or
    ``'all'``); it is 0 for any other ``opt.val_metric``.  The score is also
    logged under ``'rsum'``.

    Note: ``tb_logger`` is not a parameter of this function; it is resolved
    from the enclosing module scope.
    """

    def show_ranks(title, recalls, ranks):
        # Print one retrieval direction in the fixed report layout.
        print(title)
        print(" * r_1_5_10: {}".format([round(value, 3) for value in recalls]))
        print(" * medr, meanr: {}".format([round(value, 3) for value in ranks]))
        print(" * " + '-' * 10)

    model.val_start()

    # Embed every validation video and caption.
    vid_vectors, vid_names = evaluation.encode_text_or_vid(
        model.embed_vis, vid_data_loader)
    txt_vectors, txt_names = evaluation.encode_text_or_vid(
        model.embed_txt, text_data_loader)

    error_matrix = evaluation.cal_error(vid_vectors, txt_vectors, measure)

    score = 0
    if opt.val_metric == "recall":
        varied = opt.testCollection.startswith('msvd')

        # video retrieval
        if varied:
            r1i, r5i, r10i, medri, meanri, _ = evaluation.t2i_varied(
                error_matrix, txt_names, vid_names)
        else:
            r1i, r5i, r10i, medri, meanri = evaluation.t2i(
                error_matrix, n_caption=opt.n_caption)
        show_ranks(" * Text to video:", (r1i, r5i, r10i), (medri, meanri))

        # caption retrieval
        if varied:
            r1, r5, r10, medr, meanr, _ = evaluation.i2t_varied(
                error_matrix, txt_names, vid_names)
        else:
            r1, r5, r10, medr, meanr = evaluation.i2t(
                error_matrix, n_caption=opt.n_caption)
        show_ranks(" * Video to text:", (r1, r5, r10), (medr, meanr))

        # record metrics in tensorboard
        scalars = (
            ('r1', r1),
            ('r5', r5),
            ('r10', r10),
            ('medr', medr),
            ('meanr', meanr),
            ('r1i', r1i),
            ('r5i', r5i),
            ('r10i', r10i),
            ('medri', medri),
            ('meanri', meanri),
        )
        for tag, value in scalars:
            tb_logger.log_value(tag, value, step=model.Eiters)

        if opt.direction in ('i2t', 'all'):
            score += (r1 + r5 + r10)
        if opt.direction in ('t2i', 'all'):
            score += (r1i + r5i + r10i)

    elif opt.val_metric == "map":
        i2t_map_score = evaluation.i2t_map(error_matrix,
                                           n_caption=opt.n_caption)
        t2i_map_score = evaluation.t2i_map(error_matrix,
                                           n_caption=opt.n_caption)
        tb_logger.log_value('i2t_map', i2t_map_score, step=model.Eiters)
        tb_logger.log_value('t2i_map', t2i_map_score, step=model.Eiters)
        print('i2t_map', i2t_map_score)
        print('t2i_map', t2i_map_score)

        if opt.direction in ('i2t', 'all'):
            score += i2t_map_score
        if opt.direction in ('t2i', 'all'):
            score += t2i_map_score

    tb_logger.log_value('rsum', score, step=model.Eiters)

    return score
Esempio n. 3
0
def main():
    """Evaluate a saved checkpoint on a test collection and print the metrics.

    Steps, in order:

    1. Parse the command-line options and load the checkpoint found at
       ``opt.logger_name/opt.checkpoint_name``; exit with status 0 if it does
       not exist.
    2. Derive the output directory from the checkpoint path and exit with
       status 0 if ``checkToSkip`` reports that the error-matrix file should
       not be rewritten.
    3. Load the bag-of-words and rnn vocabularies of the training collection,
       rebuild the model from the checkpoint and switch it to validation mode.
    4. Embed the test videos and captions, compute the error matrix and save
       it (with the video and caption ids) to ``pred_errors_matrix.pth.tar``.
    5. Print recall, median/mean rank, recall sum and mAP for text-to-video
       and video-to-text retrieval.

    Collections whose name starts with ``'msvd'`` use separate video/text
    loaders and the ``*_varied`` evaluators; all others use the joint test
    loader and assume ``n_caption`` captions per video.
    """
    opt = parse_args()
    print(json.dumps(vars(opt), indent=2))

    rootpath = opt.rootpath
    testCollection = opt.testCollection
    n_caption = opt.n_caption
    resume = os.path.join(opt.logger_name, opt.checkpoint_name)

    if not os.path.exists(resume):
        logging.info(resume + ' not exists.')
        sys.exit(0)

    checkpoint = torch.load(resume)
    start_epoch = checkpoint['epoch']
    best_rsum = checkpoint['best_rsum']
    print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
        resume, start_epoch, best_rsum))
    options = checkpoint['opt']
    # Checkpoints saved without a 'concate' option get the value "full".
    if not hasattr(options, 'concate'):
        setattr(options, "concate", "full")

    trainCollection = options.trainCollection
    output_dir = resume.replace(trainCollection, testCollection)
    output_dir = output_dir.replace('/%s/' % options.cv_name,
                                    '/results/%s/' % trainCollection)
    pred_error_matrix_file = os.path.join(output_dir,
                                          'pred_errors_matrix.pth.tar')
    if checkToSkip(pred_error_matrix_file, opt.overwrite):
        sys.exit(0)
    makedirsforfile(pred_error_matrix_file)

    # data loader prepare
    caption_files = {
        'test':
        os.path.join(rootpath, testCollection, 'TextData',
                     '%s.caption.txt' % testCollection)
    }
    img_feat_path = os.path.join(rootpath, testCollection, 'FeatureData',
                                 options.visual_feature)
    visual_feats = {'test': BigFile(img_feat_path)}
    assert options.visual_feat_dim == visual_feats['test'].ndims
    video2frames = {
        'test':
        read_dict(
            os.path.join(rootpath, testCollection, 'FeatureData',
                         options.visual_feature, 'video2frames.txt'))
    }

    # NOTE: pickle.load can execute arbitrary code stored in the file; the
    # vocabulary files below must come from a trusted source.

    # set bow vocabulary and encoding
    bow_vocab_file = os.path.join(rootpath, options.trainCollection,
                                  'TextData', 'vocabulary', 'bow',
                                  options.vocab + '.pkl')
    with open(bow_vocab_file, 'rb') as vocab_fh:
        bow_vocab = pickle.load(vocab_fh)
    bow2vec = get_text_encoder('bow')(bow_vocab)
    options.bow_vocab_size = len(bow_vocab)

    # set rnn vocabulary
    rnn_vocab_file = os.path.join(rootpath, options.trainCollection,
                                  'TextData', 'vocabulary', 'rnn',
                                  options.vocab + '.pkl')
    with open(rnn_vocab_file, 'rb') as vocab_fh:
        rnn_vocab = pickle.load(vocab_fh)
    options.vocab_size = len(rnn_vocab)

    # Construct the model
    model = get_model(options.model)(options)
    model.load_state_dict(checkpoint['model'])
    model.Eiters = checkpoint['Eiters']
    model.val_start()

    # The same prefix test selects both the loaders and the evaluators.
    is_varied = testCollection.startswith(
        'msvd')  # or testCollection.startswith('msrvtt'):

    if is_varied:
        # set data loader
        video_ids_list = data.read_video_ids(caption_files['test'])
        vid_data_loader = data.get_vis_data_loader(visual_feats['test'],
                                                   opt.batch_size,
                                                   opt.workers,
                                                   video2frames['test'],
                                                   video_ids=video_ids_list)
        text_data_loader = data.get_txt_data_loader(caption_files['test'],
                                                    rnn_vocab, bow2vec,
                                                    opt.batch_size,
                                                    opt.workers)
        # mapping
        video_embs, video_ids = evaluation.encode_text_or_vid(
            model.embed_vis, vid_data_loader)
        cap_embs, caption_ids = evaluation.encode_text_or_vid(
            model.embed_txt, text_data_loader)
    else:
        # set data loader
        data_loader = data.get_test_data_loaders(caption_files,
                                                 visual_feats,
                                                 rnn_vocab,
                                                 bow2vec,
                                                 opt.batch_size,
                                                 opt.workers,
                                                 n_caption,
                                                 video2frames=video2frames)
        # mapping
        video_embs, cap_embs, video_ids, caption_ids = evaluation.encode_data(
            model, data_loader['test'], opt.log_step, logging.info)
        # remove duplicate videos: keep every n_caption-th row and id
        idx = range(0, video_embs.shape[0], n_caption)
        video_embs = video_embs[idx, :]
        video_ids = video_ids[::n_caption]

    c2i_all_errors = evaluation.cal_error(video_embs, cap_embs,
                                          options.measure)
    torch.save(
        {
            'errors': c2i_all_errors,
            'videos': video_ids,
            'captions': caption_ids
        }, pred_error_matrix_file)
    print("write into: %s" % pred_error_matrix_file)

    if is_varied:
        # caption retrieval (video to text)
        (r1, r5, r10, medr, meanr,
         i2t_map_score) = evaluation.i2t_varied(c2i_all_errors, caption_ids,
                                                video_ids)
        # video retrieval (text to video)
        (r1i, r5i, r10i, medri, meanri,
         t2i_map_score) = evaluation.t2i_varied(c2i_all_errors, caption_ids,
                                                video_ids)
    else:
        # video retrieval (text to video)
        (r1i, r5i, r10i, medri, meanri) = evaluation.t2i(c2i_all_errors,
                                                         n_caption=n_caption)
        t2i_map_score = evaluation.t2i_map(c2i_all_errors, n_caption=n_caption)

        # caption retrieval (video to text)
        (r1, r5, r10, medr, meanr) = evaluation.i2t(c2i_all_errors,
                                                    n_caption=n_caption)
        i2t_map_score = evaluation.i2t_map(c2i_all_errors, n_caption=n_caption)

    # video retrieval
    print(" * Text to Video:")
    print(" * r_1_5_10, medr, meanr: {}".format([
        round(r1i, 1),
        round(r5i, 1),
        round(r10i, 1),
        round(medri, 1),
        round(meanri, 1)
    ]))
    print(" * recall sum: {}".format(round(r1i + r5i + r10i, 1)))
    print(" * mAP: {}".format(round(t2i_map_score, 3)))
    print(" * " + '-' * 10)

    # caption retrieval
    print(" * Video to text:")
    print(" * r_1_5_10, medr, meanr: {}".format([
        round(r1, 1),
        round(r5, 1),
        round(r10, 1),
        round(medr, 1),
        round(meanr, 1)
    ]))
    print(" * recall sum: {}".format(round(r1 + r5 + r10, 1)))
    print(" * mAP: {}".format(round(i2t_map_score, 3)))
    print(" * " + '-' * 10)
Esempio n. 4
0
def main():
    """Evaluate a saved checkpoint on a test collection (latent/concept/hybrid).

    Steps, in order:

    1. Parse the command-line options and load the checkpoint found at
       ``opt.logger_name/opt.checkpoint_name``; exit with status 0 if it does
       not exist.
    2. Derive the output directory from the checkpoint path, exit with status
       0 if ``checkToSkip`` says the error-matrix file should not be
       rewritten, and configure logging into that directory.
    3. Locate the test caption file according to ``opt.collectionStrt``
       (``'single'`` or ``'multiple'``).
    4. Load the vocabularies of the training collection, rebuild the model and
       embed the test videos and captions.
    5. Depending on ``options.space`` compute the latent and/or concept error
       matrices, evaluate with ``cal_perf`` and save the final matrix to
       ``pred_errors_matrix.pth.tar``.

    Raises:
        NotImplementedError: if ``opt.collectionStrt`` is neither ``'single'``
            nor ``'multiple'``.
    """
    opt = parse_args()
    print(json.dumps(vars(opt), indent=2))

    rootpath = opt.rootpath
    collectionStrt = opt.collectionStrt
    resume = os.path.join(opt.logger_name, opt.checkpoint_name)

    if not os.path.exists(resume):
        logging.info(resume + ' not exists.')
        sys.exit(0)

    checkpoint = torch.load(resume)
    start_epoch = checkpoint['epoch']
    best_rsum = checkpoint['best_rsum']
    print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
        resume, start_epoch, best_rsum))
    options = checkpoint['opt']

    # collection setting
    testCollection = opt.testCollection
    collections_pathname = options.collections_pathname
    collections_pathname['test'] = testCollection

    trainCollection = options.trainCollection
    output_dir = resume.replace(trainCollection, testCollection)
    if 'checkpoints' in output_dir:
        output_dir = output_dir.replace('/checkpoints/', '/results/')
    else:
        output_dir = output_dir.replace(
            '/%s/' % options.cv_name,
            '/results/%s/%s/' % (options.cv_name, trainCollection))
    pred_error_matrix_file = os.path.join(output_dir,
                                          'pred_errors_matrix.pth.tar')
    if checkToSkip(pred_error_matrix_file, opt.overwrite):
        sys.exit(0)
    makedirsforfile(pred_error_matrix_file)

    log_config(output_dir)
    logging.info(json.dumps(vars(opt), indent=2))

    # data loader prepare: only the caption file name depends on the
    # collection structure, the directory is the same in both cases.
    if collectionStrt == 'single':
        caption_name = '%s%s.caption.txt' % (testCollection, opt.split)
    elif collectionStrt == 'multiple':
        caption_name = '%s.caption.txt' % testCollection
    else:
        raise NotImplementedError('collection structure %s not implemented' %
                                  collectionStrt)
    test_cap = os.path.join(rootpath, collections_pathname['test'], 'TextData',
                            caption_name)

    caption_files = {'test': test_cap}
    img_feat_path = os.path.join(rootpath, collections_pathname['test'],
                                 'FeatureData', options.visual_feature)
    visual_feats = {'test': BigFile(img_feat_path)}
    assert options.visual_feat_dim == visual_feats['test'].ndims
    video2frames = {
        'test':
        read_dict(
            os.path.join(rootpath, collections_pathname['test'], 'FeatureData',
                         options.visual_feature, 'video2frames.txt'))
    }

    # NOTE: pickle.load can execute arbitrary code stored in the file; the
    # vocabulary files below must come from a trusted source.

    # set bow vocabulary and encoding
    bow_vocab_file = os.path.join(rootpath, collections_pathname['train'],
                                  'TextData', 'vocabulary', 'bow',
                                  options.vocab + '.pkl')
    with open(bow_vocab_file, 'rb') as vocab_fh:
        bow_vocab = pickle.load(vocab_fh)
    bow2vec = get_text_encoder('bow')(bow_vocab)
    options.bow_vocab_size = len(bow_vocab)

    # set rnn vocabulary
    rnn_vocab_file = os.path.join(rootpath, collections_pathname['train'],
                                  'TextData', 'vocabulary', 'rnn',
                                  options.vocab + '.pkl')
    with open(rnn_vocab_file, 'rb') as vocab_fh:
        rnn_vocab = pickle.load(vocab_fh)
    options.vocab_size = len(rnn_vocab)

    # Construct the model
    model = get_model(options.model)(options)
    model.load_state_dict(checkpoint['model'])
    model.Eiters = checkpoint['Eiters']
    model.val_start()

    # set data loader
    video_ids_list = data.read_video_ids(caption_files['test'])
    vid_data_loader = data.get_vis_data_loader(visual_feats['test'],
                                               opt.batch_size,
                                               opt.workers,
                                               video2frames['test'],
                                               video_ids=video_ids_list)
    text_data_loader = data.get_txt_data_loader(caption_files['test'],
                                                rnn_vocab, bow2vec,
                                                opt.batch_size, opt.workers)

    # mapping
    if options.space == 'hybrid':
        video_embs, video_tag_probs, video_ids = evaluation.encode_text_or_vid_tag_hist_prob(
            model.embed_vis, vid_data_loader)
        cap_embs, cap_tag_probs, caption_ids = evaluation.encode_text_or_vid_tag_hist_prob(
            model.embed_txt, text_data_loader)
    else:
        video_embs, video_ids = evaluation.encode_text_or_vid(
            model.embed_vis, vid_data_loader)
        cap_embs, caption_ids = evaluation.encode_text_or_vid(
            model.embed_txt, text_data_loader)

    v2t_gt, t2v_gt = metrics.get_gt(video_ids, caption_ids)

    logging.info("write into: %s" % output_dir)
    # NOTE(review): the tag probabilities are only produced above when
    # options.space == 'hybrid'.  With options.space == 'concept' the block
    # below (and the concept-space block further down) reads
    # video_tag_probs / cap_tag_probs before they are assigned, and no
    # performance is computed or saved for 'concept' -- confirm whether that
    # mode is meant to be supported by this script.
    if options.space != 'latent':
        tag_vocab_path = os.path.join(
            rootpath, collections_pathname['train'], 'TextData', 'tags',
            'video_label_th_1', 'tag_vocab_%d.json' % options.tag_vocab_size)
        evaluation.pred_tag(video_tag_probs, video_ids, tag_vocab_path,
                            os.path.join(output_dir, 'video'))
        evaluation.pred_tag(cap_tag_probs, caption_ids, tag_vocab_path,
                            os.path.join(output_dir, 'text'))

    if options.space in ['latent', 'hybrid']:
        # latent space
        t2v_all_errors_1 = evaluation.cal_error(video_embs, cap_embs,
                                                options.measure)

    if options.space in ['concept', 'hybrid']:
        # concept space
        t2v_all_errors_2 = evaluation.cal_error_batch(video_tag_probs,
                                                      cap_tag_probs,
                                                      options.measure_2)

    # Select the matrix that is evaluated and saved.
    final_errors = None
    if options.space == 'hybrid':
        # Weighted sum of the two normalised matrices; w weights the latent
        # space and (1 - w) the concept space.
        w = 0.6
        t2v_all_errors_1 = norm_score(t2v_all_errors_1)
        t2v_all_errors_2 = norm_score(t2v_all_errors_2)
        final_errors = w * t2v_all_errors_1 + (1 - w) * t2v_all_errors_2
    elif options.space == 'latent':
        final_errors = t2v_all_errors_1

    if final_errors is not None:
        cal_perf(final_errors, v2t_gt, t2v_gt)
        torch.save(
            {
                'errors': final_errors,
                'videos': video_ids,
                'captions': caption_ids
            }, pred_error_matrix_file)
        logging.info("write into: %s" % pred_error_matrix_file)
Esempio n. 5
0
def main():
    """Rank the videos of a test collection for one or more query sets.

    Steps, in order:

    1. Parse the command-line options and load the checkpoint found at
       ``opt.logger_name/opt.checkpoint_name``; exit with status 0 if it does
       not exist.
    2. Open the visual features of the test collection, load the vocabularies
       of the training collection and rebuild the model in validation mode.
    3. For every query set named in the comma-separated ``opt.query_sets``:
       derive the result file path, embed the queries, compute the
       query/video similarity matrix and pass it to ``eval_avs``.  With
       ``options.space == 'hybrid'`` the embedding and tag-probability
       similarities are normalised and combined with a fixed weight.

    The video collection is encoded once, on the first query set, and the
    result is reused for the remaining query sets.
    """
    opt = parse_args()
    logging.info(json.dumps(vars(opt), indent=2))

    rootpath = opt.rootpath
    testCollection = opt.testCollection
    # NOTE(review): ``collectionStrt`` is never assigned in this function, so
    # it is looked up in module scope; this raises NameError unless the module
    # defines it.  Possibly ``opt.collectionStrt`` was intended -- confirm.
    assert collectionStrt == "multiple"
    resume = os.path.join(opt.logger_name, opt.checkpoint_name)

    if not os.path.exists(resume):
        logging.info(resume + ' not exists.')
        sys.exit(0)

    checkpoint = torch.load(resume)
    start_epoch = checkpoint['epoch']
    best_rsum = checkpoint['best_rsum']
    logging.info("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
        resume, start_epoch, best_rsum))
    options = checkpoint['opt']

    trainCollection = options.trainCollection
    valCollection = options.valCollection

    visual_feat_file = BigFile(
        os.path.join(rootpath, testCollection, 'FeatureData',
                     options.visual_feature))
    assert options.visual_feat_dim == visual_feat_file.ndims
    video2frame = read_dict(
        os.path.join(rootpath, testCollection, 'FeatureData',
                     options.visual_feature, 'video2frames.txt'))
    vid_data_loader = data.get_vis_data_loader(visual_feat_file,
                                               opt.batch_size, opt.workers,
                                               video2frame)
    # Video encodings are filled in on the first query set and then reused.
    video_embs = None
    video_tag_probs = None
    video_ids = None

    # NOTE: pickle.load can execute arbitrary code stored in the file; the
    # vocabulary files below must come from a trusted source.

    # set bow vocabulary and encoding
    bow_vocab_file = os.path.join(rootpath, options.trainCollection,
                                  'TextData', 'vocabulary', 'bow',
                                  options.vocab + '.pkl')
    with open(bow_vocab_file, 'rb') as vocab_fh:
        bow_vocab = pickle.load(vocab_fh)
    bow2vec = get_text_encoder('bow')(bow_vocab)
    options.bow_vocab_size = len(bow_vocab)

    # set rnn vocabulary
    rnn_vocab_file = os.path.join(rootpath, options.trainCollection,
                                  'TextData', 'vocabulary', 'rnn',
                                  options.vocab + '.pkl')
    with open(rnn_vocab_file, 'rb') as vocab_fh:
        rnn_vocab = pickle.load(vocab_fh)
    options.vocab_size = len(rnn_vocab)

    model = get_model(options.model)(options)
    model.load_state_dict(checkpoint['model'])
    model.val_start()

    output_dir = resume.replace(trainCollection, testCollection)
    for query_set in opt.query_sets.strip().split(','):
        output_dir_tmp = output_dir.replace(
            valCollection,
            '%s/%s/%s' % (query_set, trainCollection, valCollection))
        output_dir_tmp = output_dir_tmp.replace('/%s/' % options.cv_name,
                                                '/results/')
        pred_result_file = os.path.join(output_dir_tmp, 'id.sent.score.txt')
        logging.info(pred_result_file)
        # NOTE(review): this exits the whole program, so any remaining query
        # sets are not processed once one is skipped -- confirm whether
        # ``continue`` was intended.
        if checkToSkip(pred_result_file, opt.overwrite):
            sys.exit(0)
        makedirsforfile(pred_result_file)

        # query data loader
        query_file = os.path.join(rootpath, testCollection, 'TextData',
                                  query_set)
        query_loader = data.get_txt_data_loader(query_file, rnn_vocab, bow2vec,
                                                opt.batch_size, opt.workers)

        # encode videos (only once; later query sets reuse the result)
        if video_embs is None:
            start = time.time()
            if options.space == 'hybrid':
                video_embs, video_tag_probs, video_ids = evaluation.encode_text_or_vid_tag_hist_prob(
                    model.embed_vis, vid_data_loader)
            else:
                video_embs, video_ids = evaluation.encode_text_or_vid(
                    model.embed_vis, vid_data_loader)
            logging.info("encode video time: %.3f s" % (time.time() - start))

        # encode text
        start = time.time()
        if options.space == 'hybrid':
            query_embs, query_tag_probs, query_ids = evaluation.encode_text_or_vid_tag_hist_prob(
                model.embed_txt, query_loader)
        else:
            query_embs, query_ids = evaluation.encode_text_or_vid(
                model.embed_txt, query_loader)
        logging.info("encode text time: %.3f s" % (time.time() - start))

        if options.space == 'hybrid':
            # Similarity from the embeddings and from the tag probabilities,
            # each normalised before being combined.
            t2v_matrix_1 = evaluation.cal_simi(query_embs, video_embs)
            t2v_matrix_2 = evaluation.cal_simi(query_tag_probs,
                                               video_tag_probs)

            t2v_matrix_1 = norm_score(t2v_matrix_1)
            t2v_matrix_2 = norm_score(t2v_matrix_2)
            # w weights the embedding similarity, (1 - w) the tag similarity;
            # the weight is part of the result file name.
            for w in [0.8]:
                print("\n")
                t2v_matrix = w * t2v_matrix_1 + (1 - w) * t2v_matrix_2
                pred_result_file = os.path.join(output_dir_tmp,
                                                'id.sent.score_%.1f.txt' % w)
                eval_avs(t2v_matrix, query_ids, video_ids, pred_result_file,
                         rootpath, testCollection, query_set)
        else:
            t2v_matrix_1 = evaluation.cal_simi(query_embs, video_embs)
            eval_avs(t2v_matrix_1, query_ids, video_ids, pred_result_file,
                     rootpath, testCollection, query_set)