def eval(model, crit, dataset, vocab, opt, model_path):
    model.eval()
    loader = DataLoader(dataset, batch_size=opt['batch_size'], shuffle=True)
    scorer = COCOScorer()
    with open(opt["input_json"]) as f:
        gt_dataframe = json_normalize(json.load(f)['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in loader:
        # unpack the batch
        image_feats = data['image_feats'].cuda()
        audio_mfcc = data['audio_mfcc'].cuda()
        video_ids = data['video_ids']
        # run the model in inference mode to generate a caption for each video
        with torch.no_grad():
            seq_probs, seq_preds = model(image_feats, audio_mfcc, mode='inference', opt=opt)

        sents = NLUtils.decode_sequence(vocab, seq_preds)

        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]

    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)

    if not os.path.exists(opt["results_path"]):
        os.makedirs(opt["results_path"])

    validation_file_name = opt['model_directory'].split('/')[-1] + '_val_score.txt'
    with open(os.path.join(opt["results_path"], validation_file_name), 'a') as scores_table:
        scores_table.write(model_path.split('/')[-1] + ': ' + json.dumps(results[0]) + "\n")
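All of these snippets lean on a convert_data_to_coco_scorer_format helper that is not part of this listing. A minimal sketch of what it plausibly does, assuming the ground-truth dataframe built from the 'sentences' field carries 'video_id' and 'caption' columns (both assumptions inferred from how gts is consumed above):

def convert_data_to_coco_scorer_format(data_frame):
    # group reference captions by video id in the layout COCOScorer.score expects
    gts = {}
    for caption, video_id in zip(data_frame['caption'], data_frame['video_id']):
        refs = gts.setdefault(video_id, [])
        refs.append({'image_id': video_id,
                     'cap_id': len(refs),
                     'caption': caption})
    return gts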
Example #2
def eval(model, crit, loader, vocab, opt):
    model.eval()
    scorer = COCOScorer()
    with open(opt['input_json']) as ip_json:
        gt_dataframe = json_normalize(json.load(ip_json)['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in tqdm(loader):
        # unpack the batch
        video_ids = data['video_ids']
        audio_fc2 = data['audio_fc2'].cuda()
        video_feat = data['video_feat'].cuda()

        # run the model in inference mode to generate a caption for each video
        with torch.no_grad():
            seq_probs, seq_preds = model(audio_fc2,
                                         video_feat,
                                         mode='inference',
                                         opt=opt)

        sents = NLUtils.decode_sequence(vocab, seq_preds)

        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]

    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)

    return valid_score
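Every example wraps scorer.score in suppress_stdout_stderr because the pycocoevalcap scorers print verbose tokenizer output. The helper itself is not shown; a common implementation (an assumption, since only the name appears here) silences output at the file-descriptor level so that even subprocess chatter is suppressed:

import os

class suppress_stdout_stderr(object):
    # redirect fds 1 and 2 to /dev/null for the duration of the block
    def __enter__(self):
        self.null_fds = [os.open(os.devnull, os.O_RDWR) for _ in range(2)]
        self.saved_fds = [os.dup(1), os.dup(2)]
        os.dup2(self.null_fds[0], 1)
        os.dup2(self.null_fds[1], 2)
        return self

    def __exit__(self, *args):
        # restore the original descriptors and release the duplicates
        os.dup2(self.saved_fds[0], 1)
        os.dup2(self.saved_fds[1], 2)
        for fd in self.null_fds + self.saved_fds:
            os.close(fd)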
Example #3
def eval(model, crit, loader, vocab, opt):
    model.eval()
    '''
    if opt['beam']:
        bs = 1
    else:
        bs = opt['batch_size']
    loader = DataLoader(dataset, batch_size=bs, shuffle=True)
    '''
    scorer = COCOScorer()
    with open(opt["input_json"]) as f:
        gt_dataframe = json_normalize(json.load(f)['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in tqdm(loader):
        # unpack the batch
        video_ids = data['video_ids']
        audio_conv4 = data['audio_conv4'].cuda()
        audio_fc2 = data['audio_fc2'].cuda()
        sem_feats = data['sem_feats'].cuda()

        # run the model in inference mode to generate a caption for each video
        with torch.no_grad():
            seq_probs, seq_preds = model(audio_conv4,
                                         audio_fc2,
                                         sem_feats,
                                         mode='inference',
                                         opt=opt)

        sents = NLUtils.decode_sequence(vocab, seq_preds)

        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]

    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)

    if not os.path.exists(opt["results_path"]):
        os.makedirs(opt["results_path"])
    '''
    with open(os.path.join(opt["results_path"], "scores.txt"), 'a') as scores_table:
        scores_table.write(json.dumps(results[0]) + "\n")
    with open(os.path.join(opt["results_path"],
                           'vanilla' + ".json"), 'w') as prediction_results:
        json.dump({"predictions": samples, "scores": valid_score},
                  prediction_results)
    '''
    return valid_score
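Examples #1 through #3 all call NLUtils.decode_sequence to turn sampled token ids back into sentences. A minimal sketch under two assumptions (the real NLUtils is not in this listing): vocab maps token-id strings to words, and id 0 is the <eos>/padding token.

def decode_sequence(vocab, seq):
    # seq: LongTensor of shape (batch, max_len) holding sampled token ids
    sentences = []
    for row in seq:
        words = []
        for ix in row:
            ix = int(ix)
            if ix == 0:  # assumed <eos>/pad id; stop decoding this caption
                break
            words.append(vocab[str(ix)])
        sentences.append(' '.join(words))
    return sentences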
Example #4
def evaluate(opt, net, eval_range, prediction_txt_path, reference):
    eval_loader = get_eval_loader(eval_range, opt.feature_h5_path,
                                  opt.region_feature_h5_path,
                                  opt.test_batch_size)

    result = {}
    for i, (frames, regions, spatials,
            video_ids) in tqdm(enumerate(eval_loader)):
        frames = frames.to(DEVICE)
        regions = regions.to(DEVICE)
        spatials = spatials.to(DEVICE)

        outputs, _ = net(frames, regions, spatials, None)
        for (tokens, vid) in zip(outputs, video_ids):
            if opt.use_multi_gpu:
                s = net.module.decoder.decode_tokens(tokens.data)
            else:
                s = net.decoder.decode_tokens(tokens.data)
            result[vid] = s

    with open(prediction_txt_path, 'w') as f:
        for vid, s in result.items():
            f.write('%d\t%s\n' % (vid, s))

    prediction_json = convert_prediction(prediction_txt_path)

    # compute scores
    scorer = COCOScorer()
    with suppress_stdout_stderr():
        scores, sub_category_score = scorer.score(reference, prediction_json,
                                                  prediction_json.keys())
    for metric, score in scores.items():
        print('%s: %.6f' % (metric, score * 100))

    if sub_category_score is not None:
        print('Sub Category Score in Spice:')
        for category, score in sub_category_score.items():
            print('%s: %.6f' % (category, score * 100))
    return scores
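convert_prediction is another helper that is not shown. Given the 'video_id<TAB>caption' file written just above, a plausible sketch that rebuilds the dict COCOScorer expects (the format is inferred from this example, not taken from the original source):

def convert_prediction(prediction_txt_path):
    # parse 'video_id<TAB>caption' lines back into COCO-eval layout
    prediction_json = {}
    with open(prediction_txt_path) as f:
        for line in f:
            vid, caption = line.rstrip('\n').split('\t', 1)
            prediction_json[vid] = [{'image_id': vid, 'caption': caption}]
    return prediction_json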
Example #5
def test(saved_model=''):
    scorer = COCOScorer()
    ixtoword = pd.Series(np.load(cfg.vocab_path + 'ixtoword.npy').tolist())
    combine_features = load_flickr30k_features if cfg.id == "Flickr30k" else load_msr_vtt_features

    model = s2vt(dim_image=cfg.dim_image,
                 n_words=len(ixtoword),
                 dim_hidden=cfg.dim_hidden,
                 batch_size=cfg.batch_size,
                 n_frame_steps=cfg.n_frame_step,
                 n_lstm_steps=cfg.n_lstm_step,
                 dim_word_emb=cfg.dim_word_emb,
                 cell_clip=cfg.cell_clip,
                 forget_bias=cfg.forget_bias,
                 input_keep_prob=cfg.input_keep_prob,
                 output_keep_prob=cfg.output_keep_prob,
                 bias_init_vector=None)

    _, video_tf, caption_tf, _, _ = model.build_model("inference")
    session = tf.InteractiveSession(config=tf.ConfigProto(
        gpu_options=gpu_options))
    saver = tf.train.Saver()
    saver.restore(session, saved_model)

    if cfg.id == "Flickr30k":
        _, _, test_data = get_flickr30k_data(cfg)
    elif cfg.id == "MSR-VTT":
        _, _, test_data = get_msr_vtt_data(cfg)

    splits = [(test_data['video_path'].unique(), test_data)]
    results = []
    for split, gt_dataframe in splits:
        gts = convert_data_to_coco_scorer_format(gt_dataframe)
        samples = {}
        for start, end in zip(
                range(0, len(split), cfg.batch_size),
                range(cfg.batch_size,
                      len(split) + cfg.batch_size, cfg.batch_size)):

            current_batch = split[start:end]
            current_feats = np.zeros(
                (cfg.batch_size, cfg.n_frame_step, cfg.dim_image))
            current_feats_vals = [
                combine_features(vid) for vid in current_batch
            ]

            for ind, feat in enumerate(current_feats_vals):
                current_feats[ind][:len(current_feats_vals[ind])] = feat

            generated_word_index = session.run(
                caption_tf, feed_dict={video_tf: current_feats})
            generated_word_index = np.asarray(generated_word_index).transpose()
            # index one past the first end-of-sentence token (id 0) in each row
            eos_found = (generated_word_index == 0).any(axis=1)
            periods = np.argmax(generated_word_index == 0, axis=1) + 1
            # take the whole sequence if no end-of-sentence token was produced
            periods[~eos_found] = cfg.n_lstm_step
            for i in range(len(current_batch)):
                generated_sentence = ' '.join(
                    ixtoword[generated_word_index[i, :periods[i] - 1]])
                video_id = current_batch[i].split("/")[-1].split("_")[0]
                samples[video_id] = [{
                    u'image_id': video_id,
                    u'caption': generated_sentence
                }]

        with suppress_stdout_stderr():
            valid_score = scorer.score(gts, samples, samples.keys())
        results.append(valid_score)
        print(valid_score)

    print(len(samples))
    if not os.path.exists(cfg.results_path):
        os.makedirs(cfg.results_path)

    with open(os.path.join(cfg.results_path, "scores.txt"), 'a') as scores_table:
        scores_table.write(json.dumps(results[0]) + "\n")
    with open(os.path.join(cfg.results_path, saved_model.split("/")[-1] + ".json"),
              'w') as prediction_results:
        json.dump({
            "predictions": samples,
            "scores": valid_score
        }, prediction_results)
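This last example is TensorFlow 1.x code and references a module-level gpu_options that is defined elsewhere. A minimal assumed setup (allow_growth stops the session from reserving all GPU memory up front):

import tensorflow as tf

# assumed module-level config consumed by test() above
gpu_options = tf.GPUOptions(allow_growth=True)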