# Evaluation loop for an image-feature + audio-MFCC captioning model; relies on
# torch, DataLoader, COCOScorer, json_normalize, NLUtils, and suppress_stdout_stderr
# from the surrounding project.
def eval(model, crit, dataset, vocab, opt, model_path):
    model.eval()
    loader = DataLoader(dataset, batch_size=opt['batch_size'], shuffle=True)
    scorer = COCOScorer()
    with open(opt["input_json"]) as f:
        gt_dataframe = json_normalize(json.load(f)['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in loader:
        # unpack the batch and move the features to the GPU
        image_feats = data['image_feats'].cuda()
        audio_mfcc = data['audio_mfcc'].cuda()
        video_ids = data['video_ids']
        # run the model in inference mode to sample a caption per video
        with torch.no_grad():
            seq_probs, seq_preds = model(image_feats, audio_mfcc,
                                         mode='inference', opt=opt)
        sents = NLUtils.decode_sequence(vocab, seq_preds)
        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]
    # score all accumulated samples against the ground-truth references
    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)

    if not os.path.exists(opt["results_path"]):
        os.makedirs(opt["results_path"])
    validation_file_name = opt['model_directory'].split('/')[-1] + '_val_score.txt'
    with open(os.path.join(opt["results_path"], validation_file_name), 'a') as scores_table:
        scores_table.write(model_path.split('/')[-1] + ': ' + json.dumps(results[0]) + "\n")
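
# Hypothetical driver for the eval() above: score every checkpoint in a directory so
# each one gets a line in the *_val_score.txt table. The eval_all_checkpoints name,
# the '*.pth' glob, and the torch.load call are illustrative assumptions; model,
# crit, dataset, vocab, and opt come from the surrounding training code.
import glob

def eval_all_checkpoints(model, crit, dataset, vocab, opt):
    for model_path in sorted(glob.glob(os.path.join(opt['model_directory'], '*.pth'))):
        model.load_state_dict(torch.load(model_path))
        eval(model, crit, dataset, vocab, opt, model_path)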

# Variant of the evaluation loop for an audio-fc2 + video-feature model; returns
# the metric dict instead of writing a score table.
def eval(model, crit, loader, vocab, opt):
    model.eval()
    scorer = COCOScorer()
    with open(opt['input_json']) as ip_json:
        gt_dataframe = json_normalize(json.load(ip_json)['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in tqdm(loader):
        # unpack the batch and move the features to the GPU
        video_ids = data['video_ids']
        audio_fc2 = data['audio_fc2'].cuda()
        video_feat = data['video_feat'].cuda()
        # run the model in inference mode to sample a caption per video
        with torch.no_grad():
            seq_probs, seq_preds = model(audio_fc2, video_feat,
                                         mode='inference', opt=opt)
        sents = NLUtils.decode_sequence(vocab, seq_preds)
        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]
    # score all accumulated samples against the ground-truth references
    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)
    return valid_score
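
# All of these eval functions lean on convert_data_to_coco_scorer_format. A minimal
# sketch, assuming the normalized 'sentences' dataframe has 'caption' and 'video_id'
# columns (the MSR-VTT-style layout implied by the input_json handling above); it
# mirrors the {id: [{'image_id': ..., 'caption': ...}]} shape used for samples.
def convert_data_to_coco_scorer_format(data_frame):
    gts = {}
    for caption, video_id in zip(data_frame['caption'], data_frame['video_id']):
        refs = gts.setdefault(video_id, [])
        refs.append({'image_id': video_id, 'cap_id': len(refs), 'caption': caption})
    return gts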

# Variant with an extra audio_conv4 stream and semantic features; the commented-out
# blocks (beam-size-dependent DataLoader and results dumping) are kept from the
# original source.
def eval(model, crit, loader, vocab, opt):
    model.eval()
    '''
    if opt['beam']:
        bs = 1
    else:
        bs = opt['batch_size']
    loader = DataLoader(dataset, batch_size=bs, shuffle=True)
    '''
    scorer = COCOScorer()
    with open(opt["input_json"]) as f:
        gt_dataframe = json_normalize(json.load(f)['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in tqdm(loader):
        # unpack the batch and move the features to the GPU
        video_ids = data['video_ids']
        audio_conv4 = data['audio_conv4'].cuda()
        audio_fc2 = data['audio_fc2'].cuda()
        sem_feats = data['sem_feats'].cuda()
        # run the model in inference mode to sample a caption per video
        with torch.no_grad():
            seq_probs, seq_preds = model(audio_conv4, audio_fc2, sem_feats,
                                         mode='inference', opt=opt)
        sents = NLUtils.decode_sequence(vocab, seq_preds)
        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]
    # score all accumulated samples against the ground-truth references
    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)

    if not os.path.exists(opt["results_path"]):
        os.makedirs(opt["results_path"])
    '''
    with open(os.path.join(opt["results_path"], "scores.txt"), 'a') as scores_table:
        scores_table.write(json.dumps(results[0]) + "\n")
    with open(os.path.join(opt["results_path"], 'vanilla' + ".json"), 'w') as prediction_results:
        json.dump({"predictions": samples, "scores": valid_score}, prediction_results)
    '''
    return valid_score
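
# Every scoring call above runs inside suppress_stdout_stderr() to silence the
# chatty COCO metric tools. A minimal standard-library sketch of such a context
# manager (the project's own version may differ):
from contextlib import contextmanager

@contextmanager
def suppress_stdout_stderr():
    # temporarily point file descriptors 1 and 2 at /dev/null
    with open(os.devnull, 'w') as devnull:
        saved_out, saved_err = os.dup(1), os.dup(2)
        try:
            os.dup2(devnull.fileno(), 1)
            os.dup2(devnull.fileno(), 2)
            yield
        finally:
            os.dup2(saved_out, 1)
            os.dup2(saved_err, 2)
            os.close(saved_out)
            os.close(saved_err)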

# Evaluation for a frame + region-feature model; writes raw predictions to a text
# file, converts them to COCO format, and prints per-metric scores.
def evaluate(opt, net, eval_range, prediction_txt_path, reference):
    eval_loader = get_eval_loader(eval_range, opt.feature_h5_path,
                                  opt.region_feature_h5_path, opt.test_batch_size)

    result = {}
    for i, (frames, regions, spatials, video_ids) in tqdm(enumerate(eval_loader)):
        frames = frames.to(DEVICE)
        regions = regions.to(DEVICE)
        spatials = spatials.to(DEVICE)

        outputs, _ = net(frames, regions, spatials, None)
        for tokens, vid in zip(outputs, video_ids):
            # decode_tokens lives on the wrapped module under DataParallel
            if opt.use_multi_gpu:
                s = net.module.decoder.decode_tokens(tokens.data)
            else:
                s = net.decoder.decode_tokens(tokens.data)
            result[vid] = s

    # dump the raw predictions as "<video id>\t<caption>" lines
    with open(prediction_txt_path, 'w') as f:
        for vid, s in result.items():
            f.write('%d\t%s\n' % (vid, s))

    prediction_json = convert_prediction(prediction_txt_path)

    # compute scores
    scorer = COCOScorer()
    with suppress_stdout_stderr():
        scores, sub_category_score = scorer.score(reference, prediction_json,
                                                  prediction_json.keys())
    for metric, score in scores.items():
        print('%s: %.6f' % (metric, score * 100))
    if sub_category_score is not None:
        print('Sub Category Score in Spice:')
        for category, score in sub_category_score.items():
            print('%s: %.6f' % (category, score * 100))
    return scores
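
# The convert_prediction helper is not shown above; this is a plausible sketch that
# reads the "<video id>\t<caption>" lines evaluate() just wrote and rebuilds the
# dict-of-lists layout COCOScorer.score expects. The key type (str vs int) is an
# assumption and may need to match the reference dict.
def convert_prediction(prediction_txt_path):
    prediction_json = {}
    with open(prediction_txt_path) as f:
        for line in f:
            vid, caption = line.rstrip('\n').split('\t', 1)
            prediction_json[vid] = [{'image_id': vid, 'caption': caption}]
    return prediction_json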

# TensorFlow 1.x test loop for an S2VT model on Flickr30k / MSR-VTT; restores a
# checkpoint, decodes captions batch by batch, and scores them with COCOScorer.
def test(saved_model=''):
    scorer = COCOScorer()
    ixtoword = pd.Series(np.load(cfg.vocab_path + 'ixtoword.npy').tolist())
    combine_features = (load_flickr30k_features
                        if cfg.id == "Flickr30k" else load_msr_vtt_features)
    model = s2vt(dim_image=cfg.dim_image,
                 n_words=len(ixtoword),
                 dim_hidden=cfg.dim_hidden,
                 batch_size=cfg.batch_size,
                 n_frame_steps=cfg.n_frame_step,
                 n_lstm_steps=cfg.n_lstm_step,
                 dim_word_emb=cfg.dim_word_emb,
                 cell_clip=cfg.cell_clip,
                 forget_bias=cfg.forget_bias,
                 input_keep_prob=cfg.input_keep_prob,
                 output_keep_prob=cfg.output_keep_prob,
                 bias_init_vector=None)
    _, video_tf, caption_tf, _, _ = model.build_model("inference")
    session = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))
    saver = tf.train.Saver()
    saver.restore(session, saved_model)

    if cfg.id == "Flickr30k":
        _, _, test_data = get_flickr30k_data(cfg)
    elif cfg.id == "MSR-VTT":
        _, _, test_data = get_msr_vtt_data(cfg)
    splits = [(test_data['video_path'].unique(), test_data)]

    results = []
    for split, gt_dataframe in splits:
        gts = convert_data_to_coco_scorer_format(gt_dataframe)
        samples = {}
        for start, end in zip(
                range(0, len(split), cfg.batch_size),
                range(cfg.batch_size, len(split) + cfg.batch_size, cfg.batch_size)):
            current_batch = split[start:end]
            # zero-pad each video's features up to n_frame_step frames
            current_feats = np.zeros((cfg.batch_size, cfg.n_frame_step, cfg.dim_image))
            current_feats_vals = [combine_features(vid) for vid in current_batch]
            for ind, feat in enumerate(current_feats_vals):
                current_feats[ind][:len(current_feats_vals[ind])] = feat
            generated_word_index = session.run(caption_tf,
                                               feed_dict={video_tf: current_feats})
            generated_word_index = np.asarray(generated_word_index).transpose()
            # cut each sequence at its first <eos> (word index 0); take the whole
            # sequence when no <eos> was produced (np.argmax alone returns 0 for
            # those rows, so they need an explicit check)
            periods = np.argmax(generated_word_index == 0, axis=1) + 1
            periods[~np.any(generated_word_index == 0, axis=1)] = cfg.n_lstm_step
            for i in range(len(current_batch)):
                generated_sentence = ' '.join(
                    ixtoword[generated_word_index[i, :periods[i] - 1]])
                video_id = current_batch[i].split("/")[-1].split("_")[0]  # + ".jpg"
                samples[video_id] = [{u'image_id': video_id,
                                      u'caption': generated_sentence}]
        with suppress_stdout_stderr():
            valid_score = scorer.score(gts, samples, samples.keys())
        results.append(valid_score)
        print(valid_score)
        print(len(samples))

    if not os.path.exists(cfg.results_path):
        os.makedirs(cfg.results_path)
    with open(cfg.results_path + "scores.txt", 'a') as scores_table:
        scores_table.write(json.dumps(results[0]) + "\n")
    with open(cfg.results_path + saved_model.split("/")[-1] + ".json", 'w') as prediction_results:
        json.dump({"predictions": samples, "scores": valid_score}, prediction_results)
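
# test() references a module-level gpu_options object; in TF 1.x repos this is
# usually created once near the imports. The memory fraction and checkpoint path
# below are illustrative assumptions, not values from the original code.
# gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
# test(saved_model=os.path.join(cfg.model_path, 'model-900'))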