def evaluate(model, data_loader, device, eval_file, args):
    nll_meter = stats.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, "r") as fh:
        gold_dict = json_load(fh)
    with torch.no_grad():
        for x, y, c_padding_mask, c_starts, ids in data_loader:
            batch_size = x.size(0)

            _, loss_val, scores = forward(x, y, c_padding_mask, args, device, model)
            nll_meter.update(loss_val, batch_size)

            # Get F1 and EM scores
            p1, p2 = model.module.get_prob(scores).split(1, dim=-1)
            p1, p2 = p1.squeeze(-1), p2.squeeze(-1)
            starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2)

            preds, _ = util.convert_tokens(
                gold_dict,
                ids.tolist(),
                starts.tolist(),
                ends.tolist(),
                args.use_squad_v2,
                c_starts.tolist(),
            )
            pred_dict.update(preds)

    model.train()

    results = {"NLL": nll_meter.avg}
    results.update(eval.eval_dicts(gold_dict, pred_dict, args.use_squad_v2))

    return results, pred_dict
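# Illustrative sketch (not part of the original project): the span-extraction
# step used by several of the evaluate()/test() variants in this file, pulled
# out as a helper. It assumes, based only on how the code consumes the output,
# that model.module.get_prob(scores) returns a (batch, seq_len, 2) tensor whose
# last dimension holds start/end probabilities; the helper name is hypothetical.
def _extract_spans(model, scores, max_ans_len, use_squad_v2):
    p1, p2 = model.module.get_prob(scores).split(1, dim=-1)
    p1, p2 = p1.squeeze(-1), p2.squeeze(-1)
    # util.discretize is expected to map per-token probabilities to
    # (start, end) index tensors, respecting the max answer length and
    # the SQuAD 2.0 no-answer convention.
    return util.discretize(p1, p2, max_ans_len, use_squad_v2)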
def evaluate(
    model,
    data_loader,
    device,
    eval_file,
    max_len,
    use_squad_v2,
    args,
    padding_idx,
):
    nll_meter = stats.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, "r") as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            batch_size = cw_idxs.size(0)

            _, loss_val, scores = forward(
                cw_idxs, qw_idxs, y1, y2, padding_idx, args, device, model
            )
            nll_meter.update(loss_val, batch_size)

            # Get F1 and EM scores
            p1, p2 = model.module.get_prob(scores).split(1, dim=-1)
            p1, p2 = p1.squeeze(-1), p2.squeeze(-1)
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(
                gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), use_squad_v2
            )
            pred_dict.update(preds)

    model.train()

    results = eval.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [
        ("NLL", nll_meter.avg),
        ("F1", results["F1"]),
        ("EM", results["EM"]),
    ]
    if use_squad_v2:
        results_list.append(("AvNA", results["AvNA"]))
    results = OrderedDict(results_list)

    return results, pred_dict
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2):
    nll_meter = stats.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, "r") as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(
                gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), use_squad_v2
            )
            pred_dict.update(preds)

    model.train()

    results = {"NLL": nll_meter.avg}
    results.update(eval.eval_dicts(gold_dict, pred_dict, use_squad_v2))

    return results, pred_dict
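# Illustrative usage (a sketch, not the original training loop): how the
# BiDAF-style evaluate() above is typically called on the dev set during
# training. The names `dev_loader`, `args.dev_eval_file`, and the TensorBoard
# tags below are assumptions for this example.
def _example_dev_eval(model, dev_loader, device, args, tbx, step):
    results, pred_dict = evaluate(
        model,
        dev_loader,
        device,
        args.dev_eval_file,
        args.max_ans_len,
        args.use_squad_v2,
    )
    # Log the returned metrics (NLL plus F1/EM, and AvNA for SQuAD 2.0).
    for name, value in results.items():
        tbx.add_scalar(f"dev/{name}", value, step)
    return results, pred_dict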
def test(args):
    # Set up logging
    log = util.get_logger(args.save_dir, args.name)
    log.info(f"Args: {dumps(vars(args), indent=4, sort_keys=True)}")
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info("Loading embeddings...")
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info("Building model...")
    model = BiDAF(
        word_vectors=word_vectors,
        hidden_size=args.hidden_size,
        use_glove=args.use_glove,
    )
    model = nn.DataParallel(model, gpu_ids)
    log.info(f"Loading checkpoint from {args.load_path}...")
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info("Building dataset...")
    record_file = vars(args)[f"{args.split}_record_file"]
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        collate_fn=collate_fn,
    )

    # Evaluate
    log.info(f"Evaluating on {args.split} split...")
    nll_meter = stats.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    eval_file = vars(args)[f"{args.split}_eval_file"]
    with open(eval_file, "r") as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != "test":
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(
                gold_dict,
                ids.tolist(),
                starts.tolist(),
                ends.tolist(),
                args.use_squad_v2,
            )
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != "test":
        results = {"NLL": nll_meter.avg}
        results.update(eval.eval_dicts(gold_dict, pred_dict, args.use_squad_v2))

        # Log to console
        results_str = ", ".join(f"{k}: {v:05.2f}" for k, v in results.items())
        log.info(f"{args.split.title()} {results_str}")

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(
            tbx,
            pred_dict=pred_dict,
            eval_path=eval_file,
            step=0,
            split=args.split,
            num_visuals=args.num_visuals,
        )

    # Write submission file
    if args.split == "dev":
        sub_path = join(args.save_dir, "val" + "_" + args.sub_file)
    else:
        sub_path = join(args.save_dir, args.split + "_" + args.sub_file)
    log.info(f"Writing submission file to {sub_path}...")
    eval.write_submission(sub_path, sub_dict)
def test(args):
    # Set up logging
    log = util.get_logger(args.save_dir, args.name)
    log.info(f"Args: {dumps(vars(args), indent=4, sort_keys=True)}")
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info("Loading embeddings...")
    word_vectors = util.torch_from_json(args.word_emb_file)

    # TODO: Hardcode padding_idx
    padding_idx = 0

    # Get model
    log.info("Building model...")
    model = WordTransformerQA(
        dim=args.dim,
        n_heads=args.n_heads,
        ff_dim=args.ff_dim,
        activation=args.activation,
        dropout=args.dropout,
        attn_dropout=args.attn_dropout,
        act_dropout=args.act_dropout,
        n_layers=args.n_layers,
        max_positions=args.max_positions,
        word_vectors=word_vectors,
    )
    model = nn.DataParallel(model, gpu_ids)
    log.info(f"Loading checkpoint from {args.load_path}...")
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info("Building dataset...")
    record_file = vars(args)[f"{args.split}_record_file"]
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        collate_fn=collate_fn,
    )

    # Evaluate
    log.info(f"Evaluating on {args.split} split...")
    nll_meter = stats.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    eval_file = vars(args)[f"{args.split}_eval_file"]
    with open(eval_file, "r") as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            batch_size = cw_idxs.size(0)

            _, loss_val, scores = forward(
                cw_idxs, qw_idxs, y1, y2, padding_idx, args, device, model
            )
            nll_meter.update(loss_val, batch_size)

            # Get F1 and EM scores
            p1, p2 = model.module.get_prob(scores).split(1, dim=-1)
            p1, p2 = p1.squeeze(-1), p2.squeeze(-1)
            starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != "test":
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(
                gold_dict,
                ids.tolist(),
                starts.tolist(),
                ends.tolist(),
                args.use_squad_v2,
            )
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != "test":
        results = eval.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [
            ("NLL", nll_meter.avg),
            ("F1", results["F1"]),
            ("EM", results["EM"]),
        ]
        if args.use_squad_v2:
            results_list.append(("AvNA", results["AvNA"]))
        results = OrderedDict(results_list)

        # Log to console
        results_str = ", ".join(f"{k}: {v:05.2f}" for k, v in results.items())
        log.info(f"{args.split.title()} {results_str}")

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(
            tbx,
            pred_dict=pred_dict,
            eval_path=eval_file,
            step=0,
            split=args.split,
            num_visuals=args.num_visuals,
        )

    # Write submission file
    if args.split == "dev":
        sub_path = join(args.save_dir, "val" + "_" + args.sub_file)
    else:
        sub_path = join(args.save_dir, args.split + "_" + args.sub_file)
    log.info(f"Writing submission file to {sub_path}...")
    eval.write_submission(sub_path, sub_dict)
def test(args):
    trainer = base_trainer.Trainer(is_train=False)
    args, device = get_args(args)
    args, log, tbx = trainer.setup(args)

    # Get BPE
    log.info("Loading BPE...")
    bpe = get_bpe(args)
    log.info("Loaded {} BPE tokens".format(len(bpe)))

    # Get data loader
    log.info("Building dataset...")
    record_file = vars(args)[f"{args.split}_record_file"]
    dataset, data_loader = get_dataset(args, record_file, shuffle=False, randomize=False)

    # Get model
    log.info("Building model...")
    model = get_model(args, bpe)
    model = trainer.setup_model(model, device)
    model.eval()
    trainer.setup_close()

    # Evaluate
    log.info(f"Evaluating on {args.split} split...")
    nll_meter = stats.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    eval_file = vars(args)[f"{args.split}_eval_file"]
    with open(eval_file, "r") as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), tqdm(total=len(dataset)) as progress_bar:
        for x, y, c_padding_mask, c_starts, ids in data_loader:
            batch_size = x.size(0)

            _, loss_val, scores = forward(x, y, c_padding_mask, args, device, model)
            nll_meter.update(loss_val, batch_size)

            # Get F1 and EM scores
            p1, p2 = model.module.get_prob(scores).split(1, dim=-1)
            p1, p2 = p1.squeeze(-1), p2.squeeze(-1)
            starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != "test":
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(
                gold_dict,
                ids.tolist(),
                starts.tolist(),
                ends.tolist(),
                args.use_squad_v2,
                c_starts.tolist(),
            )
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != "test":
        results = {"NLL": nll_meter.avg}
        results.update(eval.eval_dicts(gold_dict, pred_dict, args.use_squad_v2))

        # Log to console
        results_str = ", ".join(f"{k}: {v:05.2f}" for k, v in results.items())
        log.info(f"{args.split.title()} {results_str}")

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(
            tbx,
            pred_dict=pred_dict,
            eval_path=eval_file,
            step=0,
            split=args.split,
            num_visuals=args.num_visuals,
        )

    # Write submission file
    if args.split == "dev":
        sub_path = join(args.save_dir, "val" + "_" + args.sub_file)
    else:
        sub_path = join(args.save_dir, args.split + "_" + args.sub_file)
    log.info(f"Writing submission file to {sub_path}...")
    eval.write_submission(sub_path, sub_dict)
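# Illustrative entry point (a sketch; the original scripts' __main__ blocks are
# not shown here). `_parse_test_args` is a hypothetical stand-in for whichever
# argument parser the project actually uses, and only a couple of flags are
# shown; the real args object carries many more fields (save_dir, batch_size,
# record/eval file paths, etc.).
def _parse_test_args():
    import argparse

    parser = argparse.ArgumentParser("Evaluate a trained SQuAD model on a split")
    parser.add_argument("--split", type=str, default="dev",
                        choices=("train", "dev", "test"))
    parser.add_argument("--load_path", type=str, required=True,
                        help="Path to the checkpoint to load.")
    return parser.parse_args()


if __name__ == "__main__":
    test(_parse_test_args())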