def test(model, tokenizer, test_file, checkpoint, output_dir=None):
    """Evaluate `model` on the examples in `test_file`.

    Args:
        model: a transformers-style model returning (loss, logits, ...) when
            called with `labels`.
        tokenizer: tokenizer passed through to the dataset.
        test_file: path to the evaluation data file.
        checkpoint: identifier used only for logging.
        output_dir: unused; kept for backward compatibility with callers.

    Returns:
        (mean_loss, accuracy, f1) over the whole evaluation set.
    """
    test_data = TrainData(data_file=test_file,
                          max_length=args.max_length,
                          tokenizer=tokenizer,
                          model_type=args.model_type)
    test_dataLoader = DataLoader(dataset=test_data,
                                 batch_size=args.batch_size,
                                 shuffle=False)
    logger.debug("***** Running test {} *****".format(checkpoint))
    # BUG FIX: previously logged len(test_dataLoader) (the number of *batches*)
    # under the "Num examples" label; report the dataset size instead.
    logger.debug(" Num examples = %d", len(test_data))
    logger.debug(" Batch size = %d", args.batch_size)

    loss = []
    all_labels = None
    all_logits = None
    model.eval()
    for batch in tqdm(test_dataLoader, desc="Evaluating", ncols=50):
        with torch.no_grad():
            if 'roberta' in args.model_type:
                # batch[:-2] drops the last two entries — presumably raw query
                # strings that are not tensors; TODO confirm against the dataset.
                batch = [t.to(args.device) for t in batch[:-2]]
                input_ids, attention_mask, labels = batch
                outputs = model(input_ids=input_ids.long(),
                                attention_mask=attention_mask.long(),
                                labels=labels)
            else:
                # Non-RoBERTa models additionally take token_type_ids.
                batch = [t.to(args.device) for t in batch[:-2]]
                input_ids, token_type_ids, attention_mask, labels = batch
                outputs = model(input_ids=input_ids.long(),
                                token_type_ids=token_type_ids.long(),
                                attention_mask=attention_mask.long(),
                                labels=labels)
            # Model is assumed to return (loss, logits, ...) when labels given.
            eval_loss, logits = outputs[:2]
            loss.append(eval_loss.item())
            if all_labels is None:
                all_labels = labels.detach().cpu().numpy()
                all_logits = logits.detach().cpu().numpy()
            else:
                all_labels = np.concatenate(
                    (all_labels, labels.detach().cpu().numpy()), axis=0)
                all_logits = np.concatenate(
                    (all_logits, logits.detach().cpu().numpy()), axis=0)

    acc = accuracy(all_logits, all_labels)
    f1 = f1_score(all_logits, all_labels)
    return np.array(loss).mean(), acc, f1
def _append_batch(all_labels, all_logits, labels, logits):
    """Fold one batch's labels/logits into the running numpy accumulators."""
    labels_np = labels.detach().cpu().numpy()
    logits_np = logits.detach().cpu().numpy()
    if all_labels is None:
        return labels_np, logits_np
    return (np.concatenate((all_labels, labels_np), axis=0),
            np.concatenate((all_logits, logits_np), axis=0))


def test(model, tokenizer, test_file, model_type):
    """Evaluate `model` on `test_file` for one of the supported model types.

    NOTE(review): this shadows the earlier `test(...)` definition if both live
    in the same module — consider renaming one of them.

    Args:
        model: the model under evaluation; must return (loss, logits, ...)
            when called with labels.
        tokenizer: tokenizer passed through to the dataset.
        test_file: path to the evaluation data file.
        model_type: one of 'baseline', 'vae2task', 'cvae'.

    Returns:
        (mean_loss, accuracy, f1) over the whole evaluation set.

    Raises:
        ValueError: if `model_type` is not one of the supported values.
    """
    if model_type == 'baseline':
        test_data = TrainData(data_file=test_file,
                              max_length=args2.max_length,
                              tokenizer=tokenizer)
    elif model_type == 'vae2task':
        test_data = Multi_task_dataset(data_file=test_file,
                                       max_length=args2.max_length,
                                       tokenizer=tokenizer)
    elif model_type == 'cvae':
        test_data = TrainData(data_file=test_file,
                              max_length=args2.max_length,
                              tokenizer=tokenizer)
    else:
        # BUG FIX: previously an unknown model_type left the dataloader as
        # None and crashed later inside tqdm with an opaque TypeError.
        raise ValueError("unsupported model_type: {!r}".format(model_type))
    test_dataLoader = DataLoader(dataset=test_data,
                                 batch_size=args2.batch_size,
                                 shuffle=False)

    loss = []
    all_labels = None
    all_logits = None
    model.eval()
    # BUG FIX: was `mdoel = model.to(...)` — a typo assigning to a dead name.
    # It only worked because Module.to mutates in place.
    model = model.to(args2.device)

    if model_type == "vae2task":
        for batch in tqdm(test_dataLoader, desc="Evaluating", ncols=50):
            with torch.no_grad():
                batch = [t.to(args2.device) for t in batch]
                (input_ids, token_type_ids, attention_mask,
                 labels_main, labels_vice1, labels_vice2) = batch
                outputs = model(input_ids=input_ids.long(),
                                token_type_ids=token_type_ids.long(),
                                attention_mask=attention_mask.long(),
                                labels_main=labels_main,
                                labels_vice1=labels_vice1,
                                labels_vice2=labels_vice2)
                eval_loss, logits = outputs[:2]
                loss.append(eval_loss.item())
                # Metrics are computed against the main task's labels only.
                all_labels, all_logits = _append_batch(
                    all_labels, all_logits, labels_main, logits)
    elif model_type == 'baseline':
        for batch in tqdm(test_dataLoader, desc="Evaluating", ncols=50):
            with torch.no_grad():
                # batch[:-2] drops the last two entries — presumably raw query
                # strings, not tensors; TODO confirm against the dataset.
                batch = [t.to(args2.device) for t in batch[:-2]]
                input_ids, token_type_ids, attention_mask, labels = batch
                outputs = model(input_ids=input_ids.long(),
                                token_type_ids=token_type_ids.long(),
                                attention_mask=attention_mask.long(),
                                labels=labels)
                eval_loss, logits = outputs[:2]
                loss.append(eval_loss.item())
                all_labels, all_logits = _append_batch(
                    all_labels, all_logits, labels, logits)
    elif model_type == 'cvae':
        for batch in tqdm(test_dataLoader, desc="Evaluating", ncols=50):
            with torch.no_grad():
                # The CVAE model consumes the raw queries directly; they are
                # deliberately not moved to the device.
                query1, query2 = batch[-2:]
                batch = [t.to(args2.device) for t in batch[:-2]]
                input_ids, token_type_ids, attention_mask, labels = batch
                outputs = model(input_ids=input_ids.long(),
                                token_type_ids=token_type_ids.long(),
                                attention_mask=attention_mask.long(),
                                labels=labels,
                                query1=query1,
                                query2=query2)
                eval_loss, logits = outputs[:2]
                loss.append(eval_loss.item())
                all_labels, all_logits = _append_batch(
                    all_labels, all_logits, labels, logits)

    acc = accuracy(all_logits, all_labels)
    f1 = f1_score(all_logits, all_labels)
    return np.array(loss).mean(), acc, f1