def evaluate(predictions, labels):
    # The last entry of `labels` is the id of the ground-truth item for this user;
    # `metrics` is the project's helper module providing hit / mrr / ndcg.
    label = int(labels[-1])
    hr = metrics.hit(label, predictions)
    mrr = metrics.mrr(label, predictions)
    ndcg = metrics.ndcg(label, predictions)
    return hr, mrr, ndcg
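# The `metrics` helpers used above are not shown in this file. Below is a minimal
# sketch, assuming the standard leave-one-out definitions with a single ground-truth
# item per ranked list; the function bodies are illustrative, not the project's
# actual implementation.
import math

def hit(gt_item, pred_items):
    # Hit Ratio: 1 if the ground-truth item appears in the ranked candidate list.
    return 1.0 if gt_item in pred_items else 0.0

def mrr(gt_item, pred_items):
    # Reciprocal rank of the ground-truth item; 0 if it is not in the list.
    if gt_item in pred_items:
        return 1.0 / (pred_items.index(gt_item) + 1)
    return 0.0

def ndcg(gt_item, pred_items):
    # With a single relevant item, NDCG reduces to 1 / log2(rank + 1).
    if gt_item in pred_items:
        return 1.0 / math.log2(pred_items.index(gt_item) + 2)
    return 0.0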
def evaluate(model, tokenizer, eval_file, checkpoint, output_dir=None):
    # `doc_file` and `args` are expected to be defined at module level (e.g. parsed
    # command-line arguments); the evaluation set reuses the TrainData dataset class.
    eval_data = TrainData(data_file=eval_file,
                          doc_file=doc_file,
                          max_length=args.max_length,
                          tokenizer=tokenizer,
                          attacked_file=None)
    eval_dataLoader = DataLoader(dataset=eval_data, batch_size=args.batch_size, shuffle=False)

    logger.debug("***** Running evaluation {} *****".format(checkpoint))
    logger.debug("  Num examples = %d", len(eval_data))
    logger.debug("  Batch size = %d", args.batch_size)

    loss = []
    mrrs = []
    maps = []
    all_labels = None
    all_logits = None

    model.eval()
    for batch in tqdm(eval_dataLoader, desc="Evaluating"):
        batch = tuple(t.to('cuda') for t in batch[:4])
        input_ids, token_type_ids, attention_mask, labels = batch
        with torch.no_grad():
            outputs = model(input_ids=input_ids.long(),
                            token_type_ids=token_type_ids.long(),
                            attention_mask=attention_mask,
                            labels=labels)
            eval_loss, logits = outputs[:2]
            loss.append(eval_loss.item())
        # Collect labels and logits across batches so metrics can be computed per query.
        if all_labels is None:
            all_labels = labels.detach().cpu().numpy()
            all_logits = logits.detach().cpu().numpy()
        else:
            all_labels = np.concatenate((all_labels, labels.detach().cpu().numpy()), axis=0)
            all_logits = np.concatenate((all_logits, logits.detach().cpu().numpy()), axis=0)

    # Evaluation metrics: score each query's candidate list separately.
    # Note: `map` here refers to the mean-average-precision helper, not the builtin.
    start = 0
    for key in eval_data.docs_keys:
        end = start + len(eval_data.docs[key])
        maps.append(map(all_labels[start:end], all_logits[start:end]))
        mrrs.append(mrr(all_labels[start:end], all_logits[start:end]))
        start = end

    return np.array(loss).mean(), np.array(maps).mean(), np.array(mrrs).mean()
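# The per-query `map` and `mrr` helpers called in the loop above are not defined in
# this file. A hypothetical sketch follows, assuming each query's slice holds binary
# relevance labels and model logits (reduced to one relevance score per candidate,
# e.g. the positive-class logit); this is not the project's actual implementation.
import numpy as np

def _scores(logits):
    # Collapse 2-D classification logits to a single relevance score per candidate.
    scores = np.asarray(logits)
    return scores[:, -1] if scores.ndim > 1 else scores

def mrr(labels, logits):
    # Reciprocal rank of the first relevant candidate when ranked by score.
    order = np.argsort(-_scores(logits))
    ranked = np.asarray(labels)[order]
    relevant = np.nonzero(ranked)[0]
    return 1.0 / (relevant[0] + 1) if relevant.size else 0.0

def map(labels, logits):
    # Average precision for one query: mean of precision@k at each relevant position.
    order = np.argsort(-_scores(logits))
    ranked = np.asarray(labels)[order]
    hits, precisions = 0, []
    for k, rel in enumerate(ranked, start=1):
        if rel:
            hits += 1
            precisions.append(hits / k)
    return float(np.mean(precisions)) if precisions else 0.0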