def evaluate(args, is_test, final=False):
    if is_test:
        dataset = read_dataset(args.test_path)
    else:
        dataset = read_dataset(args.dev_path)

    instances_num = len(dataset)
    batch_size = args.batch_size

    if is_test:
        logger.info(f"Batch size: {batch_size}")
        logger.info(f"The number of test instances: {instances_num}")

    true_labels_all = []
    predicted_labels_all = []
    confusion = torch.zeros(len(labels_map), len(labels_map), dtype=torch.long)
    model.eval()

    test_batcher = Batcher(args, dataset, token_pad=tokenizer.pad_token_id,
                           label_pad=labels_map[PAD_TOKEN])

    for i, (input_ids_batch, label_ids_batch, mask_ids_batch, pos_ids_batch,
            vm_ids_batch, segment_ids_batch) in enumerate(test_batcher):
        input_ids_batch = input_ids_batch.to(device)
        label_ids_batch = label_ids_batch.to(device)
        mask_ids_batch = mask_ids_batch.to(device)
        pos_ids_batch = pos_ids_batch.to(device)
        vm_ids_batch = vm_ids_batch.long().to(device)
        segment_ids_batch = segment_ids_batch.long().to(device)

        pred, logits, scores = model(input_ids_batch, segment_ids_batch,
                                     mask_ids_batch, label_ids_batch,
                                     pos_ids_batch, vm_ids_batch,
                                     use_kg=args.use_kg)

        for pred_sample, gold_sample, mask in zip(pred, label_ids_batch, mask_ids_batch):
            pred_labels = [idx_to_label.get(key) for key in pred_sample.tolist()]
            gold_labels = [idx_to_label.get(key) for key in gold_sample.tolist()]
            # Number of real (non-padding) tokens in this sample.
            num_labels = int(sum(mask))

            # Exclude the [CLS] and [SEP] tokens.
            pred_labels = pred_labels[1:num_labels - 1]
            true_labels = gold_labels[1:num_labels - 1]

            pred_labels = [p.replace('_NOKG', '') for p in pred_labels]
            true_labels = [t.replace('_NOKG', '') for t in true_labels]

            true_labels, pred_labels = filter_kg_labels(true_labels, pred_labels)

            pred_labels = [p.replace('_', '-') for p in pred_labels]
            true_labels = [t.replace('_', '-') for t in true_labels]

            biluo_tags_predicted = get_bio(pred_labels)
            biluo_tags_true = get_bio(true_labels)

            if len(biluo_tags_predicted) != len(biluo_tags_true):
                logger.error('The predicted and true label sequences have different lengths.')
                exit()

            predicted_labels_all.append(biluo_tags_predicted)
            true_labels_all.append(biluo_tags_true)

    if final:
        with open(f'{args.output_file_prefix}_predictions.txt', 'a') as p, \
                open(f'{args.output_file_prefix}_gold.txt', 'a') as g:
            p.write('\n'.join([' '.join(l) for l in predicted_labels_all]))
            g.write('\n'.join([' '.join(l) for l in true_labels_all]))

    return dict(
        f1=seqeval.metrics.f1_score(true_labels_all, predicted_labels_all),
        precision=seqeval.metrics.precision_score(true_labels_all, predicted_labels_all),
        recall=seqeval.metrics.recall_score(true_labels_all, predicted_labels_all),
        f1_span=f1_score_span(true_labels_all, predicted_labels_all),
        precision_span=precision_score_span(true_labels_all, predicted_labels_all),
        recall_span=recall_score_span(true_labels_all, predicted_labels_all),
    )
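
# Illustrative sketch (not part of the original script): one way evaluate()
# might be driven after training. `args`, `logger`, and the module-level
# `model`/`tokenizer`/`device` used inside evaluate() are assumed to be set up
# elsewhere; `run_final_evaluation` is a hypothetical name.
def run_final_evaluation(args):
    # Dev-set pass: reports entity-level metrics only.
    dev_metrics = evaluate(args, is_test=False)
    logger.info(f"Dev F1: {dev_metrics['f1']:.4f}")

    # Test-set pass with final=True also appends predictions and gold tags to
    # <output_file_prefix>_predictions.txt / _gold.txt.
    test_metrics = evaluate(args, is_test=True, final=True)
    logger.info(f"Test F1: {test_metrics['f1']:.4f} "
                f"(span-level F1: {test_metrics['f1_span']:.4f})")
    return test_metrics
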
    bio_tags_predicted = get_bio(pred_labels)

    if len(bio_tags_true) != len(bio_tags_predicted):
        print(bio_tags_true)
        print(bio_tags_predicted)
        print(line_id)

    true_labels_final.append(bio_tags_true)
    predicted_labels_final.append(bio_tags_predicted)

results = dict(
    f1=metrics.f1_score(true_labels_final, predicted_labels_final),
    precision=metrics.precision_score(true_labels_final, predicted_labels_final),
    recall=metrics.recall_score(true_labels_final, predicted_labels_final),
    f1_span=f1_score_span(true_labels_final, predicted_labels_final),
    precision_span=precision_score_span(true_labels_final, predicted_labels_final),
    recall_span=recall_score_span(true_labels_final, predicted_labels_final),
)
print(results)


# def parse_args():
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--output_dir', type=str, default='outputs')
#     parser.add_argument('--t_labels', type=str, default="ubuntu_label.txt")
#     parser.add_argument('--p_labels', type=str, default="ubuntu_predict.txt")
#     parser.add_argument('--text', type=str, default="ubuntu_text.txt")
#     return vars(parser.parse_args())
#
#
# if __name__ == '__main__':
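
# Illustrative sketch (not from the original script): the seqeval calls above
# expect one BIO tag sequence per sentence, so each get_bio() result should
# cover exactly one sentence. `_seqeval_format_demo` is a hypothetical name and
# reuses the same `metrics` import as the code above; the *_span helpers are
# assumed to accept the same nested-list format.
def _seqeval_format_demo():
    true_seqs = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O']]
    pred_seqs = [['B-PER', 'I-PER', 'O'], ['O', 'O']]
    # Entity-level F1 computed over both sentences.
    return metrics.f1_score(true_seqs, pred_seqs)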