def evaluate(args, is_test, final=False):
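        """Evaluate on the dev or test split and return entity- and span-level metrics."""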
        if is_test:
            dataset = read_dataset(args.test_path)
        else:
            dataset = read_dataset(args.dev_path)

        instances_num = len(dataset)
        batch_size = args.batch_size

        if is_test:
            logger.info(f"Batch size:{batch_size}")
            print(f"The number of test instances:{instances_num}")

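        # Accumulators for per-sentence gold/predicted tag sequences and a
        # (num labels x num labels) confusion matrix.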
        true_labels_all = []
        predicted_labels_all = []
        confusion = torch.zeros(len(labels_map),
                                len(labels_map),
                                dtype=torch.long)
        model.eval()

        test_batcher = Batcher(args,
                               dataset,
                               token_pad=tokenizer.pad_token_id,
                               label_pad=labels_map[PAD_TOKEN])

        for i, (input_ids_batch, label_ids_batch, mask_ids_batch,
                pos_ids_batch, vm_ids_batch,
                segment_ids_batch) in enumerate(test_batcher):

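            # Move the batch tensors onto the evaluation device.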
            input_ids_batch = input_ids_batch.to(device)
            label_ids_batch = label_ids_batch.to(device)
            mask_ids_batch = mask_ids_batch.to(device)
            pos_ids_batch = pos_ids_batch.to(device)
            vm_ids_batch = vm_ids_batch.long().to(device)
            segment_ids_batch = segment_ids_batch.long().to(device)

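            # Forward pass; args.use_kg controls whether the knowledge-graph inputs are used.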
            pred, logits, scores = model(input_ids_batch,
                                         segment_ids_batch,
                                         mask_ids_batch,
                                         label_ids_batch,
                                         pos_ids_batch,
                                         vm_ids_batch,
                                         use_kg=args.use_kg)

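            # Map each sequence in the batch back to tag strings and normalize it before scoring.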
            for pred_sample, gold_sample, mask in zip(pred, label_ids_batch,
                                                      mask_ids_batch):

                pred_labels = [
                    idx_to_label.get(key) for key in pred_sample.tolist()
                ]
                gold_labels = [
                    idx_to_label.get(key) for key in gold_sample.tolist()
                ]

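                # Number of non-padding positions according to the attention mask.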
                num_labels = int(mask.sum().item())

                # Exclude the [CLS] and [SEP] tokens
                pred_labels = pred_labels[1:num_labels - 1]
                true_labels = gold_labels[1:num_labels - 1]

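                # Strip the '_NOKG' suffix so both label variants map to the same tag.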
                pred_labels = [p.replace('_NOKG', '') for p in pred_labels]
                true_labels = [t.replace('_NOKG', '') for t in true_labels]

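                # filter_kg_labels presumably drops positions introduced by knowledge-graph injection.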
                true_labels, pred_labels = filter_kg_labels(
                    true_labels, pred_labels)

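                # Replace underscores with dashes so tags follow the usual B-XXX / I-XXX form.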
                pred_labels = [p.replace('_', '-') for p in pred_labels]
                true_labels = [t.replace('_', '-') for t in true_labels]

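                # Convert both sequences to the BIO scheme expected by seqeval.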
                biluo_tags_predicted = get_bio(pred_labels)
                biluo_tags_true = get_bio(true_labels)

                if len(biluo_tags_predicted) != len(biluo_tags_true):
                    logger.error(
                        'The length of the predicted labels does not match that of the true labels.'
                    )
                    exit()

                predicted_labels_all.append(biluo_tags_predicted)
                true_labels_all.append(biluo_tags_true)

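        # On the final run, append the predicted and gold tag sequences to text files.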
        if final:
            with open(f'{args.output_file_prefix}_predictions.txt', 'a') as p, \
                    open(f'{args.output_file_prefix}_gold.txt', 'a') as g:
                p.write('\n'.join([' '.join(l) for l in predicted_labels_all]))
                g.write('\n'.join([' '.join(l) for l in true_labels_all]))

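        # Entity-level metrics from seqeval plus the span-based variants.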
        return dict(
            f1=seqeval.metrics.f1_score(true_labels_all, predicted_labels_all),
            precision=seqeval.metrics.precision_score(true_labels_all,
                                                      predicted_labels_all),
            recall=seqeval.metrics.recall_score(true_labels_all,
                                                predicted_labels_all),
            f1_span=f1_score_span(true_labels_all, predicted_labels_all),
            precision_span=precision_score_span(true_labels_all,
                                                predicted_labels_all),
            recall_span=recall_score_span(true_labels_all,
                                          predicted_labels_all),
        )
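
# A minimal usage sketch, assuming the surrounding script defines `args` and the
# globals used above (model, tokenizer, labels_map, idx_to_label, device, logger):
#
#     dev_metrics = evaluate(args, is_test=False)
#     logger.info(f"Dev F1: {dev_metrics['f1']:.4f}")
#     test_metrics = evaluate(args, is_test=True, final=True)
#     logger.info(f"Test F1: {test_metrics['f1']:.4f}")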
Example no. 2
        bio_tags_true = get_bio(true_labels)
        bio_tags_predicted = get_bio(pred_labels)

        if len(bio_tags_true) != len(bio_tags_predicted):
            print(bio_tags_true)
            print(bio_tags_predicted)
            print(line_id)

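        # Collect the BIO-tagged sequences for corpus-level scoring.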
        true_labels_final.append(bio_tags_true)
        predicted_labels_final.append(bio_tags_predicted)
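    # Entity-level (seqeval) and span-level metrics over the whole file.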
    results = dict(
        f1=metrics.f1_score(true_labels_final, predicted_labels_final),
        precision=metrics.precision_score(true_labels_final,
                                          predicted_labels_final),
        recall=metrics.recall_score(true_labels_final, predicted_labels_final),
        f1_span=f1_score_span(true_labels_final, predicted_labels_final),
        precision_span=precision_score_span(true_labels_final,
                                            predicted_labels_final),
        recall_span=recall_score_span(true_labels_final,
                                      predicted_labels_final),
    )
    print(results)

# def parse_args():
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--output_dir', type=str, default='outputs')
#     parser.add_argument('--t_labels', type=str, default="ubuntu_label.txt")
#     parser.add_argument('--p_labels', type=str, default="ubuntu_predict.txt")
#     parser.add_argument('--text', type=str, default="ubuntu_text.txt")
#     return vars(parser.parse_args())
#