Example 1
def precision_recall_f1_support_sequence_labelling(y_true, y_pred):
    """Compute precision, recall, f1 and support for sequence labelling tasks.

    For given gold (`y_true`) and predicted (`y_pred`) sequence labels, returns the precision,
    recall, f1 and support per label, and the macro and micro average of these scores across
    labels. Expects `y_true` and `y_pred` to be a sequence of IOB1/2, IOE1/2, or IOBES formatted
    labels.

    Args:
        y_true (list): List of IOB1/2, IOE1/2, or IOBES formatted sequence labels.
        y_pred (list): List of IOB1/2, IOE1/2, or IOBES formatted sequence labels.

    Returns:
        A dictionary of scores keyed by the labels in `y_true` where each score is a 4-tuple
        containing precision, recall, f1 and support. Additionally includes the keys
        'Macro avg' and 'Micro avg' containing the macro and micro averages across scores.
    """
    scores = {}
    # Unique labels, not including the negative (OUTSIDE) tag
    labels = list({tag.split('-')[-1] for tag in set(y_true) if tag != OUTSIDE})
    labels.sort()  # ensures labels are displayed in the same order across runs / partitions

    for label in labels:
        y_true_lab = [
            tag if tag.endswith(label) else OUTSIDE for tag in y_true
        ]
        y_pred_lab = [
            tag if tag.endswith(label) else OUTSIDE for tag in y_pred
        ]

        # TODO (John): Open a pull request to seqeval with a new function that returns all these
        # scores in one call. There is a lot of repeated computation here.
        precision = precision_score(y_true_lab, y_pred_lab)
        recall = recall_score(y_true_lab, y_pred_lab)
        f1 = f1_score(y_true_lab, y_pred_lab)
        support = len(set(get_entities(y_true_lab)))

        scores[label] = precision, recall, f1, support

    # Get macro and micro performance metrics averages
    macro_precision = mean([v[0] for v in scores.values()])
    macro_recall = mean([v[1] for v in scores.values()])
    macro_f1 = mean([v[2] for v in scores.values()])
    total_support = sum([v[3] for v in scores.values()])

    micro_precision = precision_score(y_true, y_pred)
    micro_recall = recall_score(y_true, y_pred)
    micro_f1 = f1_score(y_true, y_pred)

    scores['Macro avg'] = macro_precision, macro_recall, macro_f1, total_support
    scores['Micro avg'] = micro_precision, micro_recall, micro_f1, total_support

    return scores
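A minimal usage sketch for the function above, assuming OUTSIDE is the 'O' tag and that the module imports its helpers from seqeval and the standard library (neither is shown in the snippet):

# Assumed wiring and a toy call; OUTSIDE = 'O' is an assumption, not part of the snippet.
from statistics import mean

from seqeval.metrics import f1_score, precision_score, recall_score
from seqeval.metrics.sequence_labeling import get_entities

OUTSIDE = 'O'

y_true = ['B-PER', 'I-PER', 'O', 'B-LOC', 'O']
y_pred = ['B-PER', 'I-PER', 'O', 'O', 'O']

scores = precision_recall_f1_support_sequence_labelling(y_true, y_pred)
print(scores['PER'])        # (precision, recall, f1, support) for the PER label
print(scores['Micro avg'])  # micro-averaged scores plus the total support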
Example 2
def evaluate(gold_data, parsed_data, fnversion=1.7):
    
    tic()
    
    gold_frames, pred_frames = [],[]
    gold_args, pred_args = [],[]
    gold_fulls, pred_fulls = [],[]
    
    for i in range(len(parsed_data)):
        gold = gold_data[i]
        parsed = parsed_data[i]
        
        gold_frame = [i for i in gold[2] if i != '_'][0]
        pred_frame = [i for i in parsed[2] if i != '_'][0]
        
        gold_arg = [i for i in gold[3] if i != 'X']
        pred_arg = [i for i in parsed[3]]
        
        gold_frames.append(gold_frame)
        pred_frames.append(pred_frame)
        
        weighted_gold_frame, weighted_pred_frame, weighted_gold_arg, weighted_pred_arg = weighting(gold_frame, pred_frame, gold_arg, pred_arg, fnversion=fnversion)
        
        gold_args.append(weighted_gold_arg)
        pred_args.append(weighted_pred_arg)
        
        gold_full = []
        gold_full += weighted_gold_frame
        gold_full += weighted_gold_arg

        pred_full = []
        pred_full += weighted_pred_frame
        pred_full += weighted_pred_arg

        gold_fulls.append(gold_full)
        pred_fulls.append(pred_full)

    acc = accuracy_score(gold_frames, pred_frames)
    arg_f1 = f1_score(gold_args, pred_args)
    arg_precision = precision_score(gold_args, pred_args)
    arg_recall = recall_score(gold_args, pred_args)

    full_f1 = f1_score(gold_fulls, pred_fulls)
    full_precision = precision_score(gold_fulls, pred_fulls)
    full_recall = recall_score(gold_fulls, pred_fulls)
    
    result = (acc, arg_precision, arg_recall, arg_f1, full_precision, full_recall, full_f1)
    
    print('evaluation is complete:',tac())
    
    return result
Example 3
def score(y_true, y_pred):
    """Wrapper of seqeval metrics
    """
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return precision, recall, f1
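A toy call of the wrapper above; seqeval takes one list of tags per sentence, so nested lists work directly (imports assumed):

# Toy usage, with the seqeval metrics the wrapper presumably imports.
from seqeval.metrics import f1_score, precision_score, recall_score

y_true = [['O', 'B-ORG', 'I-ORG', 'O'], ['B-PER', 'I-PER']]
y_pred = [['O', 'B-ORG', 'O', 'O'], ['B-PER', 'I-PER']]
print(score(y_true, y_pred))  # (0.5, 0.5, 0.5): only the PER span matches exactly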
Example 4
def get_slot_metrics(preds, labels):
    assert len(preds) == len(labels)
    return {
        "slot_precision": precision_score(labels, preds),
        "slot_recall": recall_score(labels, preds),
        "slot_f1": f1_score(labels, preds)
    }
Example 5
    def _eval_end(self, outputs):
        "Evaluation called for both Val and Test"
        val_loss_mean = torch.stack([x["val_loss"] for x in outputs]).mean()
        entity_preds = np.concatenate([x["entity_preds"] for x in outputs], axis=0)
        entity_preds = np.argmax(entity_preds, axis=2)
        intent_acc = torch.stack([x["intent_acc"] for x in outputs]).mean()
        
        out_label_ids = np.concatenate([x["entity_targets"] for x in outputs], axis=0)

        out_label_list = [[] for _ in range(out_label_ids.shape[0])]
        entity_preds_list = [[] for _ in range(out_label_ids.shape[0])]

        for i in range(out_label_ids.shape[0]):
            for j in range(out_label_ids.shape[1]):
                if out_label_ids[i, j] != FLAGS.pad_token_label_id:
                    out_label_list[i].append(self.id2entity[out_label_ids[i][j]])
                    entity_preds_list[i].append(self.id2entity[entity_preds[i][j]])

        results = {
            "val_loss": val_loss_mean,
            "precision": precision_score(out_label_list, entity_preds_list),
            "recall": recall_score(out_label_list, entity_preds_list),
            "f1": f1_score(out_label_list, entity_preds_list),
            "intent_acc": intent_acc
        }

        ret = {k: v for k, v in results.items()}
        ret["log"] = results
        return ret, entity_preds_list, out_label_list
Example 6
    def compute_metrics(p: EvalPrediction) -> Dict:

        preds_list, out_label_list, matrix = align_predictions(
            p.predictions, p.label_ids)

        return {
            "precision":
            precision_score(out_label_list, preds_list),
            "recall":
            recall_score(out_label_list, preds_list),
            "f1":
            f1_score(out_label_list, preds_list),
            # classification_report, accuracy_score
            "accuracy":
            accuracy_score(out_label_list, preds_list),
            "matrix":
            matrix,
            # "multilabel_confusion_matrix": multilabel_confusion_matrix(multi_out_label_list, multi_preds_list, labels=["B", "I", "O"]),
            # "confusion_matrix": confusion_matrix(out_label_list, preds_list),
            # "multilabel_confusion_matrix": multilabel_confusion_matrix(out_label_list, preds_list),
            # "balanced_accuracy_score": balanced_accuracy_score(out_label_list, preds_list),
            # "roc_auc_score": roc_auc_score(out_label_list, preds_list),
            # "top_k_accuracy_score": top_k_accuracy_score(out_label_list, preds_list),
            "classification_report":
            classification_report(out_label_list, preds_list)
        }
Example 7
def compute_pos_metrics(label_map, p: EvalPrediction) -> Dict:
    def align_predictions(
            predictions: np.ndarray,
            label_ids: np.ndarray) -> Tuple[List[int], List[int]]:
        # Token predictions need to be reduced to word predictions
        preds = np.argmax(predictions, axis=2)

        batch_size, seq_len = preds.shape

        out_label_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]

        for i in range(batch_size):
            for j in range(seq_len):
                if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index:
                    out_label_list[i].append(label_map[label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        return preds_list, out_label_list

    preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
    return {
        "accuracy_score": accuracy_score(out_label_list, preds_list),
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }
Example 8
    def compute_metrics_from_preds_and_labels(cls, task, preds, labels):
        label_mask = np.stack([row["label_mask"] for row in labels])

        # Account for smart-truncate
        assert (label_mask[:, preds.shape[-1]:] == 0).all()
        label_mask = label_mask[:, :preds.shape[-1]].astype(bool)

        labels_for_eval = [label["pos_list"] for label in labels]
        preds_for_eval = []
        assert len(labels) == preds.shape[0]
        for i in range(len(labels)):
            relevant_preds = preds[i][label_mask[i]]
            relevant_preds_pos = [
                task.LABEL_BIMAP.b[pos_id] for pos_id in relevant_preds
            ]
            preds_for_eval.append(relevant_preds_pos)

        minor = {
            "precision":
            seqeval_metrics.precision_score(labels_for_eval, preds_for_eval),
            "recall":
            seqeval_metrics.recall_score(labels_for_eval, preds_for_eval),
            "f1":
            seqeval_metrics.f1_score(labels_for_eval, preds_for_eval),
        }
        return Metrics(
            major=minor["f1"],
            minor=minor,
        )
Example 9
def get_slot_metrics(preds, labels):
    '''Evaluate sequence labels according to the IOB1 scheme.'''
    return {
        'slot_f1': f1_score(labels, preds),
        'slot_recall': recall_score(labels, preds),
        'slot_precision': precision_score(labels, preds),
    }
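A toy call with IOB1-style labels (entities may begin with an I- tag), assuming the seqeval imports this snippet relies on:

from seqeval.metrics import f1_score, precision_score, recall_score

labels = [['I-PER', 'I-PER', 'O', 'I-LOC']]
preds = [['I-PER', 'I-PER', 'O', 'O']]
print(get_slot_metrics(preds, labels))
# -> slot_precision 1.0, slot_recall 0.5, slot_f1 ~0.67 on this toy pair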
Example 10
def evaluation(model, data_loader, index_to_label, vocab_dict, paras, device):
    """
    Contributor:
        Peng Qianqian: conlleval.pl for model evaluation.
        Oyang Sizhuo: conlleval.pl for model evaluation.
    """
    model.eval()

    total_pred_label = []
    total_true_label = []
    with torch.no_grad():
        for step, batch in enumerate(data_loader):
            batch_data, batch_label = batch
            batch_data_list = [data.split('&&&') for data in batch_data]
            batch_label_list = [label.split('&&&') for label in batch_label]

            input_ids, mask = batch_data_processing(batch_data_list,
                                                    paras.max_length,
                                                    vocab_dict.get('[PAD]'),
                                                    vocab_dict.get('[CLS]'),
                                                    vocab_dict.get('[SEP]'))

            input_ids = input_ids.to(device)
            mask = mask.to(device)

            batch_max_length = input_ids.shape[1]

            predict_result = model(input_ids, mask)

            predict_label_list = convert_index_to_label(
                predict_result, index_to_label)
            true_label_list = label_truncation(batch_label_list,
                                               batch_max_length)

            if args.print_example:
                logger.debug('Example:')
                logger.debug(f'predict: {predict_label_list[0]}')
                logger.debug(f'true: {true_label_list[0]}')

            for predict_list, true_list in zip(predict_label_list,
                                               true_label_list):
                if len(predict_list) != len(true_list):
                    logger.debug('different length.')
                    logger.debug(
                        f'predict: {len(predict_list)}, true: {len(true_list)}'
                    )
                    logger.debug(f'{predict_list}\n{true_list}')
                    continue
                total_pred_label.append(predict_list)
                total_true_label.append(true_list)

    logger.debug(f'total true_label: {len(total_true_label)}, '
                 f'total pred_label: {len(total_pred_label)}')

    acc = accuracy_score(total_true_label, total_pred_label)
    precision = precision_score(total_true_label, total_pred_label)
    recall = recall_score(total_true_label, total_pred_label)
    f1 = f1_score(total_true_label, total_pred_label)

    return acc, precision, recall, f1
Example 11
def compute_metrics_ner(p: EvalPrediction, id_to_label):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Remove ignored index (special tokens)
    true_predictions = [[
        id_to_label[p] for (p, l) in zip(prediction, label) if l != -100
    ] for prediction, label in zip(predictions, labels)]
    true_labels = [[
        id_to_label[l] for (p, l) in zip(prediction, label) if l != -100
    ] for prediction, label in zip(predictions, labels)]

    from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
    report = classification_report(true_labels,
                                   true_predictions,
                                   output_dict=True)
    return {
        "accuracy_score": accuracy_score(true_labels, true_predictions),
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "macro_precision": report["macro avg"]["precision"],
        "macro_recall": report["macro avg"]["recall"],
        "macro_f1": report["macro avg"]["f1-score"],
        "micro_precision": report["micro avg"]["precision"],
        "micro_recall": report["micro avg"]["recall"],
        "micro_f1": report["micro avg"]["f1-score"],
        "weighted_precision": report["weighted avg"]["precision"],
        "weighted_recall": report["weighted avg"]["recall"],
        "weighted_f1": report["weighted avg"]["f1-score"]
    }
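A toy illustration of the report structure the snippet above indexes into: with output_dict=True, seqeval's classification_report returns nested dicts keyed by the class names plus 'micro avg', 'macro avg' and 'weighted avg':

from seqeval.metrics import classification_report

true_labels = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O', 'O']]
true_predictions = [['B-PER', 'I-PER', 'O'], ['O', 'O', 'O']]

report = classification_report(true_labels, true_predictions, output_dict=True)
print(report['micro avg'])              # {'precision': ..., 'recall': ..., 'f1-score': ..., 'support': ...}
print(report['macro avg']['f1-score'])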
Example 12
    def get_metric(self, reset: bool):
        if not reset:
            return {}

        all_prediction_sequence = []
        all_gold_sequence = []
        results = []
        for doc_id in self.gold_labels.keys():
            prediction = self.span_to_label_sequence(self.prediction[doc_id])
            gold = self.span_to_label_sequence(self.gold_labels[doc_id])
            all_prediction_sequence.append(prediction)
            all_gold_sequence.append(gold)
            results.append({
                "words": self.doc_id_to_words[doc_id],
                "gold": gold,
                "prediction": prediction
            })

        if self.prediction_save_path is not None:
            with open(self.prediction_save_path, "w") as f:
                json.dump(results, f)

        return dict(
            f1=f1_score(all_gold_sequence,
                        all_prediction_sequence,
                        scheme=IOB2),
            precision=precision_score(all_gold_sequence,
                                      all_prediction_sequence,
                                      scheme=IOB2),
            recall=recall_score(all_gold_sequence,
                                all_prediction_sequence,
                                scheme=IOB2),
        )
Example 13
def evaluate(y_pred, id_to_word, test_words, test_tags, id_to_tag, model_name, training_time, dataset):
    """
    Evaluate the current model using CoNLL-style (seqeval) metrics and print a classification report.
    """
    true_list = []
    pred_list = []
    word_true_pred = []

    print("Calculating results and printing classification report")
    print(54 * "-")
    for i in range(len(y_pred)):
        p = np.argmax(y_pred[i], axis=-1)
        for word_id, tag_id, pred in zip(test_words[i], test_tags[i], p):
            if id_to_word[word_id] != 'PAD':
                true_list.append(id_to_tag[tag_id])
                pred_list.append(id_to_tag[pred])
                word_true_pred.append([id_to_word[word_id], id_to_tag[tag_id], id_to_tag[pred]])
    today = date.today()
    current_time = today.strftime("%d%m%Y")
    save_obj(word_true_pred, 'word_true_pred' + str(current_time))
    precision = precision_score(true_list, pred_list)
    recall = recall_score(true_list, pred_list)
    f1score = f1_score(true_list, pred_list)
    print(classification_report(true_list, pred_list))
    cur_list = [model_name, dataset, training_time, precision, recall, f1score]
    return cur_list
Example 14
def get_slot_metrics(preds, labels):
    assert len(preds) == len(labels)
    return OrderedDict({
        "Slot Precision": precision_score(labels, preds),
        "Slot Recall": recall_score(labels, preds),
        "Slot F1": f1_score(labels, preds)
    })
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    filtered_labels = []
    filtered_preds = []

    for i in range(labels.shape[0]):
        filtered_labels_inner = []
        filtered_preds_inner = []

        for j in range(labels.shape[1]):

            if labels[i][j] != -100:
                filtered_labels_inner.append(id_to_label(labels[i][j]))
                filtered_preds_inner.append(id_to_label(preds[i][j]))

        filtered_labels.append(filtered_labels_inner)
        filtered_preds.append(filtered_preds_inner)

    return {
        'accuracy': metrics.accuracy_score(filtered_labels, filtered_preds),
        'f1': metrics.f1_score(filtered_labels, filtered_preds),
        'precision': metrics.precision_score(filtered_labels, filtered_preds),
        'recall': metrics.recall_score(filtered_labels, filtered_preds),
    }
Example 16
    def _eval_end(self, outputs):
        "Task specific validation"
        val_loss_mean = torch.stack([x["val_loss"] for x in outputs]).mean()
        preds = np.concatenate([x["pred"] for x in outputs], axis=0)
        preds = np.argmax(preds, axis=2)
        out_label_ids = np.concatenate([x["target"] for x in outputs], axis=0)

        label_map = {i: label for i, label in enumerate(self.labels)}
        out_label_list = [[] for _ in range(out_label_ids.shape[0])]
        preds_list = [[] for _ in range(out_label_ids.shape[0])]

        for i in range(out_label_ids.shape[0]):
            for j in range(out_label_ids.shape[1]):
                if out_label_ids[i, j] != self.pad_token_label_id:
                    out_label_list[i].append(label_map[out_label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        results = {
            "val_loss": val_loss_mean,
            "precision": precision_score(out_label_list, preds_list),
            "recall": recall_score(out_label_list, preds_list),
            "f1": f1_score(out_label_list, preds_list),
        }

        if self.is_logger():
            logger.info(self.proc_rank)
            logger.info("***** Eval results *****")
            for key in sorted(results.keys()):
                logger.info("  %s = %s", key, str(results[key]))

        tensorboard_logs = results
        ret = {k: v for k, v in results.items()}
        ret["log"] = tensorboard_logs
        return ret, preds_list, out_label_list
Example 17
def compute_metrics(preds_list, out_label_list):

    return {
            "precision": precision_score(out_label_list, preds_list),
            "recall": recall_score(out_label_list, preds_list),
            "f1": f1_score(out_label_list, preds_list),
        }
Example 18
    def __call__(self, eval_pred: EvalPrediction) -> Dict:
        """Computes accuracy precision, recall and f1 based on the list of IOB2 labels. 
        Positions with labels with a value of -100 will be filtered out both from true labela dn prediction.

        Args:
            eval_pred (EvalPrediction): the predictions and targets to be matched as np.ndarrays.

        Returns:
            (Dict): a dictionary with accuracy_score, precision, recall and f1.
        """
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=-1)

        # Remove ignored index (special tokens)
        true_predictions = [
            [self.label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [self.label_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        print("\n"+" " * 80)
        print(classification_report(true_labels, true_predictions))
        return {
            "accuracy_score": accuracy_score(true_labels, true_predictions),
            "precision": precision_score(true_labels, true_predictions),
            "recall": recall_score(true_labels, true_predictions),
            "f1": f1_score(true_labels, true_predictions),
        }
Example 19
    def on_epoch_end(self, epoch, logs=None):
        if (epoch + 1) % self.step == 0:
            y_pred = self.kash_model.predict(self.valid_x,
                                             batch_size=self.batch_size)

            if self.kash_model.task in [
                    macros.TaskType.LABELING, macros.TaskType.RAW_LABELING
            ]:
                y_true = [
                    seq[:len(y_pred[index])]
                    for index, seq in enumerate(self.valid_y)
                ]
                precision = seq_metrics.precision_score(y_true, y_pred)
                recall = seq_metrics.recall_score(y_true, y_pred)
                f1 = seq_metrics.f1_score(y_true, y_pred)
            else:
                y_true = self.valid_y
                precision = metrics.precision_score(y_true,
                                                    y_pred,
                                                    average=self.average)
                recall = metrics.recall_score(y_true,
                                              y_pred,
                                              average=self.average)
                f1 = metrics.f1_score(y_true, y_pred, average=self.average)

            self.logs[epoch] = {
                'precision': precision,
                'recall': recall,
                'f1': f1
            }
            print(
                f"\nepoch: {epoch} precision: {precision:.6f}, recall: {recall:.6f}, f1: {f1:.6f}"
            )
Example 20
def eval(args, model, dataset, label_list):
    dataloader = DataLoader(dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            collate_fn=collate_fn)

    all_true_labels, all_pred_labels = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            inputs = {k: v.to(args.device) for k, v in batch.items()}
            outputs = model(**inputs)

            #predictions = outputs[1].permute(1, 0, 2).detach().cpu().numpy()
            predictions = outputs[1]
            label_ids = batch['label_ids'].permute(1, 0).detach().cpu().numpy()

            preds_list, out_label_list = align_predictions(
                predictions, label_ids, label_list)

            all_true_labels += out_label_list
            all_pred_labels += preds_list

    report = classification_report(all_true_labels, all_pred_labels)
    logger.info(report)

    return {
        'precision': precision_score(all_true_labels, all_pred_labels),
        'recall': recall_score(all_true_labels, all_pred_labels),
        'f1': f1_score(all_true_labels, all_pred_labels),
    }
Example 21
def all_metrics(pred_tag, true_tag):

    # seqeval metrics expect (y_true, y_pred), so pass the gold tags first
    print(classification_report(true_tag, pred_tag))
    print('=' * 25)
    print("Precision: \t", precision_score(true_tag, pred_tag))
    print("Recall: \t", recall_score(true_tag, pred_tag))
    print("F1: \t\t", f1_score(true_tag, pred_tag))
Example 22
def test(dataloader, model, label_list):
    size = len(dataloader.dataset)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        all_pred_y = list()
        all_y = list()
        print("testing")
        for X, y, masks in tqdm(dataloader):
            X, y = X.to(device), y.to(device)
            masks = masks.to(device)
            # output = model(X)
            # loss = 0 - crf(output, y)
            loss, pred_y = model(X, y, masks)
            test_loss += loss
            all_pred_y.extend(pred_y)
            all_y.extend(y.cpu().numpy().tolist())
    all_pred_y_label = [[label_list[t1] for t1 in t2] for t2 in all_pred_y]
    all_y_label = [[label_list[t1] for t1 in t2] for t2 in all_y]
    # seqeval metrics expect (y_true, y_pred): gold labels first, predictions second
    print('p', precision_score(all_y_label, all_pred_y_label))
    print('r', recall_score(all_y_label, all_pred_y_label))
    print('f1', f1_score(all_y_label, all_pred_y_label))
    print('acc', accuracy_score(all_y_label, all_pred_y_label))
    test_loss /= size
    print(f"Avg loss: {test_loss:>8f} \n")
Example 23
    def _eval_end(self, outputs):
        """Evaluation called for both Val and Test"""
        val_loss_mean = torch.stack([x['val_loss'] for x in outputs]).mean()
        preds = np.concatenate([x['pred'] for x in outputs], axis=0)
        preds = np.argmax(preds, axis=2)
        out_label_ids = np.concatenate([x['target'] for x in outputs], axis=0)

        label_map = {i: label for i, label in enumerate(self.labels)}
        out_label_list = [[] for _ in range(out_label_ids.shape[0])]
        preds_list = [[] for _ in range(out_label_ids.shape[0])]

        for i in range(out_label_ids.shape[0]):
            for j in range(out_label_ids.shape[1]):
                if out_label_ids[i, j] != self.pad_token_label_id:
                    out_label_list[i].append(label_map[out_label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        results = {
            'val_loss': val_loss_mean,
            'precision': precision_score(out_label_list, preds_list),
            'recall': recall_score(out_label_list, preds_list),
            'f1': f1_score(out_label_list, preds_list),
        }

        ret = {k: v for k, v in results.items()}
        ret['log'] = results
        return ret, preds_list, out_label_list
Example 24
    def evaluate(self, data, labels):
        """Evaluate the performance of ner model.

        Args:
            data: list of tokenized texts, like ``[['我', '是', '中', '国', '人']]``
            labels: list of list of str, the corresponding label strings

        """
        features, y = self.preprocessor.prepare_input(data, labels)
        pred_probs = self.model.predict(features)
        if self.preprocessor.use_bert:
            pred_probs = pred_probs[:, 1:-1, :]  # remove the [CLS] and [SEP] positions

        lengths = [
            min(len(label), pred_prob.shape[0])
            for label, pred_prob in zip(labels, pred_probs)
        ]
        y_pred = self.preprocessor.label_decode(pred_probs, lengths)

        r = metrics.recall_score(labels, y_pred)
        p = metrics.precision_score(labels, y_pred)
        f1 = metrics.f1_score(labels, y_pred)

        print('Recall: {}, Precision: {}, F1: {}'.format(r, p, f1))
        print(metrics.classification_report(labels, y_pred))
        return f1
Example 25
    def _eval_end(self, outputs):
        val_loss_mean = T.stack([x["val_loss"] for x in outputs]).mean()
        preds = np.concatenate([x["pred"] for x in outputs], axis=0)
        preds = np.argmax(preds, axis=2)
        out_label_ids = np.concatenate([x["target"] for x in outputs], axis=0)

        label_map = {i: label for i, label in enumerate(self.labels)}
        out_label_list = [[] for _ in range(out_label_ids.shape[0])]
        preds_list = [[] for _ in range(out_label_ids.shape[0])]

        for i in range(out_label_ids.shape[0]):
            for j in range(out_label_ids.shape[1]):
                if out_label_ids[i, j] != self.pad_token_label_id:
                    out_label_list[i].append(label_map[out_label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        results = {
            "val_loss": val_loss_mean,
            "precision": precision_score(out_label_list, preds_list),
            "recall": recall_score(out_label_list, preds_list),
            "f1": f1_score(out_label_list, preds_list),
        }

        ret = {k: v for k, v in results.items()}
        ret["log"] = results
        return ret, preds_list, out_label_list
Example 26
def weighted_voting(args, dataset, src_probs, labels, pad_token_label_id):
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    out_label_ids = None
    for batch in eval_dataloader:
        out_label_ids = batch[3] if out_label_ids is None else np.append(out_label_ids, batch[3], axis=0)

    preds = np.argmax(src_probs.cpu().numpy(), axis=-1)

    label_map = {i: label for i, label in enumerate(labels)}

    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
    preds_list = [[] for _ in range(out_label_ids.shape[0])]

    for i in range(out_label_ids.shape[0]):
        for j in range(out_label_ids.shape[1]):
            if out_label_ids[i, j] != pad_token_label_id:
                out_label_list[i].append(label_map[out_label_ids[i][j]])
                preds_list[i].append(label_map[preds[i][j]])

    results = {
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list)
    }

    logger.info("***** Eval results %s *****", os.path.basename(args.data_dir))
    for key in sorted(results.keys()):
        logger.info("  %s = %s", key, str(results[key]))

    return results, preds_list
Example 27
    def compute_metrics(p: EvalPrediction) -> Dict:
        preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
        return {
            "precision": precision_score(out_label_list, preds_list),
            "recall": recall_score(out_label_list, preds_list),
            "f1": f1_score(out_label_list, preds_list),
        }
Example 28
    def test_epoch_end(self, outputs: List[Dict[str, torch.Tensor]]):
        preds = np.concatenate(
            [x["pred"].detach().cpu().numpy() for x in outputs], axis=0)
        preds = np.argmax(preds, axis=2)
        target_ids = np.concatenate(
            [x["target"].detach().cpu().numpy() for x in outputs], axis=0)

        target_list: StrListList = [[] for _ in range(target_ids.shape[0])]
        preds_list: StrListList = [[] for _ in range(target_ids.shape[0])]
        for i in range(target_ids.shape[0]):
            for j in range(target_ids.shape[1]):
                if target_ids[i][j] != PAD_TOKEN_LABEL_ID:
                    target_list[i].append(
                        self.label_ids_to_label[target_ids[i][j]])
                    preds_list[i].append(self.label_ids_to_label[preds[i][j]])

        accuracy = accuracy_score(target_list, preds_list)
        precision = precision_score(target_list,
                                    preds_list,
                                    mode="strict",
                                    scheme=BILOU)
        recall = recall_score(target_list,
                              preds_list,
                              mode="strict",
                              scheme=BILOU)
        f1 = f1_score(target_list, preds_list, mode="strict", scheme=BILOU)
        self.log("test_accuracy", accuracy)
        self.log("test_precision", precision)
        self.log("test_recall", recall)
        self.log("test_f1", f1)
Example 29
def calculate_token_class_metrics(pred_toks, targ_toks, metric_key):
    if (metric_key == 'accuracy'): return seq_metrics.accuracy_score(targ_toks, pred_toks)
    if (metric_key == 'precision'): return seq_metrics.precision_score(targ_toks, pred_toks)
    if (metric_key == 'recall'): return seq_metrics.recall_score(targ_toks, pred_toks)
    if (metric_key == 'f1'): return seq_metrics.f1_score(targ_toks, pred_toks)

    if (metric_key == 'classification_report'): return seq_metrics.classification_report(targ_toks, pred_toks)
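Toy usage of the dispatcher above, assuming the original module imports seqeval.metrics as seq_metrics:

# Hypothetical call; requires `import seqeval.metrics as seq_metrics` in scope.
targ_toks = [['B-PER', 'I-PER', 'O']]
pred_toks = [['B-PER', 'I-PER', 'O']]

for key in ('accuracy', 'precision', 'recall', 'f1'):
    print(key, calculate_token_class_metrics(pred_toks, targ_toks, key))
print(calculate_token_class_metrics(pred_toks, targ_toks, 'classification_report'))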
Example 30
def model_evaluate(model, data, label, tag2id, batch_size, seq_len_list):
    id2tag = {value: key for key, value in tag2id.items()}
    pred_logits = model.predict(data, batch_size=batch_size)
    # pred shape [batch_size, max_len]
    preds = np.argmax(pred_logits, axis=2).tolist()

    assert len(preds) == len(seq_len_list)
    # get predicted labels
    predict_label = []
    target_label = []
    for i in range(len(preds)):
        pred = preds[i][1:]
        temp = []
        true_label = label[i][:min(seq_len_list[i], len(pred))]
        for j in range(min(seq_len_list[i], len(pred))):
            temp.append(id2tag[pred[j]])
        assert len(temp) == len(true_label)
        target_label.append(true_label)
        predict_label.append(temp)

    # compute precision, recall and f1_score
    precision = precision_score(target_label,
                                predict_label,
                                average="macro",
                                zero_division=0)
    recall = recall_score(target_label,
                          predict_label,
                          average="macro",
                          zero_division=0)
    f1 = f1_score(target_label,
                  predict_label,
                  average="macro",
                  zero_division=0)
    logger.info(classification_report(target_label, predict_label))
    return precision, recall, f1
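A short sketch of the average and zero_division keyword arguments used above, on toy data (seqeval's default average is "micro"; imports assumed from seqeval.metrics):

from seqeval.metrics import f1_score, precision_score

target_label = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O', 'O']]
predict_label = [['B-PER', 'I-PER', 'O'], ['O', 'O', 'O']]

print(precision_score(target_label, predict_label, average="micro", zero_division=0))  # 1.0
print(precision_score(target_label, predict_label, average="macro", zero_division=0))  # 0.5: LOC has no predictions
print(f1_score(target_label, predict_label, average="macro", zero_division=0))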