Example #1
    def evaluate_ner_seq_eval(self, batch_ner_labels, batch_ner_predictions,
                              labels: List[str], partition, head_identifier):
        id2label = {}
        entity_labels = labels
        for idx, label in enumerate(entity_labels):
            if label.endswith('NP'):
                label = label[:2] + head_identifier.split('_')[-1]
            elif label == 'BERT_TOKEN':
                label = 'O'
            id2label[idx] = label
        ner_ground_truth = [[id2label[idx] for idx in seq]
                            for seq in batch_ner_labels]
        ner_predictions = [[id2label[idx] for idx in seq]
                           for seq in batch_ner_predictions]

        # Lenient (default-mode) entity-level report under the IOB2 tagging scheme
        default_results = classification_report(y_true=ner_ground_truth,
                                                y_pred=ner_predictions,
                                                output_dict=True,
                                                digits=3,
                                                mode='default',
                                                scheme=IOB2)
        default_results['performance'] = performance_measure(
            y_true=ner_ground_truth, y_pred=ner_predictions)
        default_results = {
            metric_group1:
            {metric: float(value)
             for metric, value in metric_group2.items()}
            for metric_group1, metric_group2 in default_results.items()
        }

        # Strict entity-level report: exact span and type match under IOB2
        strict_results = classification_report(y_true=ner_ground_truth,
                                               y_pred=ner_predictions,
                                               output_dict=True,
                                               digits=3,
                                               mode='strict',
                                               scheme=IOB2)
        strict_results['performance'] = performance_measure(
            y_true=ner_ground_truth, y_pred=ner_predictions)
        strict_results = {
            metric_group1:
            {metric: float(value)
             for metric, value in metric_group2.items()}
            for metric_group1, metric_group2 in strict_results.items()
        }

        mlflow.log_dict(dict(lenient=default_results, strict=strict_results),
                        f"{partition}/{self.epoch}/{head_identifier}.json")
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("predictions")
    parser.add_argument("input_df_pkl")
    args = parser.parse_args()

    test_df = pd.read_pickle(args.input_df_pkl)
    y_pred = []
    with open(args.predictions) as fp:
        for line in fp:
            y_pred += line.strip().split()

    # Flatten the per-sentence gold tag lists into one flat tag sequence
    y_true = [tag for seq in test_df.entity_type for tag in seq]
    y_true = pd.Series(y_true)
    y_pred = pd.Series(y_pred)

    print(len(y_true))
    print(len(y_pred))

    print("Token level")
    eval_d = ner_report(y_true, y_pred, mode="token", return_dict=True)
    eval_d = dict(eval_d["PATHWAY"])
    with open("pathway_metrics_token.json", "w") as fp:
        json.dump(eval_d, fp)
        fp.write("\n")
    pprint(eval_d)

    print("Entity level")
    y_pred_corr = pd.Series(correct_iob(y_pred))
    eval_d = ner_report(y_true, y_pred_corr, mode="entity", return_dict=True)
    eval_d = dict(eval_d["PATHWAY"])
    with open("pathway_metrics_entity.json", "w") as fp:
        json.dump(eval_d, fp)
        fp.write("\n")
    pprint(eval_d)

    print("Seqeval")
    y_true = list(test_df.entity_type)
    y_pred = []
    with open(args.predictions) as fp:
        for line in fp:
            y_pred.append(line.strip().split())

    # Distribution of gold tags, as a quick sanity check
    from collections import Counter
    c = Counter()
    for x in y_true:
        c.update(x)
    print(c)
    # Token-level accuracy computed by hand, then compared with seqeval's accuracy_score
    total = 0
    for s1, s2 in zip(y_true, y_pred):
        total += sum(t1 == t2 for t1, t2 in zip(s1, s2))
    acc = total / sum(len(s) for s in y_true)
    print("acc:", acc)
    print("acc_score:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred, scheme=IOB2, mode="strict"))
    print(performance_measure(y_true, y_pred))
Example #3
    def test_performance_measure(self):
        y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'B-ORG'],
                  ['B-PER', 'I-PER', 'O', 'B-PER']]
        y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'O'],
                  ['B-PER', 'I-PER', 'O', 'B-MISC']]
        performance_dict = performance_measure(y_true, y_pred)
        self.assertDictEqual(performance_dict, {
            'FN': 1,
            'FP': 3,
            'TN': 4,
            'TP': 3
        })
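Example #8 below derives an accuracy from the same four counts via (TP + TN) / (TP + TN + FP + FN); a tiny sketch of that computation, using the counts asserted in this test:

# Deriving an accuracy from the tag-level confusion counts,
# mirroring the (TP + TN) / (TP + TN + FP + FN) computation in Example #8.
perf = {'TP': 3, 'FP': 3, 'FN': 1, 'TN': 4}  # the counts asserted in the test above
accuracy = (perf['TP'] + perf['TN']) / (perf['TP'] + perf['TN'] + perf['FP'] + perf['FN'])
print(f"accuracy: {accuracy:.3f}")  # 7 / 11 = 0.636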
Example #4
def train_epoch(model, criterion, optimizer, data, tag2idx, idx2tag, device,
                scheduler):
    epoch_loss = 0
    epoch_metrics = FMeasureStorage()

    model.train()

    for batch in data:
        tokens = batch[0].to(device)
        tags = batch[1].to(device)

        batch_element_length = len(tags[0])

        predictions = model(tokens)
        predictions = predictions.view(-1, predictions.shape[-1])

        tags_mask = tags != tag2idx['<pad>']
        tags_mask = tags_mask.view(-1)
        labels = torch.where(
            tags_mask, tags.view(-1),
            torch.tensor(criterion.ignore_index).type_as(tags))

        loss = criterion(predictions, labels)

        predictions = predictions.argmax(dim=1)

        predictions = predictions.cpu().numpy()
        labels = labels.cpu().numpy()

        # clear <PAD>, CLS and SEP tags from both labels and predictions
        clear_labels, clear_predictions = clear_tags(labels, predictions,
                                                     idx2tag, tag2idx,
                                                     batch_element_length)

        iteration_result = performance_measure(clear_labels, clear_predictions)

        # FMeasureStorage.__add__ appears to fold the batch TP/FP/FN/TN counts into the running totals
        epoch_metrics + iteration_result
        epoch_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        if scheduler:
            scheduler.step()
        torch.cuda.empty_cache()

    epoch_f1_score, epoch_precision, epoch_recall = epoch_metrics.report()
    print(
        'Train Loss = {:.5f}, F1-score = {:.3%}, Precision = {:.3%}, Recall = {:.3%}'
        .format(epoch_loss / len(data), epoch_f1_score, epoch_precision,
                epoch_recall))
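FMeasureStorage and clear_tags are project-specific helpers that are not shown in these listings. From the way the loop uses them, FMeasureStorage appears to accumulate the TP/FP/FN/TN dictionaries returned by performance_measure (the bare `epoch_metrics + iteration_result` suggests that __add__ updates the store in place and the returned value is deliberately discarded), and report() appears to return micro-averaged F1, precision, and recall. A hypothetical minimal sketch under those assumptions, not the project's actual implementation:

# Hypothetical FMeasureStorage-style accumulator; the name and behaviour are
# assumptions inferred from the training loop above, not the real class.
class FMeasureStorage:
    def __init__(self):
        self.counts = {'TP': 0, 'FP': 0, 'FN': 0, 'TN': 0}

    def __add__(self, batch_counts):
        # Fold one performance_measure() dict into the running totals in place,
        # matching the `epoch_metrics + iteration_result` call pattern.
        for key, value in batch_counts.items():
            self.counts[key] += value
        return self

    def report(self):
        # Micro-averaged precision/recall/F1 from the accumulated token counts.
        tp, fp, fn = self.counts['TP'], self.counts['FP'], self.counts['FN']
        precision = tp / (tp + fp) if tp + fp else 0.0
        recall = tp / (tp + fn) if tp + fn else 0.0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
        return f1, precision, recall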
Example #5
def test_epoch(model, criterion, data, tag2idx, idx2tag, device):
    name = '[Final] Test :'
    epoch_loss = 0
    epoch_metrics = FMeasureStorage()

    model.eval()

    with torch.no_grad():
        with tqdm(total=len(data)) as progress_bar:
            for batch in data:
                tokens = batch[0].to(device)
                tags = batch[1].to(device)

                batch_element_length = len(tags[0])

                predictions, _ = model(tokens)
                predictions = predictions.view(-1, predictions.shape[-1])
                tags_mask = tags != tag2idx['<PAD>']
                tags_mask = tags_mask.view(-1)
                labels = torch.where(tags_mask, tags.view(-1), torch.tensor(criterion.ignore_index).type_as(tags))

                loss = criterion(predictions, labels)

                predictions = predictions.argmax(dim=1)

                predictions = predictions.cpu().numpy()
                labels = labels.cpu().numpy()

                # clear <PAD>, CLS and SEP tags from both labels and predictions
                clear_labels, clear_predictions = clear_tags(labels, predictions, idx2tag, tag2idx, batch_element_length)

                iteration_result = performance_measure(clear_labels, clear_predictions)

                epoch_metrics + iteration_result
                epoch_loss += loss.item()

                progress_bar.update()
                # Per-batch F1 is not tracked here; 0 is printed as a placeholder until the epoch-level report below
                progress_bar.set_description(
                    '{:>5s} Loss = {:.5f}, F1-score = {:.2%}'.format(name, loss.item(), 0))

            epoch_f1_score, epoch_precision, epoch_recall = epoch_metrics.report()
            progress_bar.set_description(
                '{:>5s} Loss = {:.5f}, F1-score = {:.2%}'.format(name, epoch_loss / len(data), epoch_f1_score))
Example #6
def test_epoch(model, criterion, data, tag2idx, idx2tag, device):
    epoch_loss = 0
    epoch_metrics = FMeasureStorage()

    model.eval()

    with torch.no_grad():
        for batch in data:
            tokens = batch[0].to(device)
            tags = batch[1].to(device)

            batch_element_length = len(tags[0])

            predictions = model(tokens)
            predictions = predictions.view(-1, predictions.shape[-1])
            tags_mask = tags != tag2idx['<pad>']
            tags_mask = tags_mask.view(-1)
            labels = torch.where(
                tags_mask, tags.view(-1),
                torch.tensor(criterion.ignore_index).type_as(tags))

            loss = criterion(predictions, labels)

            predictions = predictions.argmax(dim=1)

            predictions = predictions.cpu().numpy()
            labels = labels.cpu().numpy()

            # clear <PAD>, CLS and SEP tags from both labels and predictions
            clear_labels, clear_predictions = clear_tags(
                labels, predictions, idx2tag, tag2idx, batch_element_length)

            iteration_result = performance_measure(clear_labels,
                                                   clear_predictions)

            epoch_metrics + iteration_result
            epoch_loss += loss.item()

    epoch_f1_score, epoch_precision, epoch_recall = epoch_metrics.report()
    print(
        'Test  Loss = {:.5f}, F1-score = {:.3%}, Precision = {:.3%}, Recall = {:.3%}'
        .format(epoch_loss / len(data), epoch_f1_score, epoch_precision,
                epoch_recall))
Example #7
    def evaluate(self, X, y, sentences=None):
        """
        Evaluate the model using the given data.

        Parameters
        ----------
        X: Evaluation data.
        y: Evaluation data labels.
        sentences: Evaluation data sentences, used to print the wrong results.

        Returns
        -------
        Evaluation score (precision, recall, and f1-score)
        """
        y_pred = self.predict(X)
        # Map IOB labels to integer class ids for the token-level sklearn metrics;
        # labels outside this mapping are skipped.
        label2idx = {'O': 0, 'B-ASPECT': 1, 'I-ASPECT': 2,
                     'B-SENTIMENT': 3, 'I-SENTIMENT': 4}
        y_true = [label2idx[label] for seq in y for label in seq
                  if label in label2idx]
        y_pred2 = [label2idx[label] for seq in y_pred for label in seq
                   if label in label2idx]

        print("Confusion Matrix:")
        print(confusion_matrix(y_true, y_pred2))
        print()
        print("Precision:")
        print(precision_score(y_true, y_pred2, average=None))
        print()
        print("Recall:")
        print(recall_score(y_true, y_pred2, average=None))
        print()
        print("F1-score:")
        print(f1_score(y_true, y_pred2, average=None))
        print()
        print("Report (entity level):")
        print(classification_report(y, y_pred))
        print(performance_measure(y, y_pred))

        if sentences is not None:
            self.get_wrong_predictions(y, y_pred, sentences)
Example #8
    def eval_ner(self, dataset_id, corr_labels, pred_labels, train_data,
                 test_data):
        # compute entity-level metrics
        metrics = classification_report(corr_labels, pred_labels, digits=4)
        metrics = pd.read_csv(StringIO(metrics), sep=' {2,}',
                              engine='python') * 100

        # sort the labels alphabetically and rename columns
        metrics.sort_index(inplace=True)
        metrics.rename(columns={
            'precision': 'Prec',
            'recall': 'Rec',
            'f1-score': 'F1'
        },
                       inplace=True)

        # append the prefix B- to all tags in order to compute token-level metrics
        corr_labels_t = [[
            'B' + ent[1:] if ent[0] in ('B', 'I') else 'B-' + ent
            for ent in sent
        ] for sent in corr_labels]
        pred_labels_t = [[
            'B' + ent[1:] if ent[0] in ('B', 'I') else 'B-' + ent
            for ent in sent
        ] for sent in pred_labels]

        # compute token-level metrics
        metrics_t = classification_report(corr_labels_t,
                                          pred_labels_t,
                                          digits=4)
        metrics_t = pd.read_csv(
            StringIO(metrics_t), sep=' {2,}', engine='python') * 100

        # sort the labels alphabetically and rename columns
        metrics_t.sort_index(inplace=True)
        metrics_t.rename(columns={
            'precision': 'Prec',
            'recall': 'Rec',
            'f1-score': 'F1'
        },
                         inplace=True)

        # compute performance metrics
        perf = performance_measure(corr_labels, pred_labels)
        tp, tn, fp, fn = perf['TP'], perf['TN'], perf['FP'], perf['FN']

        # compute entity- and token-level accuracy
        ent_acc = round((tp + tn) / (tp + tn + fp + fn) * 100, 2)
        tok_acc = round(accuracy_score(corr_labels, pred_labels) * 100, 2)

        # obtain overall Prec, Rec and F1 for entity- and token-level
        ent_avg = metrics.loc['micro avg'].drop('support')
        tok_avg = metrics_t.loc['macro avg'].drop('support')

        ent_avg = pd.concat([pd.Series({'Acc': ent_acc}), ent_avg])
        tok_avg = pd.concat([pd.Series({'Acc': tok_acc}), tok_avg])

        metrics_avg = pd.concat([ent_avg, tok_avg],
                                keys=['Entity Spans', 'Tokens'])
        self.ner_metrics[dataset_id] = metrics_avg

        # obtain F1 score at the entity- and token-level per entity type
        ent_f1 = metrics['F1'].drop(['micro avg', 'macro avg'])
        tok_f1 = metrics_t['F1'].drop(['O', 'micro avg', 'macro avg'])

        metrics_f1 = pd.concat([ent_f1, tok_f1],
                               keys=['Entity Spans', 'Tokens'])
        self.ent_type_f1[dataset_id] = metrics_f1