Code Example #1
def evaluate_rnn(y, preds):
    """Because the RNN sequences get clipped as necessary based
    on the `max_length` parameter, they have to be realigned to
    get a classification report. This method does that, building
    in the assumption that any clipped tokens are assigned an
    incorrect label.

    Parameters
    ----------
    y : list of list of labels
    preds : list of list of labels

    Both of these lists need to have the same length, but the
    sequences they contain can vary in length.
    """
    labels = sorted({c for ex in y for c in ex})
    new_preds = []
    for gold, pred in zip(y, preds):
        delta = len(gold) - len(pred)
        if delta > 0:
            # Make a *wrong* guess for these clipped tokens:
            pred += [random.choice(list(set(labels)-{label}))
                     for label in gold[-delta: ]]
        new_preds.append(pred)
    labels = sorted({cls for ex in y for cls in ex} - {'OTHER'})
    data = {}
    data['classification_report'] = flat_classification_report(y, new_preds)
    data['f1_macro'] = flat_f1_score(y, new_preds, average='macro')
    data['f1_micro'] = flat_f1_score(y, new_preds, average='micro')
    data['f1'] = flat_f1_score(y, new_preds, average=None)
    data['precision_score'] = flat_precision_score(y, new_preds, average=None)
    data['recall_score'] = flat_recall_score(y, new_preds, average=None)
    data['accuracy'] = flat_accuracy_score(y, new_preds)
    data['sequence_accuracy_score'] = sequence_accuracy_score(y, new_preds)
    return data
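A minimal usage sketch (not from the original project): it assumes evaluate_rnn sits in a module where random and the sklearn_crfsuite.metrics helpers it calls are imported, for example:

import random
from sklearn_crfsuite.metrics import (
    flat_accuracy_score, flat_classification_report, flat_f1_score,
    flat_precision_score, flat_recall_score, sequence_accuracy_score)

gold = [["PER", "O", "O"], ["O", "LOC"]]
preds = [["PER", "O", "O"], ["O"]]  # second sequence was clipped by max_length
results = evaluate_rnn(gold, preds)
print(results["sequence_accuracy_score"])  # 0.5: the clipped token counts as wrong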
Code Example #2
    def run(self, batches: Generator) -> None:
        """
        Runs the trained CRF model on the supplied batches and prints
        evaluation metrics.
        """
        st = time.time()

        x = []
        y = []

        # For prediction, CRF does not implement batching, so we pass a list
        for batch in batches:
            b = list(batch)
            x.extend(b[0])
            y.extend(b[1])

        accuracy = self.model.score(x, y)
        y_pred = self.model.predict(x)
        f1_score = metrics.flat_f1_score(y, y_pred, average='weighted')
        accuracy_sentence = metrics.sequence_accuracy_score(y, y_pred)
        classification_report = metrics.flat_classification_report(
            y, y_pred, labels=self.model.classes_)
        print("*" * 80)
        print("MODEL EVALUATION")
        print("*" * 80)
        print("Token-wise accuracy score on Test Data:")
        print(round(accuracy, 3))
        print("F1 score on Test Data:")
        print(round(f1_score, 3))
        print(
            "Sequence accuracy score (% of sentences scored 100% correctly):")
        print(round(accuracy_sentence, 3))
        print("Class-wise classification report:")
        print(classification_report)
        et = time.time()
        print(f"Evaluation finished in {round(et-st, 2)} seconds.")
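The batches generator is expected to yield (features, labels) pairs, since run() unpacks each batch as b[0] and b[1]; a minimal hypothetical sketch of such a generator:

def toy_batches():
    # Each batch is a (features, labels) pair, so list(batch) == [features, labels].
    X = [[{"bias": 1.0, "word": "John"}, {"bias": 1.0, "word": "runs"}]]
    y = [["B-PER", "O"]]
    yield X, y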
Code Example #3
    def evaluate(self, output_path):
        loss = self._model.evaluate(self._data_reader.test_X,
                                    self._data_reader.test_y)
        print('Loss is: %f' % loss)

        all_predicted_labels = []
        all_true_labels = []
        for i, _test_instance in enumerate(self._data_reader.test_X):
            test_prediction = self._model.predict(
                _test_instance.reshape(
                    1, self._data_reader.max_train_sentence_length))[0]

            predicted_labels, true_labels = [], []
            for encoded_true_label_array, encoded_test_label_array in zip(
                    self._data_reader.test_y[i], test_prediction):
                contains_all_zeros = not numpy.any(encoded_true_label_array)
                if not contains_all_zeros:
                    predicted_labels.append(
                        self._data_reader.decode_single_label(
                            encoded_test_label_array))
                    true_labels.append(
                        self._data_reader.decode_single_label(
                            encoded_true_label_array))

            all_predicted_labels.append(predicted_labels)
            all_true_labels.append(true_labels)

        classification_report = metrics.flat_classification_report(
            all_true_labels,
            all_predicted_labels,
            labels=self._data_reader.labels)

        sequence_accuracy = metrics.sequence_accuracy_score(
            all_true_labels, all_predicted_labels)

        precision = metrics.flat_precision_score(all_true_labels,
                                                 all_predicted_labels,
                                                 average='weighted')

        recall = metrics.flat_recall_score(all_true_labels,
                                           all_predicted_labels,
                                           average='weighted')

        _save_metrics(output_path=output_path,
                      classification_report=classification_report,
                      sequence_accuracy=sequence_accuracy,
                      precision=precision,
                      recall=recall)

        return classification_report, sequence_accuracy, precision, recall
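The contains_all_zeros check above skips padded positions: padding rows in the one-hot label matrix are all zeros, while real tokens have exactly one non-zero entry. A toy illustration (values are made up):

import numpy

padding_row = numpy.zeros(5)       # padded position -> skipped
label_row = numpy.eye(5)[2]        # one-hot real label -> kept and decoded
print(not numpy.any(padding_row))  # True
print(not numpy.any(label_row))    # False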
Code Example #4
def _print_metrics(y_pred, y_true):
    labels = get_labels(y_true)
    print("Sequence accuracy: {:0.1%}".format(
        metrics.sequence_accuracy_score(y_true, y_pred))
    )
    print("Per-tag F1: {:0.3f}".format(
        metrics.flat_f1_score(y_true, y_pred,
                              average='macro',
                              labels=labels)
    ))
    print("Per-tag Classification report: \n{}".format(
        metrics.flat_classification_report(y_true, y_pred,
                                           labels=labels, digits=3))
    )
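get_labels is defined elsewhere in the original project; a minimal hypothetical stand-in would simply collect the tag set from the gold sequences:

def get_labels(y_true):
    # Every tag that occurs in the gold sequences, in a stable sorted order.
    return sorted({tag for seq in y_true for tag in seq})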
Code Example #5
def train_seq(X_train, Y_train, X_dev, Y_dev):
    # crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=50, all_possible_states=True)
    crf = CRF(algorithm='lbfgs', c1=0.1, c2=10,
              max_iterations=50)  #, all_possible_states=True)
    # Fit the CRF on the training data
    crf.fit(X_train, Y_train)
    labels = list(crf.classes_)
    # Evaluate on the dev set:
    y_pred = crf.predict(X_dev)
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    print(
        metrics.flat_f1_score(Y_dev, y_pred, average='weighted',
                              labels=labels))
    print(
        metrics.flat_classification_report(Y_dev,
                                           y_pred,
                                           labels=sorted_labels,
                                           digits=3))
    print(metrics.sequence_accuracy_score(Y_dev, y_pred))
    get_confusion_matrix(Y_dev, y_pred, labels=sorted_labels)
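The (name[1:], name[0]) sort key groups BIO tags by entity type, so B- and I- variants of the same entity end up adjacent in the report; for example:

tags = ["O", "B-PER", "I-LOC", "B-LOC", "I-PER"]
print(sorted(tags, key=lambda name: (name[1:], name[0])))
# -> ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER']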
Code Example #6
def evaluate_rnn(y, preds):
    """ Evaluate the RNN performance using various metrics.

    Parameters
    ----------
    y : list of list of labels
    preds : list of list of labels

    Both of these lists need to have the same length, but the
    sequences they contain can vary in length.

    Returns
    -------
    data : dict
    """

    labels = sorted({c for ex in y for c in ex})
    new_preds = []
    for gold, pred in zip(y, preds):
        delta = len(gold) - len(pred)
        if delta > 0:
            # Make a *wrong* guess for these clipped tokens:
            pred += [
                random.choice(list(set(labels) - {label}))
                for label in gold[-delta:]
            ]
        new_preds.append(pred)
    labels = sorted({cls for ex in y for cls in ex} - {"OTHER"})
    data = {}
    data["classification_report"] = flat_classification_report(y,
                                                               new_preds,
                                                               digits=3)
    data["f1_macro"] = flat_f1_score(y, new_preds, average="macro")
    data["f1_micro"] = flat_f1_score(y, new_preds, average="micro")
    data["f1"] = flat_f1_score(y, new_preds, average=None)
    data["precision_score"] = flat_precision_score(y, new_preds, average=None)
    data["recall_score"] = flat_recall_score(y, new_preds, average=None)
    data["accuracy"] = flat_accuracy_score(y, new_preds)
    data["sequence_accuracy_score"] = sequence_accuracy_score(y, new_preds)

    return data
Code Example #7
def print_classification_report(annotations, n_splits=10, model=None):
    """ Evaluate model, print classification report """
    if model is None:
        # FIXME: we're overfitting on hyperparameters - they should be chosen
        # using inner cross-validation, not set to fixed values beforehand.
        model = get_model(use_precise_form_types=True)

    annotations = [a for a in annotations if a.fields_annotated]
    form_types = formtype_model.get_realistic_form_labels(
        annotations=annotations,
        n_splits=n_splits,
        full_type_names=False
    )

    X, y = get_Xy(
        annotations=annotations,
        form_types=form_types,
        full_type_names=True,
    )
    group_kfold = GroupKFold(n_splits=n_splits)
    groups = [get_domain(ann.url) for ann in annotations]
    y_pred = cross_val_predict(model, X, y, cv=group_kfold, groups=groups,
                               n_jobs=-1)

    all_labels = list(annotations[0].field_schema.types.keys())
    labels = sorted(set(flatten(y_pred)), key=lambda k: all_labels.index(k))
    print(flat_classification_report(y, y_pred, digits=2,
                                     labels=labels, target_names=labels))

    print("{:0.1f}% fields are classified correctly.".format(
        flat_accuracy_score(y, y_pred) * 100
    ))
    print("All fields are classified correctly in {:0.1f}% forms.".format(
        sequence_accuracy_score(y, y_pred) * 100
    ))
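get_domain comes from the surrounding project and is what keeps pages from one site inside a single GroupKFold fold; a hypothetical stand-in could be as simple as:

from urllib.parse import urlsplit

def get_domain(url):
    # Group annotations by the hostname the form was scraped from.
    return urlsplit(url).netloc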
Code Example #8
def print_classification_report(annotations, n_folds=10, model=None):
    """ Evaluate model, print classification report """
    if model is None:
        # FIXME: we're overfitting on hyperparameters - they should be chosen
        # using inner cross-validation, not set to fixed values beforehand.
        model = get_model(use_precise_form_types=True)

    annotations = [a for a in annotations if a.fields_annotated]
    form_types = formtype_model.get_realistic_form_labels(
        annotations=annotations, n_folds=n_folds, full_type_names=False
    )

    X, y = get_Xy(annotations=annotations, form_types=form_types, full_type_names=True)
    cv = get_annotation_folds(annotations, n_folds=n_folds)
    y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)

    all_labels = list(annotations[0].field_schema.types.keys())
    labels = sorted(set(flatten(y_pred)), key=lambda k: all_labels.index(k))
    print(flat_classification_report(y, y_pred, digits=2, labels=labels, target_names=labels))

    print("{:0.1f}% fields are classified correctly.".format(flat_accuracy_score(y, y_pred) * 100))
    print("All fields are classified correctly in {:0.1f}% forms.".format(sequence_accuracy_score(y, y_pred) * 100))
Code Example #9
def test_sequence_accuracy():
    assert metrics.sequence_accuracy_score(y1, y2) == 0
    assert metrics.sequence_accuracy_score([], []) == 0
    assert metrics.sequence_accuracy_score([[1,2], [3], [4]], [[1,2], [4], [4]]) == 2 / 3
    assert metrics.sequence_accuracy_score([[1,2], [3]], [[1,2], [3]]) == 1.0
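For reference, sequence_accuracy_score is the fraction of sequences predicted exactly, with an empty input scoring 0 as the test above asserts; a naive sketch of the same computation (not the library's implementation):

def naive_sequence_accuracy(y_true, y_pred):
    if len(y_true) == 0:
        return 0
    exact = sum(1 for t, p in zip(y_true, y_pred) if list(t) == list(p))
    return exact / len(y_true)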
Code Example #10
File: main.py  Project: sagarjounkani/NCRFpp
def evaluate(data, model, name, nbest=None):
    if name == "train":
        instances = data.train_Ids
    elif name == "dev":
        instances = data.dev_Ids
    elif name == 'test':
        instances = data.test_Ids
    elif name == 'raw':
        instances = data.raw_Ids
    else:
        print("Error: wrong evaluate name,", name)
        exit(1)
    right_token = 0
    whole_token = 0
    nbest_pred_results = []
    pred_scores = []
    pred_results = []
    gold_results = []
    ## set model in eval mode
    model.eval()
    batch_size = data.HP_batch_size
    start_time = time.time()
    train_num = len(instances)
    total_batch = train_num // batch_size + 1
    for batch_id in range(total_batch):
        start = batch_id * batch_size
        end = (batch_id + 1) * batch_size
        if end > train_num:
            end = train_num
        instance = instances[start:end]
        if not instance:
            continue
        batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label(
            instance, data.HP_gpu, False, data.sentence_classification)
        if nbest and not data.sentence_classification:
            scores, nbest_tag_seq = model.decode_nbest(
                batch_word, batch_features, batch_wordlen, batch_char,
                batch_charlen, batch_charrecover, mask, nbest)
            nbest_pred_result = recover_nbest_label(nbest_tag_seq, mask,
                                                    data.label_alphabet,
                                                    batch_wordrecover)
            nbest_pred_results += nbest_pred_result
            pred_scores += scores[batch_wordrecover].cpu().data.numpy().tolist()
            ## select the best sequence to evaluate
            tag_seq = nbest_tag_seq[:, :, 0]
        else:
            tag_seq = model(batch_word, batch_features, batch_wordlen,
                            batch_char, batch_charlen, batch_charrecover, mask)
        # print("tag:",tag_seq)
        pred_label, gold_label = recover_label(tag_seq, batch_label, mask,
                                               data.label_alphabet,
                                               batch_wordrecover,
                                               data.sentence_classification)
        pred_results += pred_label
        gold_results += gold_label
    decode_time = time.time() - start_time
    speed = len(instances) / decode_time
    acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme)
    print("Classification report: \n",
          flat_classification_report(gold_results, pred_results))
    print(
        f"Sequence accuracy score: {sequence_accuracy_score(gold_results, pred_results)}"
    )
    data.seq_acc = sequence_accuracy_score(gold_results, pred_results)
    if nbest and not data.sentence_classification:
        return speed, acc, p, r, f, nbest_pred_results, pred_scores
    return speed, acc, p, r, f, pred_results, pred_scores
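In the nbest branch above, the last axis of nbest_tag_seq indexes the n-best candidates, and the [:, :, 0] slice keeps only the top-ranked sequence for scoring; a toy shape-level illustration (the shapes are assumptions, not taken from NCRFpp):

import torch

nbest_tag_seq = torch.randint(0, 10, (2, 5, 3))  # (batch, seq_len, nbest) here
tag_seq = nbest_tag_seq[:, :, 0]                 # keep the best candidate only
print(tag_seq.shape)                             # torch.Size([2, 5])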
Code Example #11
def test_sequence_accuracy():
    assert metrics.sequence_accuracy_score(y1, y2) == 0
    assert metrics.sequence_accuracy_score([], []) == 0
    assert metrics.sequence_accuracy_score([[1, 2], [3], [4]],
                                           [[1, 2], [4], [4]]) == 2 / 3
    assert metrics.sequence_accuracy_score([[1, 2], [3]], [[1, 2], [3]]) == 1.0
Code Example #12
def joint_classification_report(p,
                                intent_label_list,
                                slot_label_list,
                                verbose=True):
    intent_predictions, slot_predictions = p.predictions
    intent_labels, slot_labels = p.label_ids

    slot_predictions = np.argmax(slot_predictions, axis=2)
    intent_predictions = np.argmax(intent_predictions, axis=1)

    slot_predictions_clean = [[
        p for (p, l) in zip(prediction, label) if l != -100
    ] for prediction, label in zip(slot_predictions, slot_labels)]
    slot_labels_clean = [[
        l for (p, l) in zip(prediction, label) if l != -100
    ] for prediction, label in zip(slot_predictions, slot_labels)]

    labels_slot = list(range(len(slot_label_list)))
    labels_intent = list(range(len(intent_label_list)))
    seq_acc = seq_metrics.sequence_accuracy_score(slot_labels_clean,
                                                  slot_predictions_clean)

    if verbose:
        print(
            classification_report(
                intent_labels,
                intent_predictions,
                target_names=intent_label_list,
                labels=labels_intent,
                digits=4,
            ))
        print(
            seq_metrics.flat_classification_report(
                slot_labels_clean,
                slot_predictions_clean,
                target_names=slot_label_list,
                labels=labels_slot,
                digits=4,
            ))
        print("sequence accuracy: ", seq_acc)

    # Inefficient: this could be computed in a single pass, with the
    # pretty-printed output reconstructed from the dictionary.
    slot_res_dict = seq_metrics.flat_classification_report(
        slot_labels_clean,
        slot_predictions_clean,
        target_names=slot_label_list,
        labels=labels_slot,
        output_dict=True,
        digits=5,
    )

    intent_res_dict = classification_report(
        intent_labels,
        intent_predictions,
        target_names=intent_label_list,
        labels=labels_intent,
        output_dict=True,
        digits=5,
    )

    return {
        "sequence_accuracy": seq_acc,
        "slot_results": slot_res_dict,
        "intent_results": intent_res_dict,
    }
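The list comprehensions above drop positions labelled -100, the value HuggingFace-style collators assign to padding and continuation sub-words so they are ignored in the loss; a toy illustration:

prediction = [3, 3, 7, 1]
label = [3, -100, 7, -100]
print([p for p, l in zip(prediction, label) if l != -100])  # -> [3, 7]
print([l for p, l in zip(prediction, label) if l != -100])  # -> [3, 7]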
Code Example #13
    X_test = [sent2features(s) for s in test_data]
    y_test = [sent2labels(s) for s in test_data]

    y_train = [sent2labels(s) for s in train_data]
    y_validation = [sent2labels(s) for s in validation_data]

    labels = list({label for sent_labels in y_train + y_test
                   for label in sent_labels})
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

    with open("best_crf_model.pkl", "rb") as in_file:
        crf = pickle.load(in_file)

    y_pred = crf.predict(X_test)

    print("### Classification Report ###")
    print(metrics.flat_classification_report(y_test,
                                             y_pred,
                                             labels=sorted_labels),
          end='\n\n')

    print("### Sequence Accuracy Score ###")
    print(metrics.sequence_accuracy_score(y_test, y_pred), end='\n\n')

    print("### Weighted Precision Score ###")
    print(metrics.flat_precision_score(y_test, y_pred, average='weighted'),
          end='\n\n')

    print("### Weighted Recall Score ###")
    print(metrics.flat_recall_score(y_test, y_pred, average='weighted'),
          end='\n\n')
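For context, best_crf_model.pkl would have been produced by an earlier training step; a hypothetical sketch of such a step (the hyperparameters and toy training data are placeholders, not the project's):

import pickle
import sklearn_crfsuite

X_train = [[{"bias": 1.0, "word": "john"}, {"bias": 1.0, "word": "runs"}]]
y_train = [["B-PER", "O"]]

crf = sklearn_crfsuite.CRF(algorithm="lbfgs", c1=0.1, c2=0.1,
                           max_iterations=100, all_possible_transitions=True)
crf.fit(X_train, y_train)
with open("best_crf_model.pkl", "wb") as out_file:
    pickle.dump(crf, out_file)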
Code Example #14
def testCRF(corpus_file_name, testtype):
    test_types = ['test', 'evaluate']
    if testtype not in test_types:
        raise ValueError("Invalid test type. Expected one of: %s" % test_types)

    X_set = []
    Y_set = []

    #Read the corpus
    CS_Corpus = open(corpus_file_name, 'rb')
    CS_Reader = csv.reader(CS_Corpus, delimiter=',', quotechar='"')
    CS_Reader.next()  #Skip first line

    lines = 0
    for row in CS_Reader:
        (X_set_part,
         Y_set_part) = TrainTweetToCRF(tweet=Corpus.getTweetTokensTags(row),
                                       token_prev_next=token_prev_next,
                                       options=options,
                                       y_set=True)
        if X_set_part and Y_set_part:
            X_set.extend(X_set_part)
            Y_set.extend(Y_set_part)
        lines += 1

    CS_Corpus.close()
    print "Tweets read: %d" % lines
    print "X set: %d" % len(X_set)
    print "Y set: %d" % len(Y_set)

    if testtype == "evaluate":
        train_amount = (len(X_set) * 80 / 100)  #80% tweets for the train set
        test_amount = (len(X_set) * 10 / 100)  #10% tweets for the evaluation set
        print "Amount of tweets for training set: %d" % train_amount
        print "Amount of tweets for evaluation set: %d" % test_amount
    elif testtype == "test":
        train_amount = (len(X_set) * 90 / 100)  #90% tweets for the train set
        test_amount = (len(X_set) * 10 / 100)  #10% tweets for the test set
        print "Amount of tweets for training set: %d" % train_amount
        print "Amount of tweets for testing set: %d" % test_amount

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)

    crf.fit(X_set[:train_amount], Y_set[:train_amount])  #Train CRF

    labels = list(crf.classes_)
    labels.remove('-')
    print labels

    y_pred = crf.predict(X_set[train_amount:train_amount + test_amount])

    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    print(
        metrics.flat_classification_report(Y_set[train_amount:train_amount +
                                                 test_amount],
                                           y_pred,
                                           labels=sorted_labels,
                                           digits=3))

    print "Sequence item accuracy: %.5f" % crf.score(
        X_set[train_amount:train_amount + test_amount],
        Y_set[train_amount:train_amount + test_amount])
    print "Sequence accuracy : %.5f" % metrics.sequence_accuracy_score(
        Y_set[train_amount:train_amount + test_amount], y_pred)
    print "Global tag accuracy: %.5f" % globalTagAccuracy(
        Y_set[train_amount:train_amount + test_amount], y_pred)
    F1scores(Y_set[train_amount:train_amount + test_amount], y_pred)
Code Example #15
def crossValidation(corpus_file_name, k):
    k_fold = KFold(n_splits=k, shuffle=False, random_state=None)
    print "Number of iterations in the cross validator: %d" % k

    X_set = []
    Y_set = []
    global options

    #Read the corpus
    CS_Corpus = open(corpus_file_name, 'rb')
    CS_Reader = csv.reader(CS_Corpus, delimiter=',', quotechar='"')
    CS_Reader.next()  #Skip first line

    lines = 0
    for row in CS_Reader:
        (X_set_part,
         Y_set_part) = TrainTweetToCRF(tweet=Corpus.getTweetTokensTags(row),
                                       token_prev_next=token_prev_next,
                                       options=options,
                                       y_set=True)
        if X_set_part and Y_set_part:
            X_set.extend(X_set_part)
            Y_set.extend(Y_set_part)
        lines += 1

    CS_Corpus.close()
    print "Tweets read: %d" % lines
    print "X set: %d" % len(X_set)
    print "Y set: %d" % len(Y_set)

    X = np.array(X_set)
    Y = np.array(Y_set)

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    item_scores = []
    seq_scores = []
    global_scores = []
    for train_index, test_index in k_fold.split(X):
        print "Test set: [" + str(test_index[0]) + " - " + str(
            test_index[len(test_index) - 1]) + "]"
        x_train, x_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        crf.fit(x_train, y_train)

        item_score = crf.score(x_test, y_test)
        item_scores.append(item_score)
        print "Sequence item score: %f" % item_score

        y_pred = crf.predict(x_test)
        seq_score = metrics.sequence_accuracy_score(y_test, y_pred)
        seq_scores.append(seq_score)
        print "Sequence score: %f" % seq_score

        global_score = globalTagAccuracy(y_test, y_pred)
        global_scores.append(global_score)
        print "Global tag score: %f" % global_score

        print ""

    #Mean
    print "Cross validation results"
    print "------------------------"
    print "Sequence item mean score: %.5f" % np.mean(item_scores)
    print "Sequence mean score: %.5f" % np.mean(seq_scores)
    print "Global tag mean score: %.5f" % np.mean(global_scores)
    #Standard deviation
    print "Sequence item standard deviation: %.5f" % np.std(item_scores)
    print "Sequence standard deviation: %.5f" % np.std(seq_scores)
    print "Global tag standard deviation: %.5f" % np.std(global_scores)
Code Example #16
labels = list(crf.classes_)
labels.remove('O')
labels

start_test = datetime.datetime.now()
print(start_test)

# Make prediction...

# Predict = cross_val_predict(estimator=crf, X=X_te, y=y_te, cv=5)  # redundant: overwritten below
Predict = crf.predict(X_te)

end_test = datetime.datetime.now()
print(end_test)
print('Test time : {}'.format(end_test - start_test))

# Classifier evaluation...

report = flat_classification_report(y_te, Predict, labels=labels)
print(report)

# Compute accuracy...

a = sequence_accuracy_score(y_te, Predict)
print(a)

end_time = datetime.datetime.now()
print(end_time)
print('Execution Time (Overall Time): {}'.format(end_time - start_time))