import random

from sklearn_crfsuite.metrics import (
    flat_accuracy_score,
    flat_classification_report,
    flat_f1_score,
    flat_precision_score,
    flat_recall_score,
    sequence_accuracy_score,
)


def evaluate_rnn(y, preds):
    """Because the RNN sequences get clipped as necessary based on the
    `max_length` parameter, they have to be realigned to get a
    classification report. This method does that, building in the
    assumption that any clipped tokens are assigned an incorrect label.

    Parameters
    ----------
    y : list of list of labels
    preds : list of list of labels

    Both of these lists need to have the same length, but the
    sequences they contain can vary in length.
    """
    labels = sorted({c for ex in y for c in ex})
    new_preds = []
    for gold, pred in zip(y, preds):
        delta = len(gold) - len(pred)
        if delta > 0:
            # Make a *wrong* guess for these clipped tokens:
            pred += [random.choice(list(set(labels) - {label}))
                     for label in gold[-delta:]]
        new_preds.append(pred)
    labels = sorted({cls for ex in y for cls in ex} - {'OTHER'})
    data = {}
    data['classification_report'] = flat_classification_report(y, new_preds)
    data['f1_macro'] = flat_f1_score(y, new_preds, average='macro')
    data['f1_micro'] = flat_f1_score(y, new_preds, average='micro')
    data['f1'] = flat_f1_score(y, new_preds, average=None)
    data['precision_score'] = flat_precision_score(y, new_preds, average=None)
    data['recall_score'] = flat_recall_score(y, new_preds, average=None)
    data['accuracy'] = flat_accuracy_score(y, new_preds)
    data['sequence_accuracy_score'] = sequence_accuracy_score(y, new_preds)
    return data
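# A minimal usage sketch for `evaluate_rnn` above, run on hypothetical toy
# sequences (not data from the original project). The second prediction is
# one token shorter than its gold sequence, simulating clipping at
# `max_length`, so `evaluate_rnn` pads it with a deliberately wrong guess.
if __name__ == "__main__":
    toy_gold = [["B", "I", "O"], ["B", "O", "O"]]
    toy_preds = [["B", "I", "O"], ["B", "O"]]  # last token clipped
    results = evaluate_rnn(toy_gold, toy_preds)
    print(results["classification_report"])
    print("f1_macro:", results["f1_macro"])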
def run(self, batches: Generator) -> None:
    """Run the CRF model on the given batches and print evaluation metrics."""
    st = time.time()
    x = []
    y = []
    # For prediction, CRF does not implement batching, so we pass a list
    for batch in batches:
        b = list(batch)
        x.extend(b[0])
        y.extend(b[1])
    accuracy = self.model.score(x, y)
    y_pred = self.model.predict(x)
    f1_score = metrics.flat_f1_score(y, y_pred, average='weighted')
    accuracy_sentence = metrics.sequence_accuracy_score(y, y_pred)
    classification_report = metrics.flat_classification_report(
        y, y_pred, labels=self.model.classes_)
    print("*" * 80)
    print("MODEL EVALUATION")
    print("*" * 80)
    print("Token-wise accuracy score on Test Data:")
    print(round(accuracy, 3))
    print("F1 score on Test Data:")
    print(round(f1_score, 3))
    print("Sequence accuracy score (% of sentences scored 100% correctly):")
    print(round(accuracy_sentence, 3))
    print("Class-wise classification report:")
    print(classification_report)
    et = time.time()
    print(f"Evaluation finished in {round(et - st, 2)} seconds.")
def evaluate(self, output_path):
    loss = self._model.evaluate(self._data_reader.test_X,
                                self._data_reader.test_y)
    print('Loss is: %f' % loss)
    all_predicted_labels = []
    all_true_labels = []
    for i, _test_instance in enumerate(self._data_reader.test_X):
        test_prediction = self._model.predict(
            _test_instance.reshape(
                1, self._data_reader.max_train_sentence_length))[0]
        predicted_labels, true_labels = [], []
        for encoded_true_label_array, encoded_test_label_array in zip(
                self._data_reader.test_y[i], test_prediction):
            # All-zero one-hot rows are padding; skip them.
            contains_all_zeros = not numpy.any(encoded_true_label_array)
            if not contains_all_zeros:
                predicted_labels.append(
                    self._data_reader.decode_single_label(
                        encoded_test_label_array))
                true_labels.append(
                    self._data_reader.decode_single_label(
                        encoded_true_label_array))
        all_predicted_labels.append(predicted_labels)
        all_true_labels.append(true_labels)
    classification_report = metrics.flat_classification_report(
        all_true_labels, all_predicted_labels,
        labels=self._data_reader.labels)
    sequence_accuracy = metrics.sequence_accuracy_score(
        all_true_labels, all_predicted_labels)
    precision = metrics.flat_precision_score(
        all_true_labels, all_predicted_labels, average='weighted')
    recall = metrics.flat_recall_score(
        all_true_labels, all_predicted_labels, average='weighted')
    _save_metrics(output_path=output_path,
                  classification_report=classification_report,
                  sequence_accuracy=sequence_accuracy,
                  precision=precision,
                  recall=recall)
    return classification_report, sequence_accuracy, precision, recall
def _print_metrics(y_pred, y_true):
    labels = get_labels(y_true)
    print("Sequence accuracy: {:0.1%}".format(
        metrics.sequence_accuracy_score(y_true, y_pred)))
    print("Per-tag F1: {:0.3f}".format(
        metrics.flat_f1_score(y_true, y_pred, average='macro',
                              labels=labels)))
    print("Per-tag classification report:\n{}".format(
        metrics.flat_classification_report(y_true, y_pred, labels=labels,
                                           digits=3)))
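# `get_labels` in `_print_metrics` is a project-local helper not shown here;
# a plausible stand-in (an assumption, not the project's actual code) collects
# the distinct tags seen in the gold sequences:
def get_labels(y_true):
    """Return the sorted set of tags occurring in the gold sequences."""
    return sorted({tag for seq in y_true for tag in seq})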
def train_seq(X_train, Y_train, X_dev, Y_dev):
    # Alternative setting kept for reference:
    # crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=50,
    #           all_possible_states=True)
    crf = CRF(algorithm='lbfgs', c1=0.1, c2=10, max_iterations=50)
    # Fit on the training data only.
    crf.fit(X_train, Y_train)
    labels = list(crf.classes_)
    # Evaluate on the dev set:
    y_pred = crf.predict(X_dev)
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    print(metrics.flat_f1_score(Y_dev, y_pred, average='weighted',
                                labels=labels))
    print(metrics.flat_classification_report(Y_dev, y_pred,
                                             labels=sorted_labels, digits=3))
    print(metrics.sequence_accuracy_score(Y_dev, y_pred))
    get_confusion_matrix(Y_dev, y_pred, labels=sorted_labels)
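# `get_confusion_matrix` in `train_seq` is also project-local; a minimal
# stand-in (an assumption, not the original implementation) flattens the tag
# sequences and defers to scikit-learn:
from itertools import chain

from sklearn.metrics import confusion_matrix


def get_confusion_matrix(y_true, y_pred, labels=None):
    """Token-level confusion matrix over flattened tag sequences."""
    flat_true = list(chain.from_iterable(y_true))
    flat_pred = list(chain.from_iterable(y_pred))
    return confusion_matrix(flat_true, flat_pred, labels=labels)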
def evaluate_rnn(y, preds):
    """
    Evaluate the RNN performance using various metrics.

    Parameters
    ----------
    y : list of list of labels
    preds : list of list of labels

    Both of these lists need to have the same length, but the
    sequences they contain can vary in length.

    Returns
    -------
    data : dict
    """
    labels = sorted({c for ex in y for c in ex})
    new_preds = []
    for gold, pred in zip(y, preds):
        delta = len(gold) - len(pred)
        if delta > 0:
            # Make a *wrong* guess for these clipped tokens:
            pred += [
                random.choice(list(set(labels) - {label}))
                for label in gold[-delta:]
            ]
        new_preds.append(pred)
    labels = sorted({cls for ex in y for cls in ex} - {"OTHER"})
    data = {}
    data["classification_report"] = flat_classification_report(
        y, new_preds, digits=3)
    data["f1_macro"] = flat_f1_score(y, new_preds, average="macro")
    data["f1_micro"] = flat_f1_score(y, new_preds, average="micro")
    data["f1"] = flat_f1_score(y, new_preds, average=None)
    data["precision_score"] = flat_precision_score(y, new_preds, average=None)
    data["recall_score"] = flat_recall_score(y, new_preds, average=None)
    data["accuracy"] = flat_accuracy_score(y, new_preds)
    data["sequence_accuracy_score"] = sequence_accuracy_score(y, new_preds)
    return data
def print_classification_report(annotations, n_splits=10, model=None):
    """ Evaluate model, print classification report """
    if model is None:
        # FIXME: we're overfitting on hyperparameters - they should be chosen
        # using inner cross-validation, not set to fixed values beforehand.
        model = get_model(use_precise_form_types=True)

    annotations = [a for a in annotations if a.fields_annotated]
    form_types = formtype_model.get_realistic_form_labels(
        annotations=annotations,
        n_splits=n_splits,
        full_type_names=False
    )

    X, y = get_Xy(
        annotations=annotations,
        form_types=form_types,
        full_type_names=True,
    )
    group_kfold = GroupKFold(n_splits=n_splits)
    groups = [get_domain(ann.url) for ann in annotations]
    y_pred = cross_val_predict(model, X, y, cv=group_kfold, groups=groups,
                               n_jobs=-1)

    all_labels = list(annotations[0].field_schema.types.keys())
    labels = sorted(set(flatten(y_pred)), key=lambda k: all_labels.index(k))
    print(flat_classification_report(y, y_pred, digits=2,
                                     labels=labels, target_names=labels))

    print("{:0.1f}% fields are classified correctly.".format(
        flat_accuracy_score(y, y_pred) * 100))
    print("All fields are classified correctly in {:0.1f}% forms.".format(
        sequence_accuracy_score(y, y_pred) * 100))
def print_classification_report(annotations, n_folds=10, model=None):
    """ Evaluate model, print classification report """
    if model is None:
        # FIXME: we're overfitting on hyperparameters - they should be chosen
        # using inner cross-validation, not set to fixed values beforehand.
        model = get_model(use_precise_form_types=True)

    annotations = [a for a in annotations if a.fields_annotated]
    form_types = formtype_model.get_realistic_form_labels(
        annotations=annotations,
        n_folds=n_folds,
        full_type_names=False
    )

    X, y = get_Xy(annotations=annotations, form_types=form_types,
                  full_type_names=True)
    cv = get_annotation_folds(annotations, n_folds=n_folds)
    y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)

    all_labels = list(annotations[0].field_schema.types.keys())
    labels = sorted(set(flatten(y_pred)), key=lambda k: all_labels.index(k))
    print(flat_classification_report(y, y_pred, digits=2,
                                     labels=labels, target_names=labels))

    print("{:0.1f}% fields are classified correctly.".format(
        flat_accuracy_score(y, y_pred) * 100))
    print("All fields are classified correctly in {:0.1f}% forms.".format(
        sequence_accuracy_score(y, y_pred) * 100))
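# `flatten` used by both print_classification_report variants is a
# project-local helper; a plausible one-level stand-in (an assumption, not the
# project's actual code):
from itertools import chain


def flatten(sequences):
    """Yield the items of each sequence in order (one level of flattening)."""
    return chain.from_iterable(sequences)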
def test_sequence_accuracy():
    assert metrics.sequence_accuracy_score(y1, y2) == 0
    assert metrics.sequence_accuracy_score([], []) == 0
    assert metrics.sequence_accuracy_score(
        [[1, 2], [3], [4]], [[1, 2], [4], [4]]) == 2 / 3
    assert metrics.sequence_accuracy_score([[1, 2], [3]], [[1, 2], [3]]) == 1.0
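# For reference: sequence accuracy is the fraction of sequences predicted
# exactly right. A minimal sketch consistent with the assertions above (an
# illustration, not sklearn-crfsuite's actual source):
def sequence_accuracy_sketch(y_true, y_pred):
    if not y_true:
        return 0
    matches = sum(1 for t, p in zip(y_true, y_pred) if list(t) == list(p))
    return matches / len(y_true)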
def evaluate(data, model, name, nbest=None):
    if name == "train":
        instances = data.train_Ids
    elif name == "dev":
        instances = data.dev_Ids
    elif name == 'test':
        instances = data.test_Ids
    elif name == 'raw':
        instances = data.raw_Ids
    else:
        print("Error: unknown evaluation set name:", name)
        exit(1)
    right_token = 0
    whole_token = 0
    nbest_pred_results = []
    pred_scores = []
    pred_results = []
    gold_results = []
    # Set the model to eval mode.
    model.eval()
    batch_size = data.HP_batch_size
    start_time = time.time()
    train_num = len(instances)
    total_batch = train_num // batch_size + 1
    for batch_id in range(total_batch):
        start = batch_id * batch_size
        end = (batch_id + 1) * batch_size
        if end > train_num:
            end = train_num
        instance = instances[start:end]
        if not instance:
            continue
        batch_word, batch_features, batch_wordlen, batch_wordrecover, \
            batch_char, batch_charlen, batch_charrecover, batch_label, \
            mask = batchify_with_label(instance, data.HP_gpu, False,
                                       data.sentence_classification)
        if nbest and not data.sentence_classification:
            scores, nbest_tag_seq = model.decode_nbest(
                batch_word, batch_features, batch_wordlen, batch_char,
                batch_charlen, batch_charrecover, mask, nbest)
            nbest_pred_result = recover_nbest_label(
                nbest_tag_seq, mask, data.label_alphabet, batch_wordrecover)
            nbest_pred_results += nbest_pred_result
            pred_scores += scores[batch_wordrecover].cpu().data.numpy().tolist()
            # Select the best sequence to evaluate.
            tag_seq = nbest_tag_seq[:, :, 0]
        else:
            tag_seq = model(batch_word, batch_features, batch_wordlen,
                            batch_char, batch_charlen, batch_charrecover,
                            mask)
        pred_label, gold_label = recover_label(
            tag_seq, batch_label, mask, data.label_alphabet,
            batch_wordrecover, data.sentence_classification)
        pred_results += pred_label
        gold_results += gold_label
    decode_time = time.time() - start_time
    speed = len(instances) / decode_time
    acc, p, r, f = get_ner_fmeasure(gold_results, pred_results,
                                    data.tagScheme)
    print("Classification report:\n",
          flat_classification_report(gold_results, pred_results))
    print(f"Sequence accuracy score: "
          f"{sequence_accuracy_score(gold_results, pred_results)}")
    data.seq_acc = sequence_accuracy_score(gold_results, pred_results)
    if nbest and not data.sentence_classification:
        return speed, acc, p, r, f, nbest_pred_results, pred_scores
    return speed, acc, p, r, f, pred_results, pred_scores
def joint_classification_report(p, intent_label_list, slot_label_list,
                                verbose=True):
    intent_predictions, slot_predictions = p.predictions
    intent_labels, slot_labels = p.label_ids

    slot_predictions = np.argmax(slot_predictions, axis=2)
    intent_predictions = np.argmax(intent_predictions, axis=1)

    # Drop positions whose gold label is -100 (padding / special tokens).
    slot_predictions_clean = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(slot_predictions, slot_labels)
    ]
    slot_labels_clean = [
        [l for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(slot_predictions, slot_labels)
    ]

    labels_slot = list(range(len(slot_label_list)))
    labels_intent = list(range(len(intent_label_list)))

    seq_acc = seq_metrics.sequence_accuracy_score(slot_labels_clean,
                                                  slot_predictions_clean)
    if verbose:
        print(classification_report(
            intent_labels,
            intent_predictions,
            target_names=intent_label_list,
            labels=labels_intent,
            digits=4,
        ))
        print(seq_metrics.flat_classification_report(
            slot_labels_clean,
            slot_predictions_clean,
            target_names=slot_label_list,
            labels=labels_slot,
            digits=4,
        ))
        print("sequence accuracy: ", seq_acc)

    # Inefficient: this repeats the reports above; they could be computed once
    # and the pretty-printed output reconstructed from the dictionaries.
    slot_res_dict = seq_metrics.flat_classification_report(
        slot_labels_clean,
        slot_predictions_clean,
        target_names=slot_label_list,
        labels=labels_slot,
        output_dict=True,
        digits=5,
    )
    intent_res_dict = classification_report(
        intent_labels,
        intent_predictions,
        target_names=intent_label_list,
        labels=labels_intent,
        output_dict=True,
        digits=5,
    )
    return {
        "sequence_accuracy": seq_acc,
        "slot_results": slot_res_dict,
        "intent_results": intent_res_dict,
    }
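# A tiny worked example of the -100 masking in joint_classification_report
# (toy values, not real model output): the padded position is dropped from
# the predictions before scoring, and likewise from the gold labels.
_toy_preds = [[2, 0, 4]]    # argmaxed slot predictions
_toy_gold = [[2, -100, 5]]  # -100 marks padding / special tokens
_clean = [[p for p, l in zip(pr, la) if l != -100]
          for pr, la in zip(_toy_preds, _toy_gold)]
assert _clean == [[2, 4]]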
X_test = [sent2features(s) for s in test_data]
y_test = [sent2labels(s) for s in test_data]
y_train = [sent2labels(s) for s in train_data]
y_validation = [sent2labels(s) for s in validation_data]

labels = list({label for labels in y_train + y_test for label in labels})
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

with open("best_crf_model.pkl", "rb") as in_file:
    crf = pickle.load(in_file)

y_pred = crf.predict(X_test)

print("### Classification Report ###")
print(metrics.flat_classification_report(y_test, y_pred,
                                         labels=sorted_labels), end='\n\n')
print("### Sequence Accuracy Score ###")
print(metrics.sequence_accuracy_score(y_test, y_pred), end='\n\n')
print("### Weighted Precision Score ###")
print(metrics.flat_precision_score(y_test, y_pred, average='weighted'),
      end='\n\n')
print("### Weighted Recall Score ###")
print(metrics.flat_recall_score(y_test, y_pred, average='weighted'),
      end='\n\n')
def testCRF(corpus_file_name, testtype):
    test_types = ['test', 'evaluate']
    if testtype not in test_types:
        raise ValueError("Invalid test type. Expected one of: %s" % test_types)
    X_set = []
    Y_set = []
    # Read the corpus
    CS_Corpus = open(corpus_file_name, 'r', newline='')
    CS_Reader = csv.reader(CS_Corpus, delimiter=',', quotechar='"')
    next(CS_Reader)  # Skip the header line
    lines = 0
    for row in CS_Reader:
        (X_set_part, Y_set_part) = TrainTweetToCRF(
            tweet=Corpus.getTweetTokensTags(row),
            token_prev_next=token_prev_next,
            options=options,
            y_set=True)
        if X_set_part and Y_set_part:
            X_set.extend(X_set_part)
            Y_set.extend(Y_set_part)
            lines += 1
    CS_Corpus.close()
    print("Tweets read: %d" % lines)
    print("X set: %d" % len(X_set))
    print("Y set: %d" % len(Y_set))

    if testtype == "evaluate":
        train_amount = len(X_set) * 80 // 100  # 80% of tweets for the train set
        test_amount = len(X_set) * 10 // 100   # 10% of tweets for the evaluation set
        print("Amount of tweets for training set: %d" % train_amount)
        print("Amount of tweets for evaluation set: %d" % test_amount)
    elif testtype == "test":
        train_amount = len(X_set) * 90 // 100  # 90% of tweets for the train set
        test_amount = len(X_set) * 10 // 100   # 10% of tweets for the test set
        print("Amount of tweets for training set: %d" % train_amount)
        print("Amount of tweets for testing set: %d" % test_amount)

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_set[:train_amount], Y_set[:train_amount])  # Train the CRF

    labels = list(crf.classes_)
    labels.remove('-')
    print(labels)

    y_pred = crf.predict(X_set[train_amount:train_amount + test_amount])
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    print(metrics.flat_classification_report(
        Y_set[train_amount:train_amount + test_amount], y_pred,
        labels=sorted_labels, digits=3))
    print("Sequence item accuracy: %.5f" % crf.score(
        X_set[train_amount:train_amount + test_amount],
        Y_set[train_amount:train_amount + test_amount]))
    print("Sequence accuracy: %.5f" % metrics.sequence_accuracy_score(
        Y_set[train_amount:train_amount + test_amount], y_pred))
    print("Global tag accuracy: %.5f" % globalTagAccuracy(
        Y_set[train_amount:train_amount + test_amount], y_pred))
    F1scores(Y_set[train_amount:train_amount + test_amount], y_pred)
def crossValidation(corpus_file_name, k):
    k_fold = KFold(n_splits=k, shuffle=False, random_state=None)
    print("Number of iterations in the cross validator: %d" % k)
    X_set = []
    Y_set = []
    global options
    # Read the corpus
    CS_Corpus = open(corpus_file_name, 'r', newline='')
    CS_Reader = csv.reader(CS_Corpus, delimiter=',', quotechar='"')
    next(CS_Reader)  # Skip the header line
    lines = 0
    for row in CS_Reader:
        (X_set_part, Y_set_part) = TrainTweetToCRF(
            tweet=Corpus.getTweetTokensTags(row),
            token_prev_next=token_prev_next,
            options=options,
            y_set=True)
        if X_set_part and Y_set_part:
            X_set.extend(X_set_part)
            Y_set.extend(Y_set_part)
            lines += 1
    CS_Corpus.close()
    print("Tweets read: %d" % lines)
    print("X set: %d" % len(X_set))
    print("Y set: %d" % len(Y_set))

    X = np.array(X_set)
    Y = np.array(Y_set)
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    item_scores = []
    seq_scores = []
    global_scores = []
    for train_index, test_index in k_fold.split(X):
        print("Test set: [%d - %d]" % (test_index[0], test_index[-1]))
        x_train, x_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        crf.fit(x_train, y_train)
        item_score = crf.score(x_test, y_test)
        item_scores.append(item_score)
        print("Sequence item score: %f" % item_score)
        y_pred = crf.predict(x_test)
        seq_score = metrics.sequence_accuracy_score(y_test, y_pred)
        seq_scores.append(seq_score)
        print("Sequence score: %f" % seq_score)
        global_score = globalTagAccuracy(y_test, y_pred)
        global_scores.append(global_score)
        print("Global tag score: %f" % global_score)
        print("")

    # Mean scores
    print("Cross validation results")
    print("------------------------")
    print("Sequence item mean score: %.5f" % np.mean(item_scores))
    print("Sequence mean score: %.5f" % np.mean(seq_scores))
    print("Global tag mean score: %.5f" % np.mean(global_scores))
    # Standard deviations
    print("Sequence item standard deviation: %.5f" % np.std(item_scores))
    print("Sequence standard deviation: %.5f" % np.std(seq_scores))
    print("Global tag standard deviation: %.5f" % np.std(global_scores))
labels = list(crf.classes_)
labels.remove('O')
print(labels)

start_test = datetime.datetime.now()
print(start_test)

# Make predictions on the test set.
Predict = crf.predict(X_te)

end_test = datetime.datetime.now()
print(end_test)
print('Test time : {}'.format(end_test - start_test))

# Classifier evaluation
report = flat_classification_report(y_te, Predict, labels=labels)
print(report)

# Compute sequence accuracy
a = sequence_accuracy_score(y_te, Predict)
print(a)

end_time = datetime.datetime.now()
print(end_time)
print('Execution Time (Overall Time): {}'.format(end_time - start_time))