def crf_evaluate(self, verbose=False, labels=False):
    if labels:
        lab = labels
    else:
        # Copy so that removing "O" does not mutate the estimator's classes_ list.
        lab = list(self.crf.classes_)
    lab.remove("O")
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        print("Dev Results\n===========")
        dev_args = (self.dev_labels, self.dev_predicted)
        kwargs = {"average": "weighted", "labels": lab}
        if verbose:
            print("Precision:", metrics.flat_precision_score(*dev_args, **kwargs))
            print("Recall:", metrics.flat_recall_score(*dev_args, **kwargs))
        print("F1:", metrics.flat_f1_score(*dev_args, **kwargs))
        test_args = (self.test_labels, self.test_predicted)
        print("\nTest Results\n============")
        if verbose:
            print("Precision:", metrics.flat_precision_score(*test_args, **kwargs))
            print("Recall:", metrics.flat_recall_score(*test_args, **kwargs))
        print("F1:", metrics.flat_f1_score(*test_args, **kwargs))
def eval(self, sentence_result, y_data, progress=False):
    slot_result, domain_result = list(zip(*y_data))
    y_pred, y_pred_target = self.predict(sentence_result, progress=progress)
    y_test = slot_result
    y_target = np.array([[x] for x in domain_result])
    y_pred_target = np.array([[x] for x in y_pred_target])
    # print(y_target.shape)
    # print(y_pred_target.shape)
    return OrderedDict((
        ('accuracy', metrics.flat_accuracy_score(y_test, y_pred)),
        ('precision', metrics.flat_precision_score(y_test, y_pred, average='weighted')),
        ('recall', metrics.flat_recall_score(y_test, y_pred, average='weighted')),
        ('f1', metrics.flat_f1_score(y_test, y_pred, average='weighted')),
        ('softmax_accuracy', metrics.flat_accuracy_score(y_target, y_pred_target)),
        ('softmax_precision', metrics.flat_precision_score(y_target, y_pred_target, average='weighted')),
        ('softmax_recall', metrics.flat_recall_score(y_target, y_pred_target, average='weighted')),
        ('softmax_f1', metrics.flat_f1_score(y_target, y_pred_target, average='weighted')),
    ))
def evaluate_rnn(y, preds):
    """Because the RNN sequences get clipped as necessary based on the
    `max_length` parameter, they have to be realigned to get a
    classification report. This method does that, building in the
    assumption that any clipped tokens are assigned an incorrect label.

    Parameters
    ----------
    y : list of list of labels
    preds : list of list of labels

    Both of these lists need to have the same length, but the sequences
    they contain can vary in length.
    """
    labels = sorted({c for ex in y for c in ex})
    new_preds = []
    for gold, pred in zip(y, preds):
        delta = len(gold) - len(pred)
        if delta > 0:
            # Make a *wrong* guess for these clipped tokens:
            pred += [random.choice(list(set(labels) - {label}))
                     for label in gold[-delta:]]
        new_preds.append(pred)
    labels = sorted({cls for ex in y for cls in ex} - {'OTHER'})
    data = {}
    data['classification_report'] = flat_classification_report(y, new_preds)
    data['f1_macro'] = flat_f1_score(y, new_preds, average='macro')
    data['f1_micro'] = flat_f1_score(y, new_preds, average='micro')
    data['f1'] = flat_f1_score(y, new_preds, average=None)
    data['precision_score'] = flat_precision_score(y, new_preds, average=None)
    data['recall_score'] = flat_recall_score(y, new_preds, average=None)
    data['accuracy'] = flat_accuracy_score(y, new_preds)
    data['sequence_accuracy_score'] = sequence_accuracy_score(y, new_preds)
    return data
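# A minimal usage sketch (hypothetical toy data, not from the original repo) for
# evaluate_rnn above: the first prediction was clipped to two tokens, so the
# function pads it with a deliberately wrong guess before scoring. It assumes
# `random` and the flat_* / sequence_accuracy_score helpers from
# sklearn_crfsuite.metrics are imported, as the function body requires.
if __name__ == "__main__":
    gold_sequences = [["B-PER", "O", "B-LOC"], ["O", "B-ORG"]]
    clipped_predictions = [["B-PER", "O"], ["O", "B-ORG"]]  # first sequence lost its last token
    results = evaluate_rnn(gold_sequences, clipped_predictions)
    print(results["classification_report"])
    print("macro F1:", results["f1_macro"])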
def model_testing(Y_test, output_path, testing_start_date, testing_end_date, chain_len):
    X_test = loadX(testing_start_date, testing_end_date)
    X_test = dataFillNA(X_test)  # fill NA values
    tmp_columns = X_test.columns.tolist()
    tmp_columns.remove('date')
    all_data = X_test.merge(Y_test, on='date', how='inner')
    X_test = all_data[tmp_columns]
    Y_test = all_data['Y']
    test_dates = all_data['date']
    del all_data
    gc.collect()
    X_test = Xpoint2Set(X_test, chain_len)
    Y_test_pair = Ypoint2Set(Y_test, chain_len)
    with open(output_path + 'crf_model.pkl', 'rb') as tmp_fi:  # load the trained model
        crf = pickle.load(tmp_fi)
    y_pred = crf.predict(X_test)
    # evaluate on label pairs
    labels = ['-1.0', '1.0']
    print(metrics.flat_classification_report(Y_test_pair, y_pred, labels=labels, digits=3))
    # evaluate on single labels
    y_pred_single = y_pred[0].copy()
    y_pred_single.pop(-1)
    y_pred_single.extend([tmp_y[1] for tmp_y in y_pred])
    # y_pred_single.insert(0, y_pred[0][0])
    y_real_single = Y_test.astype('str').tolist()
    prsc = precision_score(y_real_single, y_pred_single, labels=labels, average='micro')
    print('%s to %s micro precision: %f' % (testing_start_date, testing_end_date, prsc))
    print('f1 score: %f, precision: %f' %
          (metrics.flat_f1_score(Y_test_pair, y_pred, labels=labels, average='weighted'),
           metrics.flat_precision_score(Y_test_pair, y_pred, labels=labels, average='micro')))
    prediction = pd.DataFrame(test_dates)
    prediction.loc[:, 'predict'] = y_pred_single
    return prediction, prsc
def report(pred, truth):
    _pred = VecContext.y2lab(pred)
    _test = VecContext.y2lab(truth)
    print(metrics.flat_classification_report(_test, _pred, labels=('I', 'E'), digits=4))
    label = 'E'
    P = metrics.flat_precision_score(_test, _pred, pos_label=label)
    R = metrics.flat_recall_score(_test, _pred, pos_label=label)
    f1 = metrics.flat_f1_score(_test, _pred, pos_label=label)
    return {'P': P, 'R': R, 'f1': f1}
def evaluate(self, output_path):
    loss = self._model.evaluate(self._data_reader.test_X, self._data_reader.test_y)
    print('Loss is: %f' % loss)
    all_predicted_labels = []
    all_true_labels = []
    for i, _test_instance in enumerate(self._data_reader.test_X):
        test_prediction = self._model.predict(
            _test_instance.reshape(1, self._data_reader.max_train_sentence_length))[0]
        predicted_labels, true_labels = [], []
        for encoded_true_label_array, encoded_test_label_array in zip(
                self._data_reader.test_y[i], test_prediction):
            contains_all_zeros = not numpy.any(encoded_true_label_array)
            if not contains_all_zeros:
                predicted_labels.append(
                    self._data_reader.decode_single_label(encoded_test_label_array))
                true_labels.append(
                    self._data_reader.decode_single_label(encoded_true_label_array))
        all_predicted_labels.append(predicted_labels)
        all_true_labels.append(true_labels)
    classification_report = metrics.flat_classification_report(
        all_true_labels, all_predicted_labels, labels=self._data_reader.labels)
    sequence_accuracy = metrics.sequence_accuracy_score(all_true_labels, all_predicted_labels)
    precision = metrics.flat_precision_score(all_true_labels, all_predicted_labels, average='weighted')
    recall = metrics.flat_recall_score(all_true_labels, all_predicted_labels, average='weighted')
    _save_metrics(output_path=output_path,
                  classification_report=classification_report,
                  sequence_accuracy=sequence_accuracy,
                  precision=precision,
                  recall=recall)
    return classification_report, sequence_accuracy, precision, recall
def report(pred, truth, csv_table, clf_name):
    label = 'E'
    pred_lab = VecContext.y2lab(pred)
    truth_lab = VecContext.y2lab(truth)
    P = metrics.flat_precision_score(truth_lab, pred_lab, pos_label=label)
    R = metrics.flat_recall_score(truth_lab, pred_lab, pos_label=label)
    f1 = metrics.flat_f1_score(truth_lab, pred_lab, pos_label=label)
    print(clf_name)
    print(metrics.flat_classification_report(truth_lab, pred_lab, labels=('I', 'E'), digits=4))
    csv_table.writerow([clf_name, P, R, f1])
def eval(self, sentence_result, slot_result):
    """Evaluate the slot-filling results."""
    y_pred = self.predict(sentence_result)
    y_test = slot_result
    return {
        'precision': metrics.flat_precision_score(y_test, y_pred, average='weighted'),
        'recall': metrics.flat_recall_score(y_test, y_pred, average='weighted'),
        'f1': metrics.flat_f1_score(y_test, y_pred, average='weighted'),
        'accuracy': metrics.flat_accuracy_score(y_test, y_pred),
    }
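# A self-contained sketch (toy slot sequences, assumed labels) of the flat_*
# metrics used in eval() above: they take lists of label sequences, flatten
# them to token level, and then defer to the usual sklearn scorers.
from sklearn_crfsuite import metrics

toy_y_test = [["B-city", "O"], ["B-date", "I-date", "O"]]
toy_y_pred = [["B-city", "O"], ["B-date", "O", "O"]]
print(metrics.flat_accuracy_score(toy_y_test, toy_y_pred))   # 4 of 5 tokens correct -> 0.8
print(metrics.flat_f1_score(toy_y_test, toy_y_pred, average="weighted"))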
def evaluate(dataset_name, data_iter, model, full_report=False):
    model.eval()
    total_corrects, avg_loss = 0, 0
    for batch in data_iter:
        text, target = batch.Phrase, batch.Sentiment
        output = model(text)
        loss = F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
        pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
        correct = pred.eq(target.view_as(pred)).sum().item()
        avg_loss += loss
        total_corrects += correct
    size = len(data_iter.dataset)
    avg_loss /= size
    accuracy = 100.0 * total_corrects / size
    print(' Evaluation on {} - loss: {:.6f} acc: {:.4f}%({}/{})'.format(
        dataset_name, avg_loss, accuracy, total_corrects, size))
    # Wrap each target and prediction from the last evaluated batch in a
    # single-element list so the sequence-oriented flat_* metrics can be applied.
    targetList = [[tar] for tar in target]
    predList = [[pre] for pre in pred.tolist()]
    if full_report:
        print(sklearn_crfsuite.metrics.flat_classification_report(targetList, predList, labels=[0, 1, 2, 3, 4]))
        print("accuracy_score", flat_accuracy_score(targetList, predList))
        print("precision_score", flat_precision_score(targetList, predList, average='weighted'))
        print("recall_score", flat_recall_score(targetList, predList, average='weighted'))
        print("f1_score", flat_f1_score(targetList, predList, average='weighted'))
    return accuracy
def crf(test_loc, train_loc):
    test_sents = convertCONLLFormJustExtractionSemEval(test_loc)
    train_sents = convertCONLLFormJustExtractionSemEval(train_loc)
    # pprint(train_sents[0])
    # pprint(test_sents[0])
    X_train = [sent2features(s) for s in train_sents]
    y_train = [sent2labels(s) for s in train_sents]
    X_test = [sent2features(s) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True)
    crf.fit(X_train, y_train)
    labels = list(crf.classes_)
    labels.remove('O')
    # print(labels)
    pickle.dump(crf,
                open("/data/xwang/models_origin/linear-chain-crf.model.pickle", "wb"),
                protocol=0, fix_imports=True)
    y_pred = crf.predict(X_test)
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    f1_score = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=sorted_labels)
    recall = metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=sorted_labels)
    precision = metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=sorted_labels)
    # print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))
    return (f1_score, recall, precision)
def evaluate_rnn(y, preds):
    """
    Evaluate the RNN performance using various metrics.

    Parameters
    ----------
    y: list of list of labels
    preds: list of list of labels

    Both of these lists need to have the same length, but the sequences
    they contain can vary in length.

    Returns
    -------
    data: dict
    """
    labels = sorted({c for ex in y for c in ex})
    new_preds = []
    for gold, pred in zip(y, preds):
        delta = len(gold) - len(pred)
        if delta > 0:
            # Make a *wrong* guess for these clipped tokens:
            pred += [
                random.choice(list(set(labels) - {label}))
                for label in gold[-delta:]
            ]
        new_preds.append(pred)
    labels = sorted({cls for ex in y for cls in ex} - {"OTHER"})
    data = {}
    data["classification_report"] = flat_classification_report(y, new_preds, digits=3)
    data["f1_macro"] = flat_f1_score(y, new_preds, average="macro")
    data["f1_micro"] = flat_f1_score(y, new_preds, average="micro")
    data["f1"] = flat_f1_score(y, new_preds, average=None)
    data["precision_score"] = flat_precision_score(y, new_preds, average=None)
    data["recall_score"] = flat_recall_score(y, new_preds, average=None)
    data["accuracy"] = flat_accuracy_score(y, new_preds)
    data["sequence_accuracy_score"] = sequence_accuracy_score(y, new_preds)
    return data
def get_crf_metrics(y_pred, y_true, labels):
    token_acc_score = round(metrics.flat_accuracy_score(y_true, y_pred), 2)
    token_recall_score = round(
        metrics.flat_recall_score(y_true, y_pred, average='weighted', labels=labels), 2)
    token_f1_score = round(
        metrics.flat_f1_score(y_true, y_pred, average='weighted', labels=labels), 2)
    token_precision_score = round(
        metrics.flat_precision_score(y_true, y_pred, average='weighted', labels=labels), 2)
    report = metrics.flat_classification_report(y_true, y_pred, labels=labels, output_dict=True)
    report_df = pd.DataFrame(report).T
    report_df = report_df.round(2)
    cm_dict = metrics.performance_measure(y_true, y_pred)
    cm = np.array([[cm_dict['TN'], cm_dict['FP']],
                   [cm_dict['FN'], cm_dict['TP']]])
    support = cm_dict['FN'] + cm_dict['TP']
    res_d = {
        'accuracy': token_acc_score,
        'recall': token_recall_score,
        'f1_score': token_f1_score,
        'precision': token_precision_score,
        'support': support,
        'cm': cm,
        'report': report_df
    }
    return res_d
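# Note: performance_measure is not part of sklearn_crfsuite.metrics; the call
# above matches the seqeval.metrics API, which returns a dict of token-level
# confusion counts. A small sketch (toy BIO data) of the same 2x2 matrix
# construction, assuming seqeval is the source of that function.
import numpy as np
from seqeval.metrics import performance_measure

toy_true = [["B-PER", "O", "B-LOC"], ["O", "O"]]
toy_pred = [["B-PER", "O", "O"], ["B-LOC", "O"]]
counts = performance_measure(toy_true, toy_pred)   # {'TP': ..., 'FP': ..., 'FN': ..., 'TN': ...}
confusion_2x2 = np.array([[counts["TN"], counts["FP"]],
                          [counts["FN"], counts["TP"]]])
print(confusion_2x2)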
print(crf.sent2features(conll.sentences[0])[0])
train_sents = conll.sentences[:40000]
test_sents = conll.sentences[40000:]
crf.X_train = [crf.sent2features(s) for s in train_sents]
crf.y_train = [crf.sent2labels(s) for s in train_sents]
crf.X_test = [crf.sent2features(s) for s in test_sents]
crf.y_test = [crf.sent2labels(s) for s in test_sents]
crf.train()
labels = list(crf.crf_model.classes_)
labels.remove('O')
print(labels)
y_pred = crf.crf_model.predict(crf.X_test)
f1_score = metrics.flat_f1_score(crf.y_test, y_pred, average='weighted', labels=labels)
precision_score = metrics.flat_precision_score(crf.y_test, y_pred, average='weighted', labels=labels)
recall_score = metrics.flat_recall_score(crf.y_test, y_pred, average='weighted', labels=labels)
stats = metrics.flat_classification_report(crf.y_test, y_pred, labels=labels)
print("Precision: " + str(precision_score))
print("Recall: " + str(recall_score))
print("F1-score: " + str(f1_score))
print(stats)
filename = '../Models/crf_baseline_model.sav'
pickle.dump(crf.crf_model, open(filename, 'wb'))
print("Done with all")
def cross_validate(self, folds=10, training_dataset=None, spacy_model_name=None, epochs=None): """ Runs a cross validation. :param folds: Number of fold to do for the cross validation. :param training_dataset: Path to the directory of BRAT files to use for the training data. :param spacy_model_name: Name of the spaCy model to start from. :param epochs: Number of epochs to us for every fold training. """ if folds <= 1: raise ValueError( "Number of folds for cross validation must be greater than 1") if training_dataset is None: raise ValueError("Need a dataset to evaluate") if spacy_model_name is None: raise ValueError("Need a spacy model to start with") train_data = training_dataset.get_training_data() x_data, y_data = zip(*train_data) skipped_files = [] evaluation_statistics = {} folds = SequenceStratifiedKFold(folds=folds) fold = 1 for train_indices, test_indices in folds(x_data, y_data): logging.info("\n----EVALUATING FOLD %d----", fold) self.model = None fold_statistics = {} x_subdataset = training_dataset.get_subdataset(train_indices) self.fit(x_subdataset, spacy_model_name, epochs) logging.info('Done training!\n') nlp = self.model labels = list(x_subdataset.get_labels()) y_subdataset = training_dataset.get_subdataset(test_indices) y_test = [] y_pred = [] for data_file in y_subdataset.get_data_files(): ann_path = data_file.get_annotation_path() annotations = Annotations(ann_path) txt_path = data_file.get_text_path() with open(txt_path, 'r') as source_text_file: text = source_text_file.read() doc = nlp(text) test_entities = annotations.get_spacy_entities() test_entities = self.entities_to_biluo(doc, test_entities) y_test.append(test_entities) pred_entities = self.predict(text) pred_entities = self.entities_to_biluo(doc, pred_entities) y_pred.append(pred_entities) logging.debug('\n------y_test------') logging.debug(y_test) logging.debug('\n------y_pred------') logging.debug(y_pred) # Write the metrics for this fold. for label in labels: fold_statistics[label] = {} recall = metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=[label]) precision = metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=[label]) f1_score = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=[label]) fold_statistics[label]['precision'] = precision fold_statistics[label]['recall'] = recall fold_statistics[label]['f1'] = f1_score # add averages fold_statistics['system'] = {} recall = metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=labels) precision = metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=labels) f1_score = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels) fold_statistics['system']['precision'] = precision fold_statistics['system']['recall'] = recall fold_statistics['system']['f1'] = f1_score table_data = [[ label, format(fold_statistics[label]['precision'], ".3f"), format(fold_statistics[label]['recall'], ".3f"), format(fold_statistics[label]['f1'], ".3f") ] for label in labels + ['system']] logging.info( tabulate(table_data, headers=['Entity', 'Precision', 'Recall', 'F1'], tablefmt='orgtbl')) evaluation_statistics[fold] = fold_statistics fold += 1 if skipped_files: logging.info('\nWARNING. 
SKIPPED THE FOLLOWING ANNOTATIONS:') logging.info(skipped_files) statistics_all_folds = {} for label in labels + ['system']: statistics_all_folds[label] = {} statistics_all_folds[label]['precision_average'] = mean([ evaluation_statistics[fold][label]['precision'] for fold in evaluation_statistics ]) statistics_all_folds[label]['precision_max'] = max([ evaluation_statistics[fold][label]['precision'] for fold in evaluation_statistics ]) statistics_all_folds[label]['precision_min'] = min([ evaluation_statistics[fold][label]['precision'] for fold in evaluation_statistics ]) statistics_all_folds[label]['recall_average'] = mean([ evaluation_statistics[fold][label]['recall'] for fold in evaluation_statistics ]) statistics_all_folds[label]['recall_max'] = max([ evaluation_statistics[fold][label]['recall'] for fold in evaluation_statistics ]) statistics_all_folds[label]['recall_min'] = min([ evaluation_statistics[fold][label]['recall'] for fold in evaluation_statistics ]) statistics_all_folds[label]['f1_average'] = mean([ evaluation_statistics[fold][label]['f1'] for fold in evaluation_statistics ]) statistics_all_folds[label]['f1_max'] = max([ evaluation_statistics[fold][label]['f1'] for fold in evaluation_statistics ]) statistics_all_folds[label]['f1_min'] = min([ evaluation_statistics[fold][label]['f1'] for fold in evaluation_statistics ]) table_data = [[ label, format(statistics_all_folds[label]['precision_average'], ".3f"), format(statistics_all_folds[label]['recall_average'], ".3f"), format(statistics_all_folds[label]['f1_average'], ".3f"), format(statistics_all_folds[label]['f1_min'], ".3f"), format(statistics_all_folds[label]['f1_max'], ".3f") ] for label in labels + ['system']] table_string = '\n' + tabulate(table_data, headers=[ 'Entity', 'Precision', 'Recall', 'F1', 'F1_Min', 'F1_Max' ], tablefmt='orgtbl') logging.info(table_string)
print("=======================") print("Load trained model ...") model = pickle.load(open("./models/" + MODEL_NAME, "rb")) print("Done!!!") predict = model.predict(X_test) print("=======================") print("Testing ....") print(len(y_test), len(predict)) avg_count = 0 print(predict[0]) for i in range(len(y_test)): acc = evaluate(predict[i], y_test[i]) # print(acc) avg_count += acc # print(score) print("Avg acc:", avg_count / float(len(y_test))) print(model.classes_) print("Accuracy\t:", metrics.flat_accuracy_score(y_test, predict)) print("Precision\t:", metrics.flat_precision_score(y_test, predict, average=None)) print("Recall\t:", len(metrics.flat_recall_score(y_test, predict, average=None))) print("F1\t:", metrics.flat_f1_score(y_test, predict, average=None)) print("Done!!!")
def cross_validate(self, training_dataset=None, num_folds=5, prediction_directory=None, groundtruth_directory=None, asynchronous=False): """ Performs k-fold stratified cross-validation using our model and pipeline. If the training dataset, groundtruth_directory and prediction_directory are passed, intermediate predictions during cross validation are written to the directory `write_predictions`. This allows one to construct a confusion matrix or to compute the prediction ambiguity with the methods present in the Dataset class to support pipeline development without a designated evaluation set. :param training_dataset: Dataset that is being cross validated (optional) :param num_folds: number of folds to split training data into for cross validation :param prediction_directory: directory to write predictions of cross validation to or `True` for default predictions sub-directory. :param groundtruth_directory: directory to write the ground truth MedaCy evaluates on :param asynchronous: Boolean for whether the preprocessing should be done asynchronously. :return: Prints out performance metrics, if prediction_directory """ if num_folds <= 1: raise ValueError( "Number of folds for cross validation must be greater than 1, but is %s" % repr(num_folds)) if prediction_directory is not None and training_dataset is None: raise ValueError( "Cannot generate predictions during cross validation if training dataset is not given." " Please pass the training dataset in the 'training_dataset' parameter." ) if groundtruth_directory is not None and training_dataset is None: raise ValueError( "Cannot generate groundtruth during cross validation if training dataset is not given." " Please pass the training dataset in the 'training_dataset' parameter." ) pipeline_report = self.pipeline.get_report() self.preprocess(training_dataset, asynchronous) if not (self.X_data and self.y_data): raise RuntimeError( "Must have features and labels extracted for cross validation") tags = sorted(training_dataset.get_labels(as_list=True)) self.pipeline.entities = tags logging.info('Tagset: %s', tags) eval_stats = {} # Dict for storing mapping of sequences to their corresponding file groundtruth_by_document = { filename: [] for filename in {x[2] for x in self.X_data} } preds_by_document = { filename: [] for filename in {x[2] for x in self.X_data} } folds = create_folds(self.y_data, num_folds) for fold_num, fold_data in enumerate(folds, 1): train_indices, test_indices = fold_data fold_statistics = {} learner_name, learner = self.pipeline.get_learner() X_train = [self.X_data[index] for index in train_indices] y_train = [self.y_data[index] for index in train_indices] X_test = [self.X_data[index] for index in test_indices] y_test = [self.y_data[index] for index in test_indices] logging.info("Training Fold %i", fold_num) train_data = [x[0] for x in X_train] test_data = [x[0] for x in X_test] learner.fit(train_data, y_train) y_pred = learner.predict(test_data) if groundtruth_directory is not None: # Flattening nested structures into 2d lists document_indices = [] span_indices = [] for sequence in X_test: document_indices += [sequence[2]] * len(sequence[0]) span_indices += list(sequence[1]) groundtruth = [ element for sentence in y_test for element in sentence ] # Map the predicted sequences to their corresponding documents i = 0 while i < len(groundtruth): if groundtruth[i] == 'O': i += 1 continue entity = groundtruth[i] document = document_indices[i] first_start, first_end = span_indices[i] # Ensure that consecutive tokens with the same 
label are merged while i < len(groundtruth) - 1 and groundtruth[ i + 1] == entity: # If inside entity, keep incrementing i += 1 last_start, last_end = span_indices[i] groundtruth_by_document[document].append( (entity, first_start, last_end)) i += 1 if prediction_directory is not None: # Flattening nested structures into 2d lists document_indices = [] span_indices = [] for sequence in X_test: document_indices += [sequence[2]] * len(sequence[0]) span_indices += list(sequence[1]) predictions = [ element for sentence in y_pred for element in sentence ] # Map the predicted sequences to their corresponding documents i = 0 while i < len(predictions): if predictions[i] == 'O': i += 1 continue entity = predictions[i] document = document_indices[i] first_start, first_end = span_indices[i] # Ensure that consecutive tokens with the same label are merged while i < len(predictions) - 1 and predictions[ i + 1] == entity: # If inside entity, keep incrementing i += 1 last_start, last_end = span_indices[i] preds_by_document[document].append( (entity, first_start, last_end)) i += 1 # Write the metrics for this fold. for label in tags: fold_statistics[label] = { "recall": metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=[label]), "precision": metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=[label]), "f1": metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=[label]) } # add averages fold_statistics['system'] = { "recall": metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=tags), "precision": metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=tags), "f1": metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=tags) } table_data = [[ label, format(fold_statistics[label]['precision'], ".3f"), format(fold_statistics[label]['recall'], ".3f"), format(fold_statistics[label]['f1'], ".3f") ] for label in tags + ['system']] logging.info( '\n' + tabulate(table_data, headers=['Entity', 'Precision', 'Recall', 'F1'], tablefmt='orgtbl')) eval_stats[fold_num] = fold_statistics statistics_all_folds = {} for label in tags + ['system']: statistics_all_folds[label] = { 'precision_average': mean(eval_stats[fold][label]['precision'] for fold in eval_stats), 'precision_max': max(eval_stats[fold][label]['precision'] for fold in eval_stats), 'precision_min': min(eval_stats[fold][label]['precision'] for fold in eval_stats), 'recall_average': mean(eval_stats[fold][label]['recall'] for fold in eval_stats), 'recall_max': max(eval_stats[fold][label]['recall'] for fold in eval_stats), 'f1_average': mean(eval_stats[fold][label]['f1'] for fold in eval_stats), 'f1_max': max(eval_stats[fold][label]['f1'] for fold in eval_stats), 'f1_min': min(eval_stats[fold][label]['f1'] for fold in eval_stats), } entity_counts = training_dataset.compute_counts() table_data = [ [ f"{label} ({entity_counts[label]})", # Entity (Count) format(statistics_all_folds[label]['precision_average'], ".3f"), format(statistics_all_folds[label]['recall_average'], ".3f"), format(statistics_all_folds[label]['f1_average'], ".3f"), format(statistics_all_folds[label]['f1_min'], ".3f"), format(statistics_all_folds[label]['f1_max'], ".3f") ] for label in tags + ['system'] ] # Combine the pipeline report and the resulting data, then log it or print it (whichever ensures that it prints) output_str = '\n' + pipeline_report + '\n\n' + tabulate( table_data, headers=[ 'Entity (Count)', 'Precision', 'Recall', 'F1', 'F1_Min', 'F1_Max' ], tablefmt='orgtbl') if logging.root.level > 
logging.INFO: print(output_str) else: logging.info(output_str) if prediction_directory: prediction_directory = os.path.join( training_dataset.data_directory, "predictions") groundtruth_directory = os.path.join( training_dataset.data_directory, "groundtruth") # Write annotations generated from cross-validation self.create_annotation_directory(directory=prediction_directory, training_dataset=training_dataset, option="predictions") # Write medaCy ground truth generated from cross-validation self.create_annotation_directory(directory=groundtruth_directory, training_dataset=training_dataset, option="groundtruth") # Add predicted/known annotations to the folders containing groundtruth and predictions respectively self.predict_annotation_evaluation( directory=groundtruth_directory, training_dataset=training_dataset, preds_by_document=preds_by_document, groundtruth_by_document=groundtruth_by_document, option="groundtruth") self.predict_annotation_evaluation( directory=prediction_directory, training_dataset=training_dataset, preds_by_document=preds_by_document, groundtruth_by_document=groundtruth_by_document, option="predictions") return Dataset(prediction_directory) else: return statistics_all_folds
    labels = [[tuple2label(t) for t in tuples] for tuples in tuple_sets]
    return features, labels


print("Training CRF model on training data...")
train_features, train_labels = file2features_labels("train.txt")
crf = CRF()
crf.fit(train_features, train_labels)

print("Making predictions for test data...")
test_features, test_labels = file2features_labels("test.txt")
test_preds = crf.predict(test_features)

print("Performing own evaluation...")
labels = crf.classes_
p = flat_precision_score(test_labels, test_preds, labels=labels, average="micro")
r = flat_recall_score(test_labels, test_preds, labels=labels, average="micro")
f1 = flat_f1_score(test_labels, test_preds, labels=labels, average="micro")
print("p(micro)={} r(micro)={} f1(micro)={}".format(p, r, f1))


def to_conllevalfile(features, labels, preds, filename):
    with open(filename, "w") as conlleval_input_file:
        for feature_set, label_set, pred_set in zip(features, labels, preds):
            for feature, label, pred in zip(feature_set, label_set, pred_set):
                conlleval_input_file.write("{} {} {} {}\n".format(
                    feature["token"], feature["pos tag"], label, pred))
            conlleval_input_file.write("\n")


to_conllevalfile(test_features, test_labels, test_preds, "conlleval_input_crf.txt")
def cross_validate(self, num_folds=5, training_dataset=None, epochs=20, prediction_directory=None, groundtruth_directory=None, asynchronous=None): """ Runs a cross validation. :param folds: Number of fold to do for the cross validation. :param training_dataset: Path to the directory of BRAT files to use for the training data. :param spacy_model_name: Name of the spaCy model to start from. :param epochs: Number of epochs to us for every fold training. """ if num_folds <= 1: raise ValueError( "Number of folds for cross validation must be greater than 1") if training_dataset is None: raise ValueError("Need a dataset to evaluate") train_data = training_dataset.get_training_data() labels = set() for document in train_data: for entity in document[1]['entities']: tag = entity[2] labels.add(tag) labels = list(labels) labels.sort() logging.info('Labels: %s', labels) x_data, y_data = zip(*train_data) skipped_files = [] eval_stats = {} folds = create_folds(y_data, num_folds) for fold_num, fold_data in enumerate(folds, 1): train_indices, test_indices = fold_data logging.info("\n----EVALUATING FOLD %d----", fold_num) self.model = None fold_statistics = {} x_subdataset = training_dataset.get_subdataset(train_indices) self.fit(x_subdataset, iterations=epochs, labels=labels) logging.info('Done training!\n') nlp = self.model y_subdataset = training_dataset.get_subdataset(test_indices) y_test = [] y_pred = [] for ann in y_subdataset.generate_annotations(): with open(ann.source_text_path, 'r') as source_text_file: text = source_text_file.read() doc = nlp(text) # test_entities = annotations.get_entities(format='spacy')[1]['entities'] test_entities = ann.get_entity_annotations( format='spacy')[1]['entities'] test_entities = self.entities_to_biluo(doc, test_entities) y_test.append(test_entities) pred_entities = self.predict(text) pred_entities = self.entities_to_biluo(doc, pred_entities) y_pred.append(pred_entities) logging.debug('\n------y_test------') logging.debug(y_test) logging.debug('\n------y_pred------') logging.debug(y_pred) # Write the metrics for this fold. for label in labels: fold_statistics[label] = { 'recall': metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=[label]), 'precision': metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=[label]), 'f1': metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=[label]) } # add averages fold_statistics['system'] = { 'recall': metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=labels), 'precision': metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=labels), 'f1': metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels) } table_data = [[ label, format(fold_statistics[label]['precision'], ".3f"), format(fold_statistics[label]['recall'], ".3f"), format(fold_statistics[label]['f1'], ".3f") ] for label in labels + ['system']] logging.info( '\n' + tabulate(table_data, headers=['Entity', 'Precision', 'Recall', 'F1'], tablefmt='orgtbl')) eval_stats[fold_num] = fold_statistics if skipped_files: logging.info('\nWARNING. 
SKIPPED THE FOLLOWING ANNOTATIONS:') logging.info(skipped_files) statistics_all_folds = {} for label in labels + ['system']: statistics_all_folds[label] = { 'precision_average': mean(eval_stats[fold][label]['precision'] for fold in eval_stats), 'precision_max': max(eval_stats[fold][label]['precision'] for fold in eval_stats), 'precision_min': min(eval_stats[fold][label]['precision'] for fold in eval_stats), 'recall_average': mean(eval_stats[fold][label]['recall'] for fold in eval_stats), 'recall_max': max(eval_stats[fold][label]['recall'] for fold in eval_stats), 'f1_average': mean(eval_stats[fold][label]['f1'] for fold in eval_stats), 'f1_max': max(eval_stats[fold][label]['f1'] for fold in eval_stats), 'f1_min': min(eval_stats[fold][label]['f1'] for fold in eval_stats), } table_data = [[ label, format(statistics_all_folds[label]['precision_average'], ".3f"), format(statistics_all_folds[label]['recall_average'], ".3f"), format(statistics_all_folds[label]['f1_average'], ".3f"), format(statistics_all_folds[label]['f1_min'], ".3f"), format(statistics_all_folds[label]['f1_max'], ".3f") ] for label in labels + ['system']] table_string = '\n' + tabulate(table_data, headers=[ 'Entity', 'Precision', 'Recall', 'F1', 'F1_Min', 'F1_Max' ], tablefmt='orgtbl') logging.info(table_string)
X_test = [sent2features(s) for s in test_data]
y_test = [sent2labels(s) for s in test_data]
y_train = [sent2labels(s) for s in train_data]
y_validation = [sent2labels(s) for s in validation_data]

labels = list(set([label for labels in y_train + y_test for label in labels]))
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

with open("best_crf_model.pkl", "rb") as in_file:
    crf = pickle.load(in_file)

y_pred = crf.predict(X_test)

print("### Classification Report ###")
print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels), end='\n\n')
print("### Sequence Accuracy Score ###")
print(metrics.sequence_accuracy_score(y_test, y_pred), end='\n\n')
print("### Weighted Precision Score ###")
print(metrics.flat_precision_score(y_test, y_pred, average='weighted'), end='\n\n')
print("### Weighted Recall Score ###")
print(metrics.flat_recall_score(y_test, y_pred, average='weighted'), end='\n\n')
def test_flat_precision():
    score = metrics.flat_precision_score(y1, y2, average='micro')
    assert score == 3 / 5
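# Hypothetical fixtures (not the project's actual y1/y2) that would satisfy the
# assertion above: with average='micro' and no label restriction, flat precision
# reduces to the fraction of correctly predicted tokens, here 3 out of 5.
y1 = [["a", "a", "b"], ["c", "c"]]  # ground-truth sequences
y2 = [["a", "b", "b"], ["c", "a"]]  # predictions; the 2nd and 5th tokens are wrong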
crf.fit(X_train, y_train)
labels = list(crf.classes_)
labels.remove('O')
print(labels)

y_pred = crf.predict(X_val)
actual_preds = crf.predict(X_test)
writeOutput(actual_preds)

print(metrics.flat_accuracy_score(y_val, y_pred))
print(metrics.flat_precision_score(y_val, y_pred, average='weighted', labels=labels))
print(metrics.flat_recall_score(y_val, y_pred, average='weighted', labels=labels))
print(metrics.flat_f1_score(y_val, y_pred, average='weighted', labels=labels))

y_flat_pred = []
y_flat_val = []
x_flat_val = []
[y_flat_pred.extend(x) for x in y_pred]
[y_flat_val.extend(x) for x in y_val]
[x_flat_val.extend(x) for x in X_val]
validate_NER(y_flat_val, y_flat_pred, x_flat_val)
y_train = labelData
for i in range(0, len(X_train)):
    X_train_list.append([X_train[i]])
    y_train_list.append([y_train[i]])

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=70,
    all_possible_transitions=True)
crf.fit(X_train_list, y_train_list)

tokens = te_list
tags = te_tags
labelData = te_labels
test_featuresets = [punct_features(tokens, tags, i, labelData) for i in range(0, len(tokens))]
X_test = test_featuresets
y_test = labelData

y_pred = crf.predict([X_test])
f1_Score = metrics.flat_f1_score([y_test], y_pred, average='weighted')
precision = metrics.flat_precision_score([y_test], y_pred, average='weighted')
accuracy = metrics.flat_accuracy_score([y_test], y_pred)
print("F1 Score :", f1_Score, "Precision :", precision, "Accuracy :", accuracy)
crf.fit(X_train, y_train)

# Predicting on the test set.
y_pred = crf.predict(X_test)

# Performance
f1_score = flat_f1_score(y_test, y_pred, average='weighted')
print("F1 score: ", f1_score)
acc = flat_accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)
rec = flat_recall_score(y_test, y_pred, average='weighted')
print("Recall: ", rec)
prec = flat_precision_score(y_test, y_pred, average='weighted')
print("Precision: ", prec)
report = flat_classification_report(y_test, y_pred)
print(report)


def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))


print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
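# The snippet above breaks off after the last print. A hedged sketch of the
# usual continuation in sklearn-crfsuite examples (not the original author's
# code): the 20 least likely transitions, plus the analogous inspection of
# crf.state_features_, which maps (attribute, label) pairs to weights.
print_transitions(Counter(crf.transition_features_).most_common()[-20:])


def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))


print("\nTop positive state features:")
print_state_features(Counter(crf.state_features_).most_common(20))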
def cross_validate(self, num_folds=10, training_dataset=None, prediction_directory=None): """ Performs k-fold stratified cross-validation using our model and pipeline. If the training dataset and prediction_directory are passed, intermediate predictions during cross validation are written to the directory `write_predictions`. This allows one to construct a confusion matrix or to compute the prediction ambiguity with the methods present in the Dataset class to support pipeline development without a designated evaluation set. :param num_folds: number of folds to split training data into for cross validation :param training_dataset: Dataset that is being cross validated (optional) :param prediction_directory: directory to write predictions of cross validation to or `True` for default predictions sub-directory. :return: Prints out performance metrics, if prediction_directory """ if num_folds <= 1: raise ValueError( "Number of folds for cross validation must be greater than 1") if prediction_directory is not None and training_dataset is None: raise ValueError( "Cannot generated predictions during cross validation if training dataset is not given." " Please pass the training dataset in the 'training_dataset' parameter." ) assert self.model is not None, "Cannot cross validate a un-fit model" assert self.X_data is not None and self.y_data is not None, \ "Must have features and labels extracted for cross validation" X_data = self.X_data Y_data = self.y_data medacy_pipeline = self.pipeline cv = SequenceStratifiedKFold(folds=num_folds) named_entities = medacy_pipeline.entities evaluation_statistics = {} fold = 1 for train_indices, test_indices in cv(X_data, Y_data): fold_statistics = {} learner_name, learner = medacy_pipeline.get_learner() X_train = [X_data[index] for index in train_indices] y_train = [Y_data[index] for index in train_indices] X_test = [X_data[index] for index in test_indices] y_test = [Y_data[index] for index in test_indices] logging.info("Training Fold %i", fold) train_data = [x[0] for x in X_train] test_data = [x[0] for x in X_test] learner.fit(train_data, y_train) y_pred = learner.predict(test_data) if prediction_directory is not None: # Dict for storing mapping of sequences to their corresponding file preds_by_document = { filename: [] for filename in list(set([x[2] for x in X_data])) } # Flattening nested structures into 2d lists document_indices = [] span_indices = [] for sequence in X_test: document_indices += [ sequence[2] for x in range(len(sequence[0])) ] span_indices += [element for element in sequence[1]] predictions = [ element for sentence in y_pred for element in sentence ] # Map the predicted sequences to their corresponding documents i = 0 while i < len(predictions): if predictions[i] == 'O': i += 1 continue entity = predictions[i] document = document_indices[i] first_start, first_end = span_indices[i] # Ensure that consecutive tokens with the same label are merged while i < len(predictions) - 1 and predictions[ i + 1] == entity: # If inside entity, keep incrementing i += 1 last_start, last_end = span_indices[i] preds_by_document[document].append( (entity, first_start, last_end)) i += 1 # Write the metrics for this fold. 
for label in named_entities: fold_statistics[label] = {} recall = metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=[label]) precision = metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=[label]) f1 = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=[label]) fold_statistics[label]['precision'] = precision fold_statistics[label]['recall'] = recall fold_statistics[label]['f1'] = f1 # add averages fold_statistics['system'] = {} recall = metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=named_entities) precision = metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=named_entities) f1 = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=named_entities) fold_statistics['system']['precision'] = precision fold_statistics['system']['recall'] = recall fold_statistics['system']['f1'] = f1 table_data = [[ label, format(fold_statistics[label]['precision'], ".3f"), format(fold_statistics[label]['recall'], ".3f"), format(fold_statistics[label]['f1'], ".3f") ] for label in named_entities + ['system']] logging.info( tabulate(table_data, headers=['Entity', 'Precision', 'Recall', 'F1'], tablefmt='orgtbl')) evaluation_statistics[fold] = fold_statistics fold += 1 statistics_all_folds = {} for label in named_entities + ['system']: statistics_all_folds[label] = {} statistics_all_folds[label]['precision_average'] = mean([ evaluation_statistics[fold][label]['precision'] for fold in evaluation_statistics ]) statistics_all_folds[label]['precision_max'] = max([ evaluation_statistics[fold][label]['precision'] for fold in evaluation_statistics ]) statistics_all_folds[label]['precision_min'] = min([ evaluation_statistics[fold][label]['precision'] for fold in evaluation_statistics ]) statistics_all_folds[label]['recall_average'] = mean([ evaluation_statistics[fold][label]['recall'] for fold in evaluation_statistics ]) statistics_all_folds[label]['recall_max'] = max([ evaluation_statistics[fold][label]['recall'] for fold in evaluation_statistics ]) statistics_all_folds[label]['recall_min'] = min([ evaluation_statistics[fold][label]['recall'] for fold in evaluation_statistics ]) statistics_all_folds[label]['f1_average'] = mean([ evaluation_statistics[fold][label]['f1'] for fold in evaluation_statistics ]) statistics_all_folds[label]['f1_max'] = max([ evaluation_statistics[fold][label]['f1'] for fold in evaluation_statistics ]) statistics_all_folds[label]['f1_min'] = min([ evaluation_statistics[fold][label]['f1'] for fold in evaluation_statistics ]) table_data = [[ label, format(statistics_all_folds[label]['precision_average'], ".3f"), format(statistics_all_folds[label]['recall_average'], ".3f"), format(statistics_all_folds[label]['f1_average'], ".3f"), format(statistics_all_folds[label]['f1_min'], ".3f"), format(statistics_all_folds[label]['f1_max'], ".3f") ] for label in named_entities + ['system']] logging.info("\n" + tabulate(table_data, headers=[ 'Entity', 'Precision', 'Recall', 'F1', 'F1_Min', 'F1_Max' ], tablefmt='orgtbl')) if prediction_directory: # Write annotations generated from cross-validation if isinstance(prediction_directory, str): prediction_directory = prediction_directory else: prediction_directory = training_dataset.data_directory + "/predictions/" if os.path.isdir(prediction_directory): logging.warning("Overwritting existing predictions") else: os.makedirs(prediction_directory) for data_file in training_dataset.get_data_files(): logging.info("Predicting file: %s", data_file.file_name) with 
open(data_file.raw_path, 'r') as raw_text: doc = medacy_pipeline.spacy_pipeline.make_doc( raw_text.read()) preds = preds_by_document[data_file.file_name] annotations = construct_annotations_from_tuples(doc, preds) annotations.to_ann(write_location=os.path.join( prediction_directory, data_file.file_name + ".ann")) return Dataset(data_directory=prediction_directory)
def cross_validate(x_folds, y_folds, params): f1_per = [] f1_org = [] f1_misc = [] f1_loc = [] f1_not = [] precision_per = [] precision_org = [] precision_misc = [] precision_loc = [] precision_not = [] recall_per = [] recall_org = [] recall_misc = [] recall_loc = [] recall_not = [] for i in range(len(x_folds)): print('\rWorking on fold {}/{} ...'.format(i + 1, len(x_folds)), end='') crf = sklearn_crfsuite.CRF(**params) test_x, test_y, train_x, train_y = folds_2_tt(x_folds, y_folds, i) train_x, train_y = balance(train_x, train_y) crf.fit(train_x, train_y) pred_y = crf.predict(test_x) f1_per.append( metrics.flat_f1_score(test_y, pred_y, average=None, labels=['per'])) f1_org.append( metrics.flat_f1_score(test_y, pred_y, average=None, labels=['org'])) f1_misc.append( metrics.flat_f1_score(test_y, pred_y, average=None, labels=['misc'])) f1_loc.append( metrics.flat_f1_score(test_y, pred_y, average=None, labels=['loc'])) f1_not.append( metrics.flat_f1_score(test_y, pred_y, average=None, labels=['notpropn'])) precision_per.append( metrics.flat_precision_score(test_y, pred_y, average=None, labels=['per'])) precision_org.append( metrics.flat_precision_score(test_y, pred_y, average=None, labels=['org'])) precision_misc.append( metrics.flat_precision_score(test_y, pred_y, average=None, labels=['misc'])) precision_loc.append( metrics.flat_precision_score(test_y, pred_y, average=None, labels=['loc'])) precision_not.append( metrics.flat_precision_score(test_y, pred_y, average=None, labels=['notpropn'])) recall_per.append( metrics.flat_recall_score(test_y, pred_y, average=None, labels=['per'])) recall_org.append( metrics.flat_recall_score(test_y, pred_y, average=None, labels=['org'])) recall_misc.append( metrics.flat_recall_score(test_y, pred_y, average=None, labels=['misc'])) recall_loc.append( metrics.flat_recall_score(test_y, pred_y, average=None, labels=['loc'])) recall_not.append( metrics.flat_recall_score(test_y, pred_y, average=None, labels=['notpropn'])) print() avg_per_f1 = sum(f1_per) / len(f1_per) avg_org_f1 = sum(f1_org) / len(f1_org) avg_loc_f1 = sum(f1_loc) / len(f1_loc) avg_misc_f1 = sum(f1_misc) / len(f1_misc) avg_not_f1 = sum(f1_not) / len(f1_not) avg_per_precision = sum(precision_per) / len(precision_per) avg_org_precision = sum(precision_org) / len(precision_org) avg_loc_precision = sum(precision_loc) / len(precision_loc) avg_misc_precision = sum(precision_misc) / len(precision_misc) avg_not_precision = sum(precision_not) / len(precision_not) avg_per_recall = sum(recall_per) / len(recall_per) avg_org_recall = sum(recall_org) / len(recall_org) avg_loc_recall = sum(recall_loc) / len(recall_loc) avg_misc_recall = sum(recall_misc) / len(recall_misc) avg_not_recall = sum(recall_not) / len(recall_not) result = { 'per': (avg_per_precision, avg_per_recall, avg_per_f1), 'org': (avg_org_precision, avg_org_recall, avg_org_f1), 'misc': (avg_misc_precision, avg_misc_recall, avg_misc_f1), 'loc': (avg_loc_precision, avg_loc_recall, avg_loc_f1), 'not': (avg_not_precision, avg_not_recall, avg_not_f1) } return result
def validate_performance(self, test_set): sentences = self.__load_corpus__(test_set) y_test = [self.model.sentence2labels(s) for s in sentences] y_prediction = [] for i, sent in enumerate(sentences): new_sent = ' '.join([word[0] for word in sent]) prediction = self.model.predict(new_sent) new_prediction = [] if len(prediction) > 1: for p in prediction: new_prediction += [p1 for p1 in p] # print(prediction) # print(new_prediction) prediction = new_prediction else: prediction = prediction[0] try: pred = [w[1] for w in prediction] except Exception: print(prediction) return # if len(pred) != len(y_test[i]): # print(sent) # print(new_sent) # print(y_test[i]) # print(len(y_test[i])) # print(pred) # print(len(pred)) y_prediction.append(pred) labels = [ 'O-DOS', 'B-DOS', 'I-UNIT', 'B-UNIT', 'O-UNIT', 'I-FREQ', 'B-FREQ', 'O-FREQ', 'I-DUR', 'B-DUR', 'O-DUR', 'I-WHO', 'B-WHO', 'O-WHO' ] for i in range(len(y_prediction)): for j in range(len(y_prediction[i])): y_prediction[i][j] = y_prediction[i][j].replace('B-', '') y_prediction[i][j] = y_prediction[i][j].replace('O-', '') y_prediction[i][j] = y_prediction[i][j].replace('I-', '') for i in range(len(y_test)): for j in range(len(y_test[i])): y_test[i][j] = y_test[i][j].replace('B-', '') y_test[i][j] = y_test[i][j].replace('O-', '') y_test[i][j] = y_test[i][j].replace('I-', '') labels = ['DOS', 'UNIT', 'FREQ', 'DUR', 'WHO'] # labels = ['DOS', 'UNIT', 'WHO', 'DUR', 'FREQ'] sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0])) precision = metrics.flat_precision_score(y_test, y_prediction, labels=sorted_labels, average='micro') recall = metrics.flat_recall_score(y_test, y_prediction, labels=sorted_labels, average='micro') f1 = metrics.flat_f1_score(y_test, y_prediction, labels=sorted_labels, average='micro') print('MICRO') print(precision, recall, f1) precision = metrics.flat_precision_score(y_test, y_prediction, labels=sorted_labels, average='macro') recall = metrics.flat_recall_score(y_test, y_prediction, labels=sorted_labels, average='macro') f1 = metrics.flat_f1_score(y_test, y_prediction, labels=sorted_labels, average='macro') print('MACRO') print(precision, recall, f1) print( metrics.flat_classification_report(y_test, y_prediction, labels=sorted_labels, digits=3))
    open(os.path.join(config.dump_address, "y_dev_pred.pkl"), "wb"))

test_loss = total_test_loss / len(corpus.test.labels)
print("->>>>>>>>>>>>>TOTAL>>>>>>>>>>>>>>>>>>>>>>> test_loss: {}, test_accuracy: {}, test_f1_score_micro: {} ROC:{}"
      .format(test_loss, (test_right_preds / test_total_preds), (test_f1_total_micro), roc_score))
print()
print(metrics.flat_classification_report(test_total_y_true, test_total_y_pred))
print("test_f1_total_binary: ", test_f1_total_binary)
print("precision binary: ",
      metrics.flat_precision_score(test_total_y_true, test_total_y_pred, average="binary"))
print("recall binary: ",
      metrics.flat_recall_score(test_total_y_true, test_total_y_pred, average="binary"))

print("[LOG] dumping results in ", config.dump_address)
pickle.dump(
    np.array(total_scores_numpy_probs),
    open(os.path.join(config.dump_address, "dev_score_pobs.pkl"), "wb"))
pickle.dump(
    np.array(total_labels_numpy_probs),
    open(os.path.join(config.dump_address, "dev_label_pobs.pkl"), "wb"))
pickle.dump(
def cross_validate(self, num_folds=10): """ Performs k-fold stratified cross-validation using our model and pipeline. :param num_folds: number of folds to split training data into for cross validation :return: Prints out performance metrics """ assert num_folds > 1, "Number of folds for cross validation must be greater than 1" assert self.model is not None, "Cannot cross validate a un-fit model" assert self.X_data is not None and self.y_data is not None, \ "Must have features and labels extracted for cross validation" X_data = self.X_data Y_data = self.y_data medacy_pipeline = self.pipeline cv = SequenceStratifiedKFold(folds=num_folds) named_entities = medacy_pipeline.entities evaluation_statistics = {} fold = 1 for train_indices, test_indices in cv(X_data, Y_data): fold_statistics = {} learner_name, learner = medacy_pipeline.get_learner() X_train = [X_data[index] for index in train_indices] y_train = [Y_data[index] for index in train_indices] X_test = [X_data[index] for index in test_indices] y_test = [Y_data[index] for index in test_indices] logging.info("Training Fold %i", fold) learner.fit(X_train, y_train) y_pred = learner.predict(X_test) for label in named_entities: fold_statistics[label] = {} recall = metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=[label]) precision = metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=[label]) f1 = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=[label]) fold_statistics[label]['precision'] = precision fold_statistics[label]['recall'] = recall fold_statistics[label]['f1'] = f1 # add averages fold_statistics['system'] = {} recall = metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=named_entities) precision = metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=named_entities) f1 = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=named_entities) fold_statistics['system']['precision'] = precision fold_statistics['system']['recall'] = recall fold_statistics['system']['f1'] = f1 evaluation_statistics[fold] = fold_statistics fold += 1 statistics_all_folds = {} for label in named_entities + ['system']: statistics_all_folds[label] = {} statistics_all_folds[label]['precision_average'] = mean([ evaluation_statistics[fold][label]['precision'] for fold in evaluation_statistics ]) statistics_all_folds[label]['precision_max'] = max([ evaluation_statistics[fold][label]['precision'] for fold in evaluation_statistics ]) statistics_all_folds[label]['precision_min'] = min([ evaluation_statistics[fold][label]['precision'] for fold in evaluation_statistics ]) statistics_all_folds[label]['recall_average'] = mean([ evaluation_statistics[fold][label]['recall'] for fold in evaluation_statistics ]) statistics_all_folds[label]['recall_max'] = max([ evaluation_statistics[fold][label]['recall'] for fold in evaluation_statistics ]) statistics_all_folds[label]['recall_min'] = min([ evaluation_statistics[fold][label]['recall'] for fold in evaluation_statistics ]) statistics_all_folds[label]['f1_average'] = mean([ evaluation_statistics[fold][label]['f1'] for fold in evaluation_statistics ]) statistics_all_folds[label]['f1_max'] = max([ evaluation_statistics[fold][label]['f1'] for fold in evaluation_statistics ]) statistics_all_folds[label]['f1_min'] = min([ evaluation_statistics[fold][label]['f1'] for fold in evaluation_statistics ]) table_data = [[ label, format(statistics_all_folds[label]['precision_average'], ".3f"), 
format(statistics_all_folds[label]['recall_average'], ".3f"), format(statistics_all_folds[label]['f1_average'], ".3f"), format(statistics_all_folds[label]['f1_min'], ".3f"), format(statistics_all_folds[label]['f1_max'], ".3f") ] for label in named_entities + ['system']] logging.info( tabulate(table_data, headers=[ 'Entity', 'Precision', 'Recall', 'F1', 'F1_Min', 'F1_Max' ], tablefmt='orgtbl'))
def gen_model(self, x_train, y_train, x_test, y_test):
    # Strip BIO prefixes so that only the entity types remain.
    for i in range(len(y_train)):
        for j in range(len(y_train[i])):
            y_train[i][j] = y_train[i][j].replace('B-', '')
            y_train[i][j] = y_train[i][j].replace('O-', '')
            y_train[i][j] = y_train[i][j].replace('I-', '')
    for i in range(len(y_test)):
        for j in range(len(y_test[i])):
            y_test[i][j] = y_test[i][j].replace('B-', '')
            y_test[i][j] = y_test[i][j].replace('O-', '')
            y_test[i][j] = y_test[i][j].replace('I-', '')
    labels = ['DOS', 'UNIT', 'FREQ', 'DUR', 'WHO']
    # labels = ['O-DOS', 'B-DOS', 'I-UNIT', 'B-UNIT', 'O-UNIT', 'I-FREQ', 'B-FREQ', 'O-FREQ', 'I-DUR', 'B-DUR', 'O-DUR', 'I-WHO', 'B-WHO', 'O-WHO']
    # labels = ['m', 'r', 'f', 'do', 'du', 'mo']
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               max_iterations=100,
                               all_possible_transitions=True)
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }
    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)
    # search
    rand_search = RandomizedSearchCV(crf, params_space,
                                     cv=3,
                                     verbose=1,
                                     n_jobs=-1,
                                     n_iter=50,
                                     scoring=f1_scorer)
    rand_search.fit(x_train, y_train)
    crf = rand_search.best_estimator_
    y_prediction = crf.predict(x_test)
    # group B and I results
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    joblib.dump(crf, 'model.pkl')
    precision = metrics.flat_precision_score(y_test, y_prediction, labels=sorted_labels, average='micro')
    recall = metrics.flat_recall_score(y_test, y_prediction, labels=sorted_labels, average='micro')
    f1 = metrics.flat_f1_score(y_test, y_prediction, labels=sorted_labels, average='micro')
    print('MICRO')
    print(precision, recall, f1)
    precision = metrics.flat_precision_score(y_test, y_prediction, labels=sorted_labels, average='macro')
    recall = metrics.flat_recall_score(y_test, y_prediction, labels=sorted_labels, average='macro')
    f1 = metrics.flat_f1_score(y_test, y_prediction, labels=sorted_labels, average='macro')
    print('MACRO')
    print(precision, recall, f1)
    return metrics.flat_classification_report(y_test, y_prediction, labels=sorted_labels, digits=3)
                           max_iterations=100,
                           all_possible_transitions=True)
crf.fit(X_train, y_train)
labels = list(crf.classes_)

y_pred = crf.predict(X_test)
overall_f1 = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)
overall_prec = metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=labels)
overall_recall = metrics.flat_recall_score(y_test, y_pred, average='weighted', labels=labels)
print("Overall F1:", overall_f1)
print("Overall Precision:", overall_prec)
print("Overall Recall:", overall_recall)

# Inspect per-class results in more detail:
print(metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3))