def test_entities_at_the_end():
    words = "Shyam lives in New York".split()
    gold = "B-PER O O B-LOC I-LOC".split()
    pred = "B-PER O O B-LOC O".split()

    print("Input gold. This should be perfect.")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, gold, gold)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 1.0

    print("This should be 50% F1")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, gold, pred)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 0.5
    assert by_type["PER"].fscore == 1.0
    assert by_type["LOC"].fscore == 0.0

    print("This should be 50% F1")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, pred, gold)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 0.5
    assert by_type["PER"].fscore == 1.0
    assert by_type["LOC"].fscore == 0.0
def eval(self, dataset_name, log_output=None):
    dataset = self.datasets.get(dataset_name, None)
    if dataset is None:
        return
    results = []
    logger.info('Evaluating {} ({})'.format(self.name, dataset_name))
    set_loss = 0
    for tokens, labels, chars, seq_lens, char_lens in dataset.get_dataset(
            volatile=True, gpu=self.gpu):
        preds, loss = self.model.predict(tokens, labels, seq_lens,
                                         chars, char_lens)
        set_loss += float(loss.data[0])
        for pred, gold, seq_len, ts in zip(preds, labels, seq_lens, tokens):
            l = int(seq_len.data[0])
            pred = pred.data.tolist()[:l]
            gold = gold.data.tolist()[:l]
            ts = ts.data.tolist()[:l]
            for p, g, t in zip(pred, gold, ts):
                t = self.idx_token.get(t, 'UNK')
                results.append('{} {} {}'.format(
                    t, self.idx_label[g], self.idx_label[p]))
            results.append('')
    counts = evaluate(results)
    overall, by_type = metrics(counts)
    report(counts)
    logger.info('Loss: {:.5f}'.format(set_loss))
    return SCORES(fscore=overall.fscore,
                  precision=overall.prec,
                  recall=overall.rec,
                  loss=set_loss)
def evaluate(results, idx_token, idx_label, writer=None):
    """Evaluate prediction results.

    :param results: A list in which each item is a tuple
        (predictions, gold labels, sequence lengths, tokens) of a batch.
    :param idx_token: Index-to-token dictionary.
    :param idx_label: Index-to-label dictionary.
    :param writer: An object (e.g. a file object) with a write() function,
        for extra output.
    :return: F-score, precision, and recall.
    """
    # b: batch, s: sequence
    outputs = []
    for preds_b, golds_b, len_b, tokens_b in results:
        for preds_s, golds_s, len_s, tokens_s in zip(preds_b, golds_b,
                                                     len_b, tokens_b):
            l = int(len_s.item())
            preds_s = preds_s.data.tolist()[:l]
            golds_s = golds_s.data.tolist()[:l]
            tokens_s = tokens_s.data.tolist()[:l]
            for p, g, t in zip(preds_s, golds_s, tokens_s):
                token = idx_token.get(t, C.UNK_INDEX)
                outputs.append('{} {} {}'.format(
                    token, idx_label.get(g, 0), idx_label.get(p, 0)))
            outputs.append('')
    counts = conlleval.evaluate(outputs)
    overall, by_type = conlleval.metrics(counts)
    conlleval.report(counts)
    if writer:
        conlleval.report(counts, out=writer)
        writer.flush()
    return overall.fscore, overall.prec, overall.rec
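# A minimal, hedged usage sketch of the "token gold pred" line format that
# conlleval.evaluate() consumes (one token per line, a blank line between
# sentences). It assumes the conlleval.py port used above is importable;
# the tokens and tags are illustrative only.
def _conlleval_format_example():
    lines = [
        'Shyam B-PER B-PER',
        'lives O O',
        'in O O',
        'New B-LOC B-LOC',
        'York I-LOC O',
        '',  # blank line terminates the sentence
    ]
    counts = conlleval.evaluate(lines)
    overall, by_type = conlleval.metrics(counts)
    # PER matches exactly; LOC does not (gold "New York" vs. predicted "New"),
    # so tp=1, fp=1, fn=1 and F1 = 0.5.
    assert abs(overall.fscore - 0.5) < 1e-9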
def get_results(self, name):
    p = (float(self.main_correct_count) / float(self.main_predicted_count)
         if self.main_predicted_count > 0 else 0.0)
    r = (float(self.main_correct_count) / float(self.main_total_count)
         if self.main_total_count > 0 else 0.0)
    f = (2.0 * p * r / (p + r)) if (p + r > 0.0) else 0.0
    f05 = ((1.0 + 0.5 * 0.5) * p * r / ((0.5 * 0.5 * p) + r)) if (p + r > 0.0) else 0.0

    results = collections.OrderedDict()
    results[name + "_cost_avg"] = self.cost_sum / float(self.token_count)
    results[name + "_cost_sum"] = self.cost_sum
    results[name + "_main_predicted_count"] = self.main_predicted_count
    results[name + "_main_total_count"] = self.main_total_count
    results[name + "_main_correct_count"] = self.main_correct_count
    results[name + "_p"] = p
    results[name + "_r"] = r
    results[name + "_f"] = f
    results[name + "_f05"] = f05
    results[name + "_accuracy"] = self.correct_sum / float(self.token_count)
    results[name + "_token_count"] = self.token_count
    results[name + "_time"] = float(time.time()) - float(self.start_time)
    results[name + "_correct_sum"] = self.correct_sum

    if self.label2id is not None and self.conll_eval == True:
        conll_counts = conlleval.evaluate(self.conll_format)
        conll_metrics_overall, conll_metrics_by_type = conlleval.metrics(conll_counts)
        results[name + "_conll_accuracy"] = (float(conll_counts.correct_tags)
                                             / float(conll_counts.token_counter))
        results[name + "_conll_p"] = conll_metrics_overall.prec
        results[name + "_conll_r"] = conll_metrics_overall.rec
        results[name + "_conll_f"] = conll_metrics_overall.fscore
        # for i, m in sorted(conll_metrics_by_type.items()):
        #     results[name + "_conll_p_" + str(i)] = m.prec
        #     results[name + "_conll_r_" + str(i)] = m.rec
        #     results[name + "_conll_f_" + str(i)] = m.fscore
        #     # str(m.fscore) + " " + str(conll_counts.t_found_guessed[i])
    return results, self.conll_format
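# The f05 computed above instantiates the weighted F-measure
# F_beta = (1 + beta^2) * p * r / (beta^2 * p + r) with beta = 0.5, which
# weights precision more heavily than recall. A small sketch of the general
# form (the helper name is illustrative, not part of the original code):
def f_beta(p, r, beta=1.0):
    denom = beta * beta * p + r
    return (1.0 + beta * beta) * p * r / denom if denom > 0.0 else 0.0

# beta = 1 recovers the harmonic mean; beta = 0.5 matches f05 above.
assert abs(f_beta(0.8, 0.5, 1.0) - (2.0 * 0.8 * 0.5 / 1.3)) < 1e-12
assert abs(f_beta(0.8, 0.5, 0.5) - (1.25 * 0.8 * 0.5 / (0.25 * 0.8 + 0.5))) < 1e-12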
def conll_summary(tokens, gold, pred, config):
    """Return string summarizing performance using CoNLL criteria."""
    index_to_label = {v: k for k, v in config.label_to_index.items()}
    acc = accuracy(gold, pred)
    # Materialize as lists rather than bare map(): the length check and zip
    # below need sequences, and map objects in Python 3 have no len().
    gold = [index_to_label[i] for i in as_dense(gold)]
    pred = [index_to_label[i] for i in as_dense(pred)]
    # Format as space-separated (token, gold, pred) strings for CoNLL eval.
    if len(tokens) != len(gold) or len(gold) != len(pred):
        raise ValueError('counts do not match')
    formatted = [' '.join(t) for t in zip(tokens, gold, pred)]
    o, by_type = conlleval.metrics(conlleval.evaluate(formatted))
    nlen = max(len(name) for name in by_type.keys())
    summaries = [
        '%.2f%% acc %.2f%% f (%.1fp %.1fr %dtp %dfp %dfn)' %
        (100. * acc, 100. * o.fscore, 100. * o.prec, 100. * o.rec,
         o.tp, o.fp, o.fn)
    ]
    config.results_log[config.model_name_log][
        config.dataset_name_log] = o.fscore
    for name, r in sorted(by_type.items()):
        summaries.append('%*s %.2f%% f (%.1fp %.1fr %dtp %dfp %dfn)' %
                         (nlen, name, 100. * r.fscore, 100. * r.prec,
                          100. * r.rec, r.tp, r.fp, r.fn))
    return '\n'.join(summaries)
def evaluate(args, data, model, id2label, all_ori_tokens):
    model.eval()
    sampler = SequentialSampler(data)
    dataloader = DataLoader(data, sampler=sampler,
                            batch_size=args.train_batch_size)

    logger.info("***** Running eval *****")
    # logger.info(f" Num examples = {len(data)}")
    # logger.info(f" Batch size = {args.eval_batch_size}")

    pred_labels = []
    ori_labels = []
    for b_i, (input_ids, input_mask, segment_ids, label_ids, bbox,
              bbox_pos_id, bbox_num) in enumerate(
                  tqdm(dataloader, desc="Evaluating")):
        input_ids = input_ids.to(args.device)
        input_mask = input_mask.to(args.device)
        segment_ids = segment_ids.to(args.device)
        label_ids = label_ids.to(args.device)
        bbox = bbox.to(args.device)
        bbox_pos_id = bbox_pos_id.to(args.device)
        bbox_num = bbox_num.to(args.device)

        with torch.no_grad():
            logits = model.predict(input_ids, segment_ids, input_mask,
                                   bbox, bbox_pos_id, bbox_num)
        # logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
        # logits = logits.detach().cpu().numpy()

        for l in logits:  # logits -> List[List[int]]
            pred_labels.append([id2label[idx] for idx in l])
        for l in label_ids:  # tensor
            ori_labels.append([id2label[idx.item()] for idx in l])

    eval_list = []
    for ori_tokens, oril, prel in zip(all_ori_tokens, ori_labels, pred_labels):
        for ot, ol, pl in zip(ori_tokens, oril, prel):
            if ot in ["[CLS]", "[SEP]"]:
                continue
            if len(f"{ot} {ol} {pl}\n".split(" ")) != 3:
                continue
            eval_list.append(f"{ot} {ol} {pl}\n")
        eval_list.append("\n")

    # eval the model
    counts = conlleval.evaluate(eval_list)
    conlleval.report(counts)

    # namedtuple('Metrics', 'tp fp fn prec rec fscore')
    overall, by_type = conlleval.metrics(counts)
    return overall, by_type
def on_epoch_end(self, epoch, logs=None):
    ypred = self.model.predict(self.test_features)
    c, cmat = conll_eval_counts(ypred, self.test_ground_truth, self.labels)
    ceval.report(c, prefix=self.prefix)
    print_cm(cmat, ordered_label_keys(self.labels))
    o, b = ceval.metrics(c)
    # TensorBoard requires these logs to be float64 with an item() attribute,
    # so we create them with numpy.
    logs[self.prefix + "_conll_f1"] = np.float64(o.fscore)
    logs[self.prefix + "_conll_prec"] = np.float64(o.prec)
    logs[self.prefix + "_conll_rec"] = np.float64(o.rec)
def calculate_labeling_scores(results, report=True):
    outputs = []
    for p_b, g_b, t_b, l_b in results:
        for p_s, g_s, t_s, l_s in zip(p_b, g_b, t_b, l_b):
            p_s = p_s[:l_s]
            for p, g, t in zip(p_s, g_s, t_s):
                outputs.append('{} {} {}'.format(t, g, p))
            outputs.append('')
    counts = conlleval.evaluate(outputs)
    overall, by_type = conlleval.metrics(counts)
    if report:
        conlleval.report(counts)
    return (overall.fscore * 100.0, overall.prec * 100.0,
            overall.rec * 100.0)
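# Hedged usage sketch for calculate_labeling_scores(): results is a list of
# batches, each a (predictions, golds, tokens, lengths) tuple of per-sentence
# lists. The toy batch below is illustrative and assumes conlleval is
# importable.
toy_results = [(
    [['B-PER', 'O', 'B-LOC']],       # predicted labels for one sentence
    [['B-PER', 'O', 'B-LOC']],       # gold labels
    [['Shyam', 'visits', 'Paris']],  # tokens
    [3],                             # true (unpadded) sequence lengths
)]
f1, prec, rec = calculate_labeling_scores(toy_results, report=False)
assert (f1, prec, rec) == (100.0, 100.0, 100.0)  # perfect toy predictions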
def test_format():
    words = "Shyam lives in New York .".split()
    gold = "B-PER O O B-LOC I-LOC O".split()
    pred = "B-PER O O B-LOC O O".split()

    print("Testing inputting the wrong format. This should get an exception")
    try:
        evaluate([1, 2, 3])
    except Exception as e:
        print(e)

    pred = "B-PER O O B-LOC I-MISC O".split()
    # Gold has two entities (PER, LOC "New York"); the prediction yields three
    # (PER, LOC "New", MISC "York"), so tp=1, fp=2, fn=1:
    # precision 1/3, recall 1/2, F1 = 0.4.
    print("This should be 40% F1")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, gold, pred)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 0.4
def evaluate(results, idx_token, idx_label, writer=None):
    """Evaluate prediction results.

    :param results: A list in which each item is a tuple
        (predictions, gold labels, sequence lengths, tokens) of a batch.
    :param idx_token: Index-to-token dictionary.
    :param idx_label: Index-to-label dictionary.
    :param writer: An object (e.g. a file object) with a write() function,
        for extra output.
    :return: F-score, precision, and recall.
    """
    # b: batch, s: sequence
    outputs = []
    # preds: predictions; golds: gold labels; len: sequence lengths;
    # tokens: original word indices.
    for preds_b, golds_b, len_b, tokens_b in results:
        for preds_s, golds_s, len_s, tokens_s in zip(preds_b, golds_b,
                                                     len_b, tokens_b):
            l = int(len_s.item())
            preds_s = preds_s.data.tolist()[:l]
            golds_s = golds_s.data.tolist()[:l]
            tokens_s = tokens_s.data.tolist()[:l]
            for p, g, t in zip(preds_s, golds_s, tokens_s):
                token = idx_token.get(t, C.UNK)
                outputs.append('{} {} {}'.format(
                    token, idx_label.get(g, 0), idx_label.get(p, 0)))
            outputs.append('')
    # NOTE: outputs looked right in debugging but counts was suspected wrong
    # for english-covered-test, unlike the uncovered datasets; unresolved.
    counts = conlleval.evaluate(outputs)
    overall, by_type = conlleval.metrics(counts)
    conlleval.report(counts)
    if writer:
        conlleval.report(counts, out=writer)
        writer.flush()
    return overall.fscore, overall.prec, overall.rec
def conll_summary(sentences):
    eval_sentences = [[(t.target_str, t.prediction_str) for t in s]
                      for s in sentences]
    gold = [t.target_str for s in sentences for t in s]
    pred = [t.prediction_str for s in sentences for t in s]
    acc = accuracy(gold, pred)
    counts = conlleval.evaluate_sentences(eval_sentences)
    overall, by_type = conlleval.metrics(counts)
    # max() raises on an empty by_type; a guarded variant would be:
    # nlen = max(len(name) for name in by_type.keys()) if by_type else 0
    nlen = max(len(name) for name in by_type.keys())
    summaries = [(
        'acc: {acc:.2%} f: {m.fscore:.2%} '
        '(p:{m.prec:.1%} r:{m.rec:.1%} tp:{m.tp} fp:{m.fp} fn:{m.fn})').format(
            acc=acc, m=overall)]
    for name, r in sorted(by_type.items()):
        summaries.append(
            ('{name:{nlen}} f: {m.fscore:.2%} '
             '(p:{m.prec:.1%} r:{m.rec:.1%} tp:{m.tp} fp:{m.fp} fn:{m.fn})'
             ).format(name=name, nlen=nlen, m=r))
    return '\n'.join(summaries)
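# Hedged sketch of the per-sentence (gold, pred) pair format consumed by
# conlleval.evaluate_sentences() above. Note that evaluate_sentences is only
# present in some forks of conlleval.py; the data here is illustrative.
example_sentences = [[('B-PER', 'B-PER'), ('O', 'O'),
                      ('B-LOC', 'B-LOC'), ('I-LOC', 'O')]]
counts = conlleval.evaluate_sentences(example_sentences)
overall, by_type = conlleval.metrics(counts)  # PER matches, LOC does not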
def conll_summary(tokens, gold, pred, config):
    """Return string summarizing performance using CoNLL criteria."""
    index_to_label = {v: k for k, v in config.label_to_index.items()}
    acc = accuracy(gold, pred)
    # Materialize as lists rather than bare map(): the length check and zip
    # below need sequences, and map objects in Python 3 have no len().
    gold = [index_to_label[i] for i in as_dense(gold)]
    pred = [index_to_label[i] for i in as_dense(pred)]
    # Format as space-separated (token, gold, pred) strings for CoNLL eval.
    if len(tokens) != len(gold) or len(gold) != len(pred):
        raise ValueError('counts do not match')
    formatted = [' '.join(t) for t in zip(tokens, gold, pred)]
    o, by_type = conlleval.metrics(conlleval.evaluate(formatted))
    nlen = max(len(name) for name in by_type.keys())
    summaries = ['%.2f%% acc %.2f%% f (%.1fp %.1fr %dtp %dfp %dfn)' % (
        100. * acc, 100. * o.fscore, 100. * o.prec, 100. * o.rec,
        o.tp, o.fp, o.fn)]
    for name, r in sorted(by_type.items()):
        summaries.append('%*s %.2f%% f (%.1fp %.1fr %dtp %dfp %dfn)' % (
            nlen, name, 100. * r.fscore, 100. * r.prec, 100. * r.rec,
            r.tp, r.fp, r.fn))
    return '\n'.join(summaries)
def conlleval_overall_results(documents):
    """Return overall conlleval results for Documents."""
    counts = conlleval_evaluate(documents)
    overall, by_type = metrics(counts)
    return overall
def main(argv):
    argparser = argument_parser()
    args = argparser.parse_args(argv[1:])
    seq_len = args.max_seq_length    # abbreviation

    pretrained_model, tokenizer = load_pretrained(args)
    train_words, train_tags = read_conll(args.train_data)
    test_words, test_tags = read_conll(args.test_data)
    print(args.no_context)

    if args.no_context:
        train_data = process_no_context(train_words, train_tags, tokenizer, seq_len)
        test_data = process_no_context(test_words, test_tags, tokenizer, seq_len)
    elif args.documentwise:
        tr_docs, tr_doc_tags, tr_line_ids = split_to_documents(train_words, train_tags)
        te_docs, te_doc_tags, te_line_ids = split_to_documents(test_words, test_tags)
        train_data = process_docs(tr_docs, tr_doc_tags, tr_line_ids, tokenizer, seq_len)
        test_data = process_docs(te_docs, te_doc_tags, te_line_ids, tokenizer, seq_len)
    else:
        train_data = process_sentences(train_words, train_tags, tokenizer,
                                       seq_len, args.predict_position)
        test_data = process_sentences(test_words, test_tags, tokenizer,
                                      seq_len, args.predict_position)

    label_list = get_labels(train_data.labels)
    tag_map = {l: i for i, l in enumerate(label_list)}
    inv_tag_map = {v: k for k, v in tag_map.items()}

    train_x = encode(train_data.combined_tokens, tokenizer, seq_len)
    test_x = encode(test_data.combined_tokens, tokenizer, seq_len)
    train_y, train_weights = label_encode(train_data.combined_labels, tag_map, seq_len)
    test_y, test_weights = label_encode(test_data.combined_labels, tag_map, seq_len)

    if args.use_ner_model and (args.ner_model_dir is not None):
        ner_model, tokenizer, labels, config = load_ner_model(args.ner_model_dir)
    else:
        optimizer = create_optimizer(len(train_x[0]), args)
        model = create_ner_model(pretrained_model, len(tag_map))
        if args.num_gpus > 1:
            ner_model = multi_gpu_model(model, args.num_gpus)
        else:
            ner_model = model
        ner_model.compile(
            optimizer,
            loss='sparse_categorical_crossentropy',
            sample_weight_mode='temporal',
            metrics=['sparse_categorical_accuracy']
        )
        ner_model.fit(
            train_x,
            train_y,
            sample_weight=train_weights,
            epochs=args.num_train_epochs,
            batch_size=args.batch_size
        )
        if args.ner_model_dir is not None:
            label_list = [v for k, v in sorted(list(inv_tag_map.items()))]
            save_ner_model(ner_model, tokenizer, label_list, args)

    probs = ner_model.predict(test_x, batch_size=args.batch_size)
    preds = np.argmax(probs, axis=-1)

    results = []
    m_names = []
    if args.no_context:
        pr_ensemble, pr_test_first = get_predictions(
            preds, test_data.tokens, test_data.sentence_numbers)
        output_file = "output/{}-NC.tsv".format(args.output_file)
        m_names.append('NC')
        ensemble = []
        for i, pred in enumerate(pr_test_first):
            ensemble.append([inv_tag_map[t] for t in pred])
        lines_ensemble, sentences_ensemble = write_result(
            output_file, test_data.words, test_data.lengths,
            test_data.tokens, test_data.labels, ensemble
        )
        c = conlleval.evaluate(lines_ensemble)
        conlleval.report(c)
        m = conlleval.metrics(c)[0]
        results.append([m.prec, m.rec, m.fscore])
    else:
        # First tag, then vote.
        pr_ensemble, pr_test_first = get_predictions(
            preds, test_data.tokens, test_data.sentence_numbers)
        # Accumulate probabilities, then vote.
        prob_ensemble, prob_test_first = get_predictions2(
            probs, test_data.tokens, test_data.sentence_numbers)
        ens = [pr_ensemble, prob_ensemble, pr_test_first, prob_test_first]
        if args.documentwise:
            # D-CMV: documentwise CMV
            # D-CMVP: documentwise CMV, probabilities summed, argmax after that
            # D-F: documentwise First
            # D-FP: same as D-F, but from the summed probabilities
            method_names = ['D-CMV', 'D-CMVP', 'D-F', 'D-FP']
        else:
            method_names = ['CMV', 'CMVP', 'F', 'FP']
        for i, ensem in enumerate(ens):
            ensemble = []
            for j, pred in enumerate(ensem):
                ensemble.append([inv_tag_map[t] for t in pred])
            output_file = "output/{}-{}.tsv".format(args.output_file, method_names[i])
            lines_ensemble, sentences_ensemble = write_result(
                output_file, test_data.words, test_data.lengths,
                test_data.tokens, test_data.labels, ensemble)
            print("Model trained: ", args.ner_model_dir)
            print("Seq-len: ", args.max_seq_length)
            print("Learning rate: ", args.learning_rate)
            print("Batch Size: ", args.batch_size)
            print("Epochs: ", args.num_train_epochs)
            print("Training data: ", args.train_data)
            print("Testing data: ", args.test_data)
            print("")
            print("Results with {}".format(method_names[i]))
            c = conlleval.evaluate(lines_ensemble)
            print("")
            conlleval.report(c)
            m = conlleval.metrics(c)[0]
            results.append([m.prec, m.rec, m.fscore])
        m_names.extend(method_names)

    if args.sentence_in_context:
        starting_pos = np.arange(0, seq_len + 1, 32)
        starting_pos[0] = 1
        m_names.extend(starting_pos)
        for start_p in starting_pos:
            tt_lines, tt_tags, line_nos, line_starts = combine_sentences2(
                test_data.tokens, test_data.labels, seq_len - 1, start_p - 1)
            tt_x = encode(tt_lines, tokenizer, seq_len)
            tt_y, train_weights = label_encode(tt_tags, tag_map, seq_len)
            probs = ner_model.predict(tt_x, batch_size=args.batch_size)
            preds = np.argmax(probs, axis=-1)
            pred_tags = []
            for i, pred in enumerate(preds):
                idx = line_nos[i].index(i)
                pred_tags.append(
                    [inv_tag_map[t] for t in
                     pred[line_starts[i][idx] + 1:
                          line_starts[i][idx] + len(test_data.tokens[i]) + 1]])
            output_file = "output/{}-{}.tsv".format(args.output_file, start_p)
            lines_first, sentences_first = write_result(
                output_file, test_data.words, test_data.lengths,
                test_data.tokens, test_data.labels, pred_tags
            )
            print("")
            print("Results with prediction starting position ", start_p)
            c = conlleval.evaluate(lines_first)
            conlleval.report(c)
            m = conlleval.metrics(c)[0]
            results.append([m.prec, m.rec, m.fscore])

    result_file = "./results/results-{}.csv".format(args.output_file)
    with open(result_file, 'w+') as f:
        for i, line in enumerate(results):
            params = "{},{},{},{},{},{},{},{},{}".format(
                args.output_file, args.max_seq_length, args.bert_config_file,
                args.num_train_epochs, args.learning_rate, args.batch_size,
                args.predict_position, args.train_data, args.test_data)
            f.write(params)
            f.write(",{}".format(m_names[i]))
            for item in line:
                f.write(",{}".format(item))
            f.write('\n')
    for i in results:
        print(i)
    return 0
def evaluate_conlleval_string(self, conlleval_string):
    counts = conlleval.evaluate(conlleval_string.split('\n'),
                                {'delimiter': self.separator})
    full_report = conlleval.report(counts)
    overall, per_label = conlleval.metrics(counts)
    return overall, per_label, full_report
            histogram_freq=1)
    ]
    model.fit(xtr, ytr, batch_size=batch_size, epochs=nb_epoch, verbose=1,
              validation_data=(xte, yte), callbacks=callbacks)

    print('loading the currently best model for final evaluation...')
    model = load_model(checkPointPath)
    print('--------------------------------------------------')
    print('Fold ', currentFold, ' performance')
    counts, cmat = conll_eval_counts(model.predict(xte), yte, labels)
    overall, byType = ceval.metrics(counts)
    ceval.report(counts)
    print_cm(cmat, ordered_label_keys(labels))
    foldScores.append(overall.fscore)
    print('\n')
    print('avg f1 fold scores so far: ', np.mean(foldScores))
    currentFold += 1
    # Clear the TensorFlow session after each fold to avoid leaking resources.
    K.clear_session()

print('f1 fold scores: ', foldScores)
print('final avg f1 fold scores: ', np.mean(foldScores))
def get_results(self, name, token_labels_available=True):
    """
    Gets the statistical results both at the sentence and at the token level.

    :param name: train, dev or test (+ epoch number).
    :param token_labels_available: whether there are token annotations.
    :return: an ordered dictionary containing the collection of results.
    """
    results = OrderedDict()
    results["name"] = name
    results["cost_sum"] = self.cost_sum
    results["cost_avg"] = (self.cost_sum / float(self.count_sent)
                           if self.count_sent else 0.0)
    results["count_sent"] = self.count_sent
    results["total_correct_sent"] = self.correct_binary_sent
    results["accuracy_sent"] = (self.correct_binary_sent / float(self.count_sent)
                                if self.count_sent else 0.0)

    # Calculate the micro and macro averages for the sentence predictions.
    f_macro_sent, p_macro_sent, r_macro_sent, f05_macro_sent = 0.0, 0.0, 0.0, 0.0
    f_non_default_macro_sent, p_non_default_macro_sent, \
        r_non_default_macro_sent, f05_non_default_macro_sent = 0.0, 0.0, 0.0, 0.0
    for key in self.id2label_sent.keys():
        p, r, f, f05 = self.calculate_metrics(
            self.sentence_correct[key],
            self.sentence_predicted[key],
            self.sentence_total[key])
        label = "label=%s" % self.id2label_sent[key]
        results[label + "_predicted_sent"] = self.sentence_predicted[key]
        results[label + "_correct_sent"] = self.sentence_correct[key]
        results[label + "_total_sent"] = self.sentence_total[key]
        results[label + "_precision_sent"] = p
        results[label + "_recall_sent"] = r
        results[label + "_f-score_sent"] = f
        results[label + "_f05-score_sent"] = f05
        p_macro_sent += p
        r_macro_sent += r
        f_macro_sent += f
        f05_macro_sent += f05
        if key != 0:
            p_non_default_macro_sent += p
            r_non_default_macro_sent += r
            f_non_default_macro_sent += f
            f05_non_default_macro_sent += f05
    p_macro_sent /= len(self.id2label_sent.keys())
    r_macro_sent /= len(self.id2label_sent.keys())
    f_macro_sent /= len(self.id2label_sent.keys())
    f05_macro_sent /= len(self.id2label_sent.keys())
    p_non_default_macro_sent /= (len(self.id2label_sent.keys()) - 1)
    r_non_default_macro_sent /= (len(self.id2label_sent.keys()) - 1)
    f_non_default_macro_sent /= (len(self.id2label_sent.keys()) - 1)
    f05_non_default_macro_sent /= (len(self.id2label_sent.keys()) - 1)

    p_micro_sent, r_micro_sent, f_micro_sent, f05_micro_sent = self.calculate_metrics(
        sum(self.sentence_correct.values()),
        sum(self.sentence_predicted.values()),
        sum(self.sentence_total.values()))
    p_non_default_micro_sent, r_non_default_micro_sent, \
        f_non_default_micro_sent, f05_non_default_micro_sent = self.calculate_metrics(
            sum([value for key, value in self.sentence_correct.items() if key != 0]),
            sum([value for key, value in self.sentence_predicted.items() if key != 0]),
            sum([value for key, value in self.sentence_total.items() if key != 0]))

    results["precision_macro_sent"] = p_macro_sent
    results["recall_macro_sent"] = r_macro_sent
    results["f-score_macro_sent"] = f_macro_sent
    results["f05-score_macro_sent"] = f05_macro_sent
    results["precision_micro_sent"] = p_micro_sent
    results["recall_micro_sent"] = r_micro_sent
    results["f-score_micro_sent"] = f_micro_sent
    results["f05-score_micro_sent"] = f05_micro_sent
    results["precision_non_default_macro_sent"] = p_non_default_macro_sent
    results["recall_non_default_macro_sent"] = r_non_default_macro_sent
    results["f-score_non_default_macro_sent"] = f_non_default_macro_sent
    results["f05-score_non_default_macro_sent"] = f05_non_default_macro_sent
    results["precision_non_default_micro_sent"] = p_non_default_micro_sent
    results["recall_non_default_micro_sent"] = r_non_default_micro_sent
    results["f-score_non_default_micro_sent"] = f_non_default_micro_sent
    results["f05-score_non_default_micro_sent"] = f05_non_default_micro_sent

    if token_labels_available or "test" in name:
        results["count_tok"] = self.count_tok
        results["total_correct_tok"] = self.correct_binary_tok
        results["accuracy_tok"] = (self.correct_binary_tok / float(self.count_tok)
                                   if self.count_tok else 0.0)

        # Calculate the micro and macro averages for the token predictions.
        f_tok_macro, p_tok_macro, r_tok_macro, f05_tok_macro = 0.0, 0.0, 0.0, 0.0
        f_non_default_macro_tok, p_non_default_macro_tok, \
            r_non_default_macro_tok, f05_non_default_macro_tok = 0.0, 0.0, 0.0, 0.0
        for key in self.id2label_tok.keys():
            p, r, f, f05 = self.calculate_metrics(
                self.token_correct[key],
                self.token_predicted[key],
                self.token_total[key])
            label = "label=%s" % self.id2label_tok[key]
            results[label + "_predicted_tok"] = self.token_predicted[key]
            results[label + "_correct_tok"] = self.token_correct[key]
            results[label + "_total_tok"] = self.token_total[key]
            results[label + "_precision_tok"] = p
            results[label + "_recall_tok"] = r
            results[label + "_f-score_tok"] = f
            # Key renamed from "_tok_f05" for consistency with the
            # sentence-level "_f05-score_sent" keys above.
            results[label + "_f05-score_tok"] = f05
            p_tok_macro += p
            r_tok_macro += r
            f_tok_macro += f
            f05_tok_macro += f05
            if key != 0:
                p_non_default_macro_tok += p
                r_non_default_macro_tok += r
                f_non_default_macro_tok += f
                f05_non_default_macro_tok += f05
        p_tok_macro /= len(self.id2label_tok.keys())
        r_tok_macro /= len(self.id2label_tok.keys())
        f_tok_macro /= len(self.id2label_tok.keys())
        f05_tok_macro /= len(self.id2label_tok.keys())
        p_non_default_macro_tok /= (len(self.id2label_tok.keys()) - 1)
        r_non_default_macro_tok /= (len(self.id2label_tok.keys()) - 1)
        f_non_default_macro_tok /= (len(self.id2label_tok.keys()) - 1)
        f05_non_default_macro_tok /= (len(self.id2label_tok.keys()) - 1)

        p_tok_micro, r_tok_micro, f_tok_micro, f05_tok_micro = self.calculate_metrics(
            sum(self.token_correct.values()),
            sum(self.token_predicted.values()),
            sum(self.token_total.values()))
        p_non_default_micro_tok, r_non_default_micro_tok, \
            f_non_default_micro_tok, f05_non_default_micro_tok = self.calculate_metrics(
                sum([value for key, value in self.token_correct.items() if key != 0]),
                sum([value for key, value in self.token_predicted.items() if key != 0]),
                sum([value for key, value in self.token_total.items() if key != 0]))

        results["precision_macro_tok"] = p_tok_macro
        results["recall_macro_tok"] = r_tok_macro
        results["f-score_macro_tok"] = f_tok_macro
        results["f05-score_macro_tok"] = f05_tok_macro
        results["precision_micro_tok"] = p_tok_micro
        results["recall_micro_tok"] = r_tok_micro
        results["f-score_micro_tok"] = f_tok_micro
        results["f05-score_micro_tok"] = f05_tok_micro
        results["precision_non_default_macro_tok"] = p_non_default_macro_tok
        results["recall_non_default_macro_tok"] = r_non_default_macro_tok
        results["f-score_non_default_macro_tok"] = f_non_default_macro_tok
        results["f05-score_non_default_macro_tok"] = f05_non_default_macro_tok
        results["precision_non_default_micro_tok"] = p_non_default_micro_tok
        results["recall_non_default_micro_tok"] = r_non_default_micro_tok
        results["f-score_non_default_micro_tok"] = f_non_default_micro_tok
        results["f05-score_non_default_micro_tok"] = f05_non_default_micro_tok

        if self.id2label_tok is not None and self.conll03_eval is True:
            conll_counts = conlleval.evaluate(self.conll_format)
            conll_metrics_overall, conll_metrics_by_type = conlleval.metrics(conll_counts)
            results["conll_accuracy"] = (float(conll_counts.correct_tags)
                                         / float(conll_counts.token_counter))
            results["conll_p"] = conll_metrics_overall.prec
            results["conll_r"] = conll_metrics_overall.rec
            results["conll_f"] = conll_metrics_overall.fscore

    results["time"] = float(time.time()) - float(self.start_time)
    return results