def test_entities_at_the_end():
    words = "Shyam lives in New York".split()
    gold = "B-PER O O B-LOC I-LOC".split()
    pred = "B-PER O O B-LOC O".split()

    print("Input gold. This should be perfect.")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, gold, gold)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 1.0

    print("This should be 50% F1")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, gold, pred)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 0.5
    assert by_type["PER"].fscore == 1.0
    assert by_type["LOC"].fscore == 0.0

    print("This should be 50% F1")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, pred, gold)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 0.5
    assert by_type["PER"].fscore == 1.0
    assert by_type["LOC"].fscore == 0.0
def test(data):
    if not os.path.exists(RESULT_PATH):
        os.makedirs(RESULT_PATH)
    with tf.Graph().as_default() as g:
        x = tf.placeholder(tf.int32, [None, None])
        y_ = tf.placeholder(tf.int64, [None, None])
        word_emb = tf.Variable(datautil._word_emb, dtype=tf.float32,
                               name='word_emb')
        x_emb = tf.nn.embedding_lookup(word_emb, x)
        y = ner_forward.forward(x_emb, is_train=False, regularizer=None)
        predict = tf.argmax(y, -1)
        saver = tf.train.Saver()

        x_batch = []
        for i in range(len(data)):
            pad_lst = [0] * (MAX_SEQ_LEN - len(data[i][0]))
            x_pad = data[i][0] + pad_lst
            x_batch.append(x_pad)

        while True:
            with tf.Session() as sess:
                # get_checkpoint_state returns a dict-like object;
                # ckpt.model_checkpoint_path holds the string value of
                # "model_checkpoint_path" (tf.train.latest_checkpoint can do
                # the same thing).
                ckpt = tf.train.get_checkpoint_state(ner_backward.MODEL_SAVE_PATH)
                if ckpt and ckpt.model_checkpoint_path:
                    saver.restore(sess, ckpt.model_checkpoint_path)
                    # Parse the checkpoint filename to extract the step number.
                    global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
                    predict_id = sess.run(predict, feed_dict={x: x_batch})

                    filename = os.path.join(RESULT_PATH, 'ner.result')
                    fw = open(filename, 'w')
                    for i in range(len(data)):
                        fw.write('{} {} {}\n'.format("<S>", "O", "O"))
                        for j in range(len(data[i][0])):
                            word = data[i][2][j]
                            predict_str = datautil.id2label(predict_id[i][j])
                            label_str = datautil.id2label(data[i][1][j])
                            fw.write('{} {} {}\n'.format(word, label_str, predict_str))
                        fw.write('{} {} {}\n\n'.format("<E>", "O", "O"))
                    fw.close()

                    print("After %s training step(s), test result is:" % (global_step))
                    conlleval.evaluate(filename)
            time.sleep(TEST_INTERVAL_SECS)
def eval(self, sess, dataset):
    IOB_output = []
    for batch in dataset:
        input_sents, input_words, input_tags, input_lengths, input_words_lens = batch
        feed_dict = {
            self.input_sents: input_sents,
            self.input_words: input_words,
            self.input_tags: input_tags,
            self.input_lengths: input_lengths,
            self.input_words_lens: input_words_lens,
            self.dropout_keep_prob: 1.0
        }
        predicted_seqs = sess.run(self.predicted_seqs, feed_dict=feed_dict)
        batch_size = predicted_seqs.shape[0]
        for i in range(batch_size):
            seq_length = input_lengths[i]
            to_tag = FLAGS.id_to_tags_map
            for j in range(seq_length):
                line = ['-'] * 2 + [to_tag[input_tags[i, j]]] + [to_tag[predicted_seqs[i, j]]]
                IOB_output.append(' '.join(line))
            IOB_output.append('\n')
    print(len(IOB_output))
    print(IOB_output[:10])
    return conlleval.evaluate(IOB_output)
def conll_summary(tokens, gold, pred, config):
    """Return string summarizing performance using CoNLL criteria."""
    index_to_label = {v: k for k, v in config.label_to_index.items()}
    acc = accuracy(gold, pred)
    # Wrap in list() so the length checks below also work under Python 3,
    # where map() returns an iterator.
    gold = list(map(lambda i: index_to_label[i], as_dense(gold)))
    pred = list(map(lambda i: index_to_label[i], as_dense(pred)))
    # Format as space-separated (token, gold, pred) strings for CoNLL eval.
    if len(tokens) != len(gold) or len(gold) != len(pred):
        raise ValueError('counts do not match')
    formatted = [' '.join(t) for t in zip(tokens, gold, pred)]
    o, by_type = conlleval.metrics(conlleval.evaluate(formatted))
    nlen = max(len(name) for name in by_type.keys())
    summaries = [
        '%.2f%% acc %.2f%% f (%.1fp %.1fr %dtp %dfp %dfn)' %
        (100. * acc, 100. * o.fscore, 100. * o.prec, 100. * o.rec,
         o.tp, o.fp, o.fn)
    ]
    config.results_log[config.model_name_log][config.dataset_name_log] = o.fscore
    for name, r in sorted(by_type.items()):
        summaries.append('%*s %.2f%% f (%.1fp %.1fr %dtp %dfp %dfn)' %
                         (nlen, name, 100. * r.fscore, 100. * r.prec,
                          100. * r.rec, r.tp, r.fp, r.fn))
    return '\n'.join(summaries)
def validate_one_batch(self, test_batch, task_name, log_writer, epoch):
    S, Q = test_batch
    Q_tag_ids = Q['tag_ids']
    S_tag_ids = S['tag_ids']
    Q_seq_len_list = Q['lens']
    Q_seq_len_list_plus2 = [x + 2 for x in Q_seq_len_list]
    Q_tag_ids_padded = pad_tag_ids(Q_tag_ids)
    S_tag_ids_padded = pad_tag_ids(S_tag_ids)
    Q['tag_ids'] = Q_tag_ids_padded
    S['tag_ids'] = S_tag_ids_padded

    logits = self([S, Q])
    loss = self.crf_loss(logits, Q_tag_ids_padded, Q_seq_len_list_plus2)
    pred_tags, pred_best_score = crf.crf_decode(
        potentials=logits,
        transition_params=self.trans_p,
        sequence_length=Q_seq_len_list_plus2)
    pred_tags_masked = seq_masking(pred_tags, Q_seq_len_list_plus2)
    p_tags_char, _ = get_id2tag_V2(pred_tags_masked, Q_seq_len_list_plus2,
                                   taskname=task_name)
    t_tags_char, _ = get_id2tag_V2(Q_tag_ids_padded, Q_seq_len_list_plus2,
                                   taskname=task_name)
    (P, R, F1), _ = evaluate(t_tags_char, p_tags_char, verbose=True)
    write_to_log(loss, P, R, F1, t_tags_char, log_writer, epoch)
    return (loss, pred_tags_masked, Q_tag_ids_padded, P, R, F1)
def evaluate(model, x_test, y_test, labels, MAX_SEQUENCE_LENGTH):
    total = []
    for lang in get_lang_list():  # x_test.keys():
        y_pred = model.predict(x_test[lang])
        pred_tags_all = []
        true_tags_all = []
        for i, seq in enumerate(y_pred):
            for j in range(MAX_SEQUENCE_LENGTH):
                indx = np.argmax(y_test[lang][i][j])
                true_label = labels[indx]
                # Skip padding and special tokens.
                if "[PAD]" in true_label or "[CLS]" in true_label:
                    continue
                true_tags_all.append(true_label)
                indx = np.argmax(seq[j])
                pred_label = labels[indx]
                pred_tags_all.append(pred_label)
        prec, rec, f1 = conlleval.evaluate(true_tags_all, pred_tags_all,
                                           verbose=False)
        print("Lang {} scores {} {} {}".format(lang, prec, rec, f1))
        total.append(f1)
    print("All f-scores {}".format(total))
    print("Overall average f-score mean {} and variance {}".format(
        np.mean(total), np.var(total)))
def eval(self, sess, dataset):
    iterator = dataset.make_one_shot_iterator()
    next_batch_op = iterator.get_next()
    IOB_output = []
    while True:
        try:
            next_batch = sess.run(next_batch_op)
            input_x, input_y, input_lengths = next_batch
        except:  # end of dataset (tf.errors.OutOfRangeError)
            break
        feed_dict = {
            self.input_sents: input_x,
            self.input_lengths: input_lengths,
            self.dropout_keep_prob: 1.0
        }
        predicted_seqs = sess.run(self.predicted_seqs, feed_dict=feed_dict)
        batch_size = predicted_seqs.shape[0]
        for i in range(batch_size):
            seq_length = input_lengths[i]
            to_tag = FLAGS.id_to_tag_map
            for j in range(seq_length):
                line = ['-'] * 2 + [to_tag[input_y[i, j]]] + [to_tag[predicted_seqs[i, j]]]
                IOB_output.append(' '.join(line))
            IOB_output.append('\n')
    print(len(IOB_output))
    print(IOB_output[:10])
    return conlleval.evaluate(IOB_output)
def chunking_eval(self, dataloader):
    self.coarse_tagger.eval()
    binary_preds, binary_golds = [], []
    pbar = tqdm(enumerate(dataloader), total=len(dataloader))
    for i, (X, lengths, y_0, y_bin, y_final, y_dm) in pbar:
        binary_golds.extend(y_0)
        X, lengths = X.cuda(), lengths.cuda()
        preds = self.coarse_tagger.chunking(X, y_0, True, lengths)
        binary_preds.extend(preds)

    binary_preds = np.concatenate(binary_preds, axis=0)
    binary_preds = list(binary_preds)
    binary_golds = np.concatenate(binary_golds, axis=0)
    binary_golds = list(binary_golds)

    _bin_pred = []
    _bin_gold = []
    temp = {"B": "B-A", "I": "I-A", "O": "O"}
    for bin_pred, bin_gold in zip(binary_preds, binary_golds):
        bin_slot_pred = y0_set[bin_pred]
        bin_slot_gold = y0_set[bin_gold]
        _bin_gold.append(temp[bin_slot_gold])
        _bin_pred.append(temp[bin_slot_pred])

    (pre, rec, f1), d = conlleval.evaluate(_bin_gold, _bin_pred, logger)
    return f1
def evaluate(results, idx_token, idx_label, writer=None):
    """Evaluate prediction results.

    :param results: A List of which each item is a tuple
        (predictions, gold labels, sequence lengths, tokens) of a batch.
    :param idx_token: Index to token dictionary.
    :param idx_label: Index to label dictionary.
    :param writer: An object (file object) with a write() function. Extra output.
    :return: F-score, precision, and recall.
    """
    # b: batch, s: sequence
    outputs = []
    for preds_b, golds_b, len_b, tokens_b in results:
        for preds_s, golds_s, len_s, tokens_s in zip(preds_b, golds_b, len_b, tokens_b):
            l = int(len_s.item())
            preds_s = preds_s.data.tolist()[:l]
            golds_s = golds_s.data.tolist()[:l]
            tokens_s = tokens_s.data.tolist()[:l]
            for p, g, t in zip(preds_s, golds_s, tokens_s):
                token = idx_token.get(t, C.UNK_INDEX)
                outputs.append('{} {} {}'.format(token,
                                                 idx_label.get(g, 0),
                                                 idx_label.get(p, 0)))
            outputs.append('')
    counts = conlleval.evaluate(outputs)
    overall, by_type = conlleval.metrics(counts)
    conlleval.report(counts)
    if writer:
        conlleval.report(counts, out=writer)
        writer.flush()
    return overall.fscore, overall.prec, overall.rec
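# Illustrative call of the evaluate() above -- a sketch only. The tensors and
# index maps are made-up stand-ins for the real model outputs and vocabularies,
# shaped as the docstring describes, and it assumes the module's own imports
# (torch, the constants module C, conlleval) are available.
import torch

_idx_token = {1: 'Shyam', 2: 'lives', 3: 'in'}
_idx_label = {0: 'O', 1: 'B-PER'}
_results = [(
    torch.tensor([[1, 0, 0]]),   # predicted label ids, [batch, seq]
    torch.tensor([[1, 0, 0]]),   # gold label ids
    torch.tensor([3]),           # sequence lengths
    torch.tensor([[1, 2, 3]]),   # token ids
)]
_fscore, _prec, _rec = evaluate(_results, _idx_token, _idx_label)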
def evaluate(model, dataset, word_vocab, label_vocab):
    model.eval()
    losses = []
    scores = []
    true_tags = []
    pred_tags = []
    sents = []
    for i, (sent, tags) in enumerate(dataset):
        with torch.no_grad():
            sent, tags = sent.to(device), tags.to(device)
            sent = sent.unsqueeze(0)
            tags = tags.unsqueeze(0)
            losses.append(model.loss(sent, tags).cpu().detach().item())
            score, pred_tag_seq = model(sent)
            scores.append(score.cpu().detach().numpy())
            true_tags.append([label_vocab.itos[i] for i in tags.tolist()[0]])
            pred_tags.append([label_vocab.itos[i] for i in pred_tag_seq[0]])
            sents.append([word_vocab.itos[i] for i in sent[0]])
    print('Avg evaluation loss:', np.mean(losses))
    acc, rec, f1 = conlleval.evaluate(
        [tag for tags in true_tags for tag in tags],
        [tag for tags in pred_tags for tag in tags],
        verbose=True)

    # print('\n5 random evaluation samples:')
    # for idx in np.random.randint(0, len(sents), size=5):
    #     print('SENT:', ' '.join(sents[idx]))
    #     print('TRUE:', ' '.join(true_tags[idx]))
    #     print('PRED:', ' '.join(pred_tags[idx]))
    #     print('-' * 20)

    return sents, true_tags, pred_tags, f1
def get_results(self, name):
    p = (float(self.main_correct_count) / float(self.main_predicted_count)) if (self.main_predicted_count > 0) else 0.0
    r = (float(self.main_correct_count) / float(self.main_total_count)) if (self.main_total_count > 0) else 0.0
    f = (2.0 * p * r / (p + r)) if (p + r > 0.0) else 0.0
    f05 = ((1.0 + 0.5 * 0.5) * p * r / ((0.5 * 0.5 * p) + r)) if (p + r > 0.0) else 0.0

    results = collections.OrderedDict()
    results[name + "_cost_avg"] = self.cost_sum / float(self.token_count)
    results[name + "_cost_sum"] = self.cost_sum
    results[name + "_main_predicted_count"] = self.main_predicted_count
    results[name + "_main_total_count"] = self.main_total_count
    results[name + "_main_correct_count"] = self.main_correct_count
    results[name + "_p"] = p
    results[name + "_r"] = r
    results[name + "_f"] = f
    results[name + "_f05"] = f05
    results[name + "_accuracy"] = self.correct_sum / float(self.token_count)
    results[name + "_token_count"] = self.token_count
    results[name + "_time"] = float(time.time()) - float(self.start_time)
    results[name + "_correct_sum"] = self.correct_sum

    if self.label2id is not None and self.conll_eval == True:
        conll_counts = conlleval.evaluate(self.conll_format)
        conll_metrics_overall, conll_metrics_by_type = conlleval.metrics(conll_counts)
        results[name + "_conll_accuracy"] = float(conll_counts.correct_tags) / float(conll_counts.token_counter)
        results[name + "_conll_p"] = conll_metrics_overall.prec
        results[name + "_conll_r"] = conll_metrics_overall.rec
        results[name + "_conll_f"] = conll_metrics_overall.fscore
        # for i, m in sorted(conll_metrics_by_type.items()):
        #     results[name + "_conll_p_" + str(i)] = m.prec
        #     results[name + "_conll_r_" + str(i)] = m.rec
        #     results[name + "_conll_f_" + str(i)] = m.fscore
        #     # str(m.fscore) + " " + str(conll_counts.t_found_guessed[i])

    return results, self.conll_format
def test_format():
    words = "Shyam lives in New York .".split()
    gold = "B-PER O O B-LOC I-LOC O".split()
    pred = "B-PER O O B-LOC O O".split()

    print("Testing inputting the wrong format. This should get an exception")
    try:
        evaluate([1, 2, 3])
    except Exception as e:
        print(e)

    pred = "B-PER O O B-LOC I-MISC O".split()
    # Gold has PER[Shyam] and LOC[New York]; pred has PER[Shyam], LOC[New] and
    # MISC[York]: 1 tp, 2 fp, 1 fn -> P = 1/3, R = 1/2, F1 = 0.4.
    print("This should be 40% F1")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, gold, pred)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 0.4
def conlleval_evaluate(documents):
    """Return conlleval evaluation results for Documents as counts."""
    # conlleval.py has a file-based API, so use StringIO
    conll_string = StringIO()
    write_conll(documents, out=conll_string)
    conll_string.seek(0)
    return evaluate(conll_string)
def eval(self, dataset_name, log_output=None):
    dataset = self.datasets.get(dataset_name, None)
    if dataset is None:
        return
    results = []
    logger.info('Evaluating {} ({})'.format(self.name, dataset_name))
    set_loss = 0
    for tokens, labels, chars, seq_lens, char_lens in dataset.get_dataset(
            volatile=True, gpu=self.gpu):
        preds, loss = self.model.predict(tokens, labels, seq_lens, chars, char_lens)
        set_loss += float(loss.data[0])
        for pred, gold, seq_len, ts in zip(preds, labels, seq_lens, tokens):
            l = int(seq_len.data[0])
            pred = pred.data.tolist()[:l]
            gold = gold.data.tolist()[:l]
            ts = ts.data.tolist()[:l]
            for p, g, t in zip(pred, gold, ts):
                t = self.idx_token.get(t, 'UNK')
                results.append('{} {} {}'.format(t, self.idx_label[g], self.idx_label[p]))
            results.append('')
    counts = evaluate(results)
    overall, by_type = metrics(counts)
    report(counts)
    logger.info('Loss: {:.5f}'.format(set_loss))
    return SCORES(fscore=overall.fscore,
                  precision=overall.prec,
                  recall=overall.rec,
                  loss=set_loss)
def main(argv):
    argparser = argument_parser()
    args = argparser.parse_args(argv[1:])
    seq_len = args.max_seq_length    # abbreviation

    pretrained_model, tokenizer = load_pretrained(args)

    train_words, train_tags = read_conll(args.train_data)
    test_words, test_tags = read_conll(args.test_data)
    train_data = process_sentences(train_words, train_tags, tokenizer, seq_len)
    test_data = process_sentences(test_words, test_tags, tokenizer, seq_len)

    label_list = get_labels(train_data.labels)
    tag_map = {l: i for i, l in enumerate(label_list)}
    inv_tag_map = {v: k for k, v in tag_map.items()}

    init_prob, trans_prob = viterbi_probabilities(train_data.labels, tag_map)

    train_x = encode(train_data.combined_tokens, tokenizer, seq_len)
    test_x = encode(test_data.combined_tokens, tokenizer, seq_len)

    train_y, train_weights = label_encode(train_data.combined_labels, tag_map, seq_len)
    test_y, test_weights = label_encode(test_data.combined_labels, tag_map, seq_len)

    ner_model = create_ner_model(pretrained_model, len(tag_map))
    optimizer = create_optimizer(len(train_x[0]), args)

    ner_model.compile(optimizer,
                      loss='sparse_categorical_crossentropy',
                      sample_weight_mode='temporal',
                      metrics=['sparse_categorical_accuracy'])

    ner_model.fit(train_x,
                  train_y,
                  sample_weight=train_weights,
                  epochs=args.num_train_epochs,
                  batch_size=args.batch_size)

    if args.ner_model_dir is not None:
        label_list = [v for k, v in sorted(list(inv_tag_map.items()))]
        save_ner_model(ner_model, tokenizer, label_list, args)
        save_viterbi_probabilities(init_prob, trans_prob, inv_tag_map, args)

    probs = ner_model.predict(test_x, batch_size=args.batch_size)
    preds = np.argmax(probs, axis=-1)

    pred_tags = []
    for i, pred in enumerate(preds):
        pred_tags.append(
            [inv_tag_map[t] for t in pred[1:len(test_data.tokens[i]) + 1]])

    lines = write_result(args.output_file, test_data.words, test_data.lengths,
                         test_data.tokens, test_data.labels, pred_tags)

    c = conlleval.evaluate(lines)
    conlleval.report(c)
    return 0
def check_path(self, path, files, zip_data):
    logging.info("path: {}".format(path))
    logging.info("files: {}".format(files))
    testfile_path = ''
    testfile_key = ''
    path_key = ''
    for filename in files:
        logging.info("testing filename: {}".format(filename))
        if path is None or path == '':
            logging.info("filename={}".format(filename))
            testfile_path = os.path.abspath(os.path.join(self.ref_dir, filename))
            testfile_key = filename
            path_key = filename
        else:
            logging.info("path={}".format(path))
            testfile_path = os.path.abspath(os.path.join(self.ref_dir, path, filename))
            testfile_key = os.path.join(path, filename)
            path_key = path
        logging.info("path_key={}".format(path_key))

        # set up score value for matching output correctly
        score = self.default_score
        if path_key in self.path_score:
            score = self.path_score[path_key]

        tally = 0.0
        self.perf[path_key] = 0.0
        logging.info("Checking {}".format(testfile_key))
        if testfile_key in zip_data:
            with open(testfile_path, 'rt') as ref:
                ref_data = list(
                    filter(lambda k: k,
                           [str(x).strip() for x in ref.read().splitlines()]))
                output_data = list(
                    filter(lambda k: k,
                           [str(x, 'utf-8').strip()
                            for x in zip_data[testfile_key].splitlines()]))
                output_data = output_data[:len(ref_data)]
                if len(ref_data) == len(output_data):
                    logging.info("ref, output {}".format(
                        list(zip(ref_data, output_data))))
                    (prec, recall, tally) = conlleval.evaluate(ref_data, output_data)
                    logging.info("score {}: {}".format(testfile_key, tally))
                else:
                    logging.info("length mismatch between output and reference")
                    tally = 0.
            self.perf[path_key] = tally
def evaluate(args, data, model, id2label, all_ori_tokens):
    model.eval()
    sampler = SequentialSampler(data)
    dataloader = DataLoader(data, sampler=sampler, batch_size=args.train_batch_size)

    logger.info("***** Running eval *****")
    # logger.info(f" Num examples = {len(data)}")
    # logger.info(f" Batch size = {args.eval_batch_size}")

    pred_labels = []
    ori_labels = []
    for b_i, (input_ids, input_mask, segment_ids, label_ids, bbox, bbox_pos_id,
              bbox_num) in enumerate(tqdm(dataloader, desc="Evaluating")):
        input_ids = input_ids.to(args.device)
        input_mask = input_mask.to(args.device)
        segment_ids = segment_ids.to(args.device)
        label_ids = label_ids.to(args.device)
        bbox = bbox.to(args.device)
        bbox_pos_id = bbox_pos_id.to(args.device)
        bbox_num = bbox_num.to(args.device)

        with torch.no_grad():
            logits = model.predict(input_ids, segment_ids, input_mask, bbox,
                                   bbox_pos_id, bbox_num)
            # logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            # logits = logits.detach().cpu().numpy()

        for l in logits:  # logits -> List[List[int]]
            pred_labels.append([id2label[idx] for idx in l])
        for l in label_ids:  # tensor
            ori_labels.append([id2label[idx.item()] for idx in l])

    eval_list = []
    for ori_tokens, oril, prel in zip(all_ori_tokens, ori_labels, pred_labels):
        for ot, ol, pl in zip(ori_tokens, oril, prel):
            if ot in ["[CLS]", "[SEP]"]:
                continue
            if len(f"{ot} {ol} {pl}\n".split(" ")) != 3:
                continue
            eval_list.append(f"{ot} {ol} {pl}\n")
        eval_list.append("\n")

    # eval the model
    counts = conlleval.evaluate(eval_list)
    conlleval.report(counts)

    # namedtuple('Metrics', 'tp fp fn prec rec fscore')
    overall, by_type = conlleval.metrics(counts)
    return overall, by_type
def evaluating_batch(model, datas):
    save = False
    adv = 0
    true_tags_all = []
    pred_tags_all = []
    macro = []
    for data in datas:
        true_tags = []
        pred_tags = []
        sentence_in = Variable(torch.LongTensor(data['words']))
        chars2_mask = Variable(torch.LongTensor(data['chars']))
        caps = Variable(torch.LongTensor(data['caps']))
        targets = torch.LongTensor(data['tags'])
        chars2_length = data['char_length']
        word_length = data['word_length']
        ground_truth_id = data['tags'][0]
        if use_gpu:
            sentence_in = sentence_in.cuda()
            targets = targets.cuda()
            chars2_mask = chars2_mask.cuda()
            caps = caps.cuda()

        val, out = model(sentence=sentence_in, caps=caps, chars=chars2_mask,
                         chars2_length=chars2_length, word_length=word_length)
        predicted_id = out
        for (true_id, pred_id) in zip(ground_truth_id, predicted_id[0]):
            true_tags.append(id_to_tag[true_id])
            pred_tags.append(id_to_tag[pred_id])
        true_tags_all.extend(true_tags)
        pred_tags_all.extend(pred_tags)

        df = pd.DataFrame({'true': true_tags, 'pred': pred_tags})
        if sum(df['true'] != df['pred']) > 0:
            adv = adv + 1
        df = df[df['true'] != 'O']  # only tags
        if len(df) != 0:
            macro.append(sum(df['true'] == df['pred']) / len(df))

    df_tags = pd.DataFrame({'true': true_tags_all, 'pred': pred_tags_all})
    df_tags = df_tags[df_tags['true'] != 'O']
    print('Micro acc_tag:', sum(df_tags['true'] == df_tags['pred']) / len(df_tags))
    print('Macro acc_tag:', np.mean(macro))

    prec, rec, new_F = evaluate(true_tags_all, pred_tags_all, verbose=False)
    print('F1:', new_F)
    print('Hit:', adv / len(datas))
def evaluate(args, task_id, data, model, id2label, all_ori_words, file_name=None):
    model.eval()
    sampler = SequentialSampler(data)
    dataloader = DataLoader(data, sampler=sampler, batch_size=args.train_batch_size)
    task_id = torch.tensor(task_id, dtype=torch.long).to(args.device)

    logger.info("***** Running eval *****")
    logger.info(f" Num examples = {len(data)}")

    pred_labels = []
    ori_labels = []
    for b_i, batch in enumerate(tqdm(dataloader, desc="Evaluating")):
        batch = tuple(t.to(args.device) for t in batch)
        if args.need_charcnn:
            input_word_ids, input_mask, label_ids, label_mask, char_ids = batch
        else:
            input_word_ids, input_mask, label_ids, label_mask = batch
            char_ids = None
        with torch.no_grad():
            logits = model.predict(task_id, input_word_ids, char_ids, input_mask)

        # print(len(all_ori_words), [len(x) for x in all_ori_words])
        # print(len(logits), [len(x) for x in logits])
        # print(len(label_ids), [len(x) for x in label_ids])
        # print(len(input_mask), [sum(x) for x in input_mask])
        # print(len(label_mask), [sum(x) for x in label_mask])

        for predL, goldL, maskL in zip(logits, label_ids, label_mask):
            for p, g, mask in zip(predL, goldL, maskL):
                if mask.item() == 1:
                    pred_labels.append(id2label[p])
                    ori_labels.append(id2label[g.item()])
            pred_labels.append(None)
            ori_labels.append(None)

    ori_words = []
    for sent in all_ori_words:
        ori_words.extend(sent + [None])

    eval_list = []
    # print(len(pred_labels), len(ori_labels), len(ori_words))
    for plabel, olabel, word in zip(pred_labels, ori_labels, ori_words):
        if plabel is not None:
            eval_list.append(f"{word} {olabel} {plabel}\n")
        else:
            eval_list.append("\n")

    if file_name is not None:
        with open(file_name, "w", encoding="utf-8") as f:
            for line in eval_list:
                f.write(line)

    # eval the model
    counts = conlleval.evaluate(eval_list)
    conlleval.report(counts)
def evaluate_f1(labels_pred_list, label_true_list):
    id2tag = {
        test_data_loader.tag2label[key]: key
        for key in test_data_loader.tag2label
    }
    res_pred = []
    for label_ in labels_pred_list:
        res_pred.extend([id2tag[l] for l in label_])
    res_true = []
    for label_ in label_true_list:
        res_true.extend([id2tag[l] for l in label_])
    # conlleval expects the gold sequence first, then the predictions.
    prec, rec, f1 = evaluate(res_true, res_pred, verbose=True)
    return prec, rec, f1
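# Toy sanity check (a sketch, not from the original code): it assumes the
# conlleval variant used above, which takes flat gold and predicted BIO tag
# sequences and returns (precision, recall, f1).
_gold_tags = ["B-PER", "O", "O", "B-LOC", "I-LOC"]
_pred_tags = ["B-PER", "O", "O", "B-LOC", "O"]
# One of two entities is matched exactly, so precision = recall = F1 = 50%.
_prec, _rec, _f1 = evaluate(_gold_tags, _pred_tags, verbose=False)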
def calculate_labeling_scores(results, report=True):
    outputs = []
    for p_b, g_b, t_b, l_b in results:
        for p_s, g_s, t_s, l_s in zip(p_b, g_b, t_b, l_b):
            p_s = p_s[:l_s]
            for p, g, t in zip(p_s, g_s, t_s):
                outputs.append('{} {} {}'.format(t, g, p))
            outputs.append('')
    counts = conlleval.evaluate(outputs)
    overall, by_type = conlleval.metrics(counts)
    if report:
        conlleval.report(counts)
    return (overall.fscore * 100.0, overall.prec * 100.0, overall.rec * 100.0)
def eval2(model):
    y_true = []
    y_pred = []
    for sentence, tags in zip(test_sentences, test_tags):
        tags_pred = model.decode([sentence])
        for word, true_tag, pred_tag in zip(sentence, tags, tags_pred[0]):
            y_true.append(true_tag)
            y_pred.append(pred_tag)
    precision, recall, f1_score = evaluate(y_true, y_pred, verbose=False)
    # Use %-formatting: the template uses %-style placeholders, so .format()
    # would return it unchanged.
    return "Precision: %.2f%%\tRecall: %.2f%%\tF1_score: %.2f%%" % (
        precision, recall, f1_score)
def inner_train_one_step(self, batches, epochNum, task_name, log_writer, log_dir):
    '''
    :param self:
    :param batches: one batch data: [[sentence],[sentence],....]
        sentence=[[chars],[charids],[tags],[tag_ids]]
    :param inner_epochNum:
    :return:
    '''
    # tf.summary.trace_on(graph=True, profiler=True)  # enable tracing (optional)
    batch_Nums = len(batches)
    losses, P_ts, R_ts, F1_ts = [], [], [], []
    # =====run model=======
    with tqdm(total=batch_Nums) as bar:
        for batch_num in range(batch_Nums):
            batch = batches[batch_num]
            seq_ids_padded, tag_ids_padded, seq_len_list = get_train_data_from_batch(batch)
            with tf.GradientTape() as tape:
                # print(batch[0])  # for debugging
                logits = self(seq_ids_padded)
                loss = self.crf_loss(logits, tag_ids_padded, seq_len_list)
                pred_tags, pred_best_score = crf.crf_decode(
                    potentials=logits,
                    transition_params=self.trans_p,
                    sequence_length=seq_len_list)
            grads = tape.gradient(loss, self.trainable_variables)
            self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
            # optimizer.minimize(loss, [myModel_bilstm.trainable_variables])

            pred_tags_masked = seq_masking(pred_tags, seq_len_list)
            p_tags_char, p_tagsid_flatten = get_id2tag(pred_tags_masked, taskname=task_name)
            t_tags_char, t_tagsid_flatten = get_id2tag(tag_ids_padded, taskname=task_name)
            (P_t, R_t, F1_t), _ = evaluate(t_tags_char, p_tags_char, verbose=False)

            losses.append(loss)
            P_ts.append(P_t)
            R_ts.append(R_t)
            F1_ts.append(F1_t)
            print('train_loss:{}, train_P:{}'.format(loss, P_t))
            bar.update(1)

    with log_writer.as_default():
        tf.summary.scalar("loss", np.mean(losses), step=epochNum)
        tf.summary.scalar("P", np.mean(P_ts), step=epochNum)
        tf.summary.scalar("R", np.mean(R_ts), step=epochNum)
        tf.summary.scalar("F1", np.mean(F1_ts), step=epochNum)
def compare(gold_toks, gold_tags, pred_toks, pred_tags):
    if len(gold_toks) != len(pred_toks):
        raise ValueError('sentence count mismatch: {} in gold, {} in pred'.format(
            len(gold_toks), len(pred_toks)))
    lines = []
    for g_toks, g_tags, p_toks, p_tags in zip(gold_toks, gold_tags,
                                              pred_toks, pred_tags):
        if g_toks != p_toks:
            raise ValueError('text mismatch: gold "{}", pred "{}"'.format(
                g_toks, p_toks))
        for (g_tok, g_tag, p_tag) in zip(g_toks, g_tags, p_tags):
            lines.append('{}\t{}\t{}'.format(g_tok, g_tag, p_tag))
    return conlleval.report(conlleval.evaluate(lines))
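# Example use of compare() above -- illustrative data only, assuming the
# conlleval port whose report() prints the per-type P/R/F summary. Each
# argument is a list of per-sentence lists, and the token lists must match.
_toks = [["Shyam", "lives", "in", "New", "York"]]
_gold = [["B-PER", "O", "O", "B-LOC", "I-LOC"]]
_pred = [["B-PER", "O", "O", "B-LOC", "O"]]
compare(_toks, _gold, _toks, _pred)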
def calculate_metrics(dataset):
    all_true_tag_ids, all_predicted_tag_ids = [], []

    for x, y in dataset:
        output = ner_model.predict(x)
        predictions = np.argmax(output, axis=-1)
        predictions = np.reshape(predictions, [-1])

        true_tag_ids = np.reshape(y, [-1])

        mask = (true_tag_ids > 0) & (predictions > 0)
        true_tag_ids = true_tag_ids[mask]
        predicted_tag_ids = predictions[mask]

        all_true_tag_ids.append(true_tag_ids)
        all_predicted_tag_ids.append(predicted_tag_ids)

    all_true_tag_ids = np.concatenate(all_true_tag_ids)
    all_predicted_tag_ids = np.concatenate(all_predicted_tag_ids)

    predicted_tags = [mapping[tag] for tag in all_predicted_tag_ids]
    real_tags = [mapping[tag] for tag in all_true_tag_ids]

    evaluate(real_tags, predicted_tags)
def get_output_file(all_logit, all_label, decode, out):
    decode.pop(len(decode) - 1)
    assert len(all_logit) == len(all_label)
    evalseq = []
    for i in range(len(all_logit)):
        evalseq.append("{} {} {}".format(
            i,
            decode[int(all_label[i])] if int(all_label[i]) in decode.keys() else "O",
            decode[int(all_logit[i])] if int(all_logit[i]) in decode.keys() else "O",
        ))
    count = conlleval.evaluate(evalseq)
    conlleval.report(count, out)
def conll_eval_counts(ypred, ytruth, labels):
    ytruth_max = ytruth.argmax(axis=2)
    ypred_max = ypred.argmax(axis=2)

    conf_matrix = None
    eval_counts = ceval.EvalCounts()
    label_keys = ordered_label_keys(labels)

    for i in range(len(ypred_max)):
        true_seq = [labels[x] for x in ytruth_max[i].tolist()]
        pred_seq = [labels[x] for x in ypred_max[i].tolist()]

        c = ceval.evaluate(['%s %s' % x for x in zip(true_seq, pred_seq)])
        eval_counts.add(c)

        cm = confusion_matrix(true_seq, pred_seq, label_keys)
        conf_matrix = cm if conf_matrix is None else conf_matrix + cm

    return eval_counts, conf_matrix
def inner_train_one_step(self, batches, inner_iters, inner_epochNum,
                         outer_epochNum, task_name, log_writer):
    '''
    :param self:
    :param batches: one batch data: [[sentence],[sentence],....]
        sentence=[[chars],[charids],[tags],[tag_ids]]
    :param inner_epochNum:
    :return:
    '''
    batches_len = len(batches)
    # =====run model=======
    for batch_num in range(batches_len):
        batch = batches[batch_num]
        seq_ids_padded, tag_ids_padded, seq_len_list = get_train_data_from_batch(batch)
        with tf.GradientTape() as tape:
            logits = self(seq_ids_padded)
            loss = self.crf_loss(logits, tag_ids_padded, seq_len_list)
            pred_tags, pred_best_score = crf.crf_decode(
                potentials=logits,
                transition_params=self.trans_p,
                sequence_length=seq_len_list)
        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        # optimizer.minimize(loss, [myModel_bilstm.trainable_variables])

        pred_tags_masked = seq_masking(pred_tags, seq_len_list)
        p_tags_char, p_tagsid_flatten = get_id2tag(pred_tags_masked, taskname=task_name)
        t_tags_char, t_tagsid_flatten = get_id2tag(tag_ids_padded, taskname=task_name)
        (P_t, R_t, F1_t), _ = evaluate(t_tags_char, p_tags_char, verbose=False)

        with log_writer.as_default():
            step = batch_num + 1 + inner_epochNum * batches_len
            tf.summary.scalar("loss", loss,
                              step=inner_epochNum + outer_epochNum * inner_iters)
            tf.summary.scalar("P", P_t, step=inner_epochNum)
            tf.summary.scalar("R", R_t, step=inner_epochNum)
            tf.summary.scalar("F", F1_t, step=inner_epochNum)
    return (loss, P_t)
def inner_train_one_step(self, batches, inner_epochNum, ckpt_manager, log_writer=None):
    '''
    :param self:
    :param batches: one batch data: [[sentence],[sentence],....]
        sentence=[[chars],[charids],[tags],[tag_ids]]
    :param inner_epochNum:
    :return:
    '''
    batch_size = len(batches)
    print('======================== batch size', batch_size)
    # =====run model=======
    with tqdm(total=batch_size) as bar:
        for batch_num in range(batch_size):
            batch = batches[batch_num]
            seq_ids_padded, tag_ids_padded, seq_len_list = get_train_data_from_batch(batch)
            with tf.GradientTape() as tape:
                logits = self(seq_ids_padded)
                loss = self.crf_loss(logits, tag_ids_padded, seq_len_list)
                pred_tags, pred_best_score = crf.crf_decode(
                    potentials=logits,
                    transition_params=self.trans_p,
                    sequence_length=seq_len_list)
            grads = tape.gradient(loss, self.trainable_variables)
            self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
            # optimizer.minimize(loss, [myModel_bilstm.trainable_variables])
            bar.update(1)

            pred_tags_masked = seq_masking(pred_tags, seq_len_list)
            p_tags_char, p_tagsid_flatten = get_id2tag(pred_tags_masked)
            t_tags_char, t_tagsid_flatten = get_id2tag(tag_ids_padded)
            try:
                (P_t, R_t, F1_t), _ = evaluate(t_tags_char, p_tags_char, verbose=True)
            except Exception as e:
                print(e)

            with log_writer.as_default():
                step = batch_num + 1 + inner_epochNum * batch_size
                tf.summary.scalar("loss", loss, step=inner_epochNum)
                tf.summary.scalar("P", P_t, step=inner_epochNum)
                tf.summary.scalar("R", R_t, step=inner_epochNum)
                tf.summary.scalar("F", F1_t, step=inner_epochNum)
    ckpt_manager.save(checkpoint_number=inner_epochNum)
def evaluate(results, idx_token, idx_label, writer=None):
    """Evaluate prediction results.

    :param results: A List of which each item is a tuple
        (predictions, gold labels, sequence lengths, tokens) of a batch.
    :param idx_token: Index to token dictionary.
    :param idx_label: Index to label dictionary.
    :param writer: An object (file object) with a write() function. Extra output.
    :return: F-score, precision, and recall.
    """
    # b: batch, s: sequence
    outputs = []
    # preds: predictions
    # golds: answers?
    # len: length of something
    # tokens: original words?
    for preds_b, golds_b, len_b, tokens_b in results:
        for preds_s, golds_s, len_s, tokens_s in zip(preds_b, golds_b, len_b, tokens_b):
            l = int(len_s.item())
            preds_s = preds_s.data.tolist()[:l]
            golds_s = golds_s.data.tolist()[:l]
            tokens_s = tokens_s.data.tolist()[:l]
            for p, g, t in zip(preds_s, golds_s, tokens_s):
                token = idx_token.get(t, C.UNK)
                # if token == '':  # debug
                #     token = '<$UNK$>'
                # print(idx_token)  # debug
                # print("p: ", p, ", g: ", g, ", t: ", t,
                #       ", corresponding token:", token, "|")  # DEBUG
                outputs.append('{} {} {}'.format(token,
                                                 idx_label.get(g, 0),
                                                 idx_label.get(p, 0)))
            outputs.append('')
    # print("OUTPUTS: ", outputs)  # DEBUG
    # seems like outputs is right but counts is wrong
    # Why is english-covered-test not like the other, uncovered datasets?
    # Is this causing an issue?
    counts = conlleval.evaluate(outputs)
    # print("counts: ", counts)  # DEBUG
    overall, by_type = conlleval.metrics(counts)
    conlleval.report(counts)
    if writer:
        conlleval.report(counts, out=writer)
        writer.flush()
    return overall.fscore, overall.prec, overall.rec
def conll_summary(tokens, gold, pred, config):
    """Return string summarizing performance using CoNLL criteria."""
    index_to_label = {v: k for k, v in config.label_to_index.items()}
    acc = accuracy(gold, pred)
    # Wrap in list() so the length checks below also work under Python 3,
    # where map() returns an iterator.
    gold = list(map(lambda i: index_to_label[i], as_dense(gold)))
    pred = list(map(lambda i: index_to_label[i], as_dense(pred)))
    # Format as space-separated (token, gold, pred) strings for CoNLL eval.
    if len(tokens) != len(gold) or len(gold) != len(pred):
        raise ValueError('counts do not match')
    formatted = [' '.join(t) for t in zip(tokens, gold, pred)]
    o, by_type = conlleval.metrics(conlleval.evaluate(formatted))
    nlen = max(len(name) for name in by_type.keys())
    summaries = ['%.2f%% acc %.2f%% f (%.1fp %.1fr %dtp %dfp %dfn)' % (
        100. * acc, 100. * o.fscore, 100. * o.prec, 100. * o.rec,
        o.tp, o.fp, o.fn
    )]
    for name, r in sorted(by_type.items()):
        summaries.append('%*s %.2f%% f (%.1fp %.1fr %dtp %dfp %dfn)' % (
            nlen, name, 100. * r.fscore, 100. * r.prec, 100. * r.rec,
            r.tp, r.fp, r.fn
        ))
    return '\n'.join(summaries)