def testEvaluation_4(self):
    """An all-'O' prediction against non-empty gold spans must score F1 = 0."""
    gold = [['B-TAR', 'I-TAR', 'O', 'B-HYP'],
            ['B-TAR', 'O', 'O', 'B-HYP']]
    pred = [['O', 'O', 'O', 'O'],
            ['O', 'O', 'O', 'O']]
    self.assertEqual(evaluate(gold, pred), 0.0)
def testEvaluation_8(self):
    """Mixed hits and misses across two sentences.

    NOTE(review): the gold second sentence contains the literal tag '0'
    (digit zero, not the letter 'O') and the predicted second sentence has
    5 tokens against the gold's 6.  The hard-coded expected counts (3/6,
    3/5) are consistent with this data, so these look like deliberate
    malformed-input checks — confirm before "fixing" either list.
    """
    gold = [['B-TAR', 'B-TAR', 'I-TAR', 'B-HYP', 'O', 'O'],
            ['B-TAR', 'O', 'O', 'B-HYP', '0', 'I-HYP']]
    pred = [['B-TAR', 'B-TAR', 'I-TAR', 'O', 'B-TAR', 'I-TAR'],
            ['I-TAR', 'O', 'B-HYP', 'B-HYP', 'B-TAR']]
    expected = f1_score(3 / 6, 3 / 5)
    self.assertEqual(evaluate(gold, pred), expected)
def testEvaluation_6(self):
    """Partially overlapping spans; expected score combines 1/5 and 1/1."""
    gold = [['B-TAR', 'I-TAR', 'B-TAR', 'B-HYP'],
            ['B-TAR', 'O', 'O', 'B-HYP']]
    pred = [['B-TAR', 'I-TAR', 'I-TAR', 'O'],
            ['I-TAR', 'O', 'O', 'O']]
    self.assertEqual(evaluate(gold, pred), f1_score(1 / 5, 1 / 1))
def testEvaluation_5(self):
    """Long, mostly-'O' sentences; expected score combines 2/4 and 2/3.

    The long runs of 'O' are built with list arithmetic; the resulting
    lists are element-for-element identical to fully spelled-out literals.
    """
    o = ['O']
    gold = [['B-TAR'] + o * 2 + ['B-HYP', 'I-HYP'] + o * 15,
            o * 9 + ['B-TAR'] + o * 4 + ['B-HYP', 'I-HYP', 'I-HYP'] + o * 8]
    pred = [['B-TAR'] + o * 3 + ['B-HYP'] + o * 15,
            o + ['I-TAR'] + o * 7 + ['B-TAR'] + o * 13 + ['I-HYP'] + o]
    self.assertEqual(evaluate(gold, pred), f1_score(2 / 4, 2 / 3))
def testEvaluation_9(self):
    """Identical all-'O' inputs (no spans on either side) score a perfect 1."""
    blank = [['O'] * 20, ['O'] * 25]
    # Pass an independent copy as the prediction so the two arguments do
    # not share list objects.
    self.assertEqual(evaluate(blank, [row[:] for row in blank]), 1)
# NOTE(review): this chunk begins mid-statement — the call that ends with
# `loss.view(-1).data.tolist()[0])` starts before the visible region
# (presumably reporting the scalar loss inside the training loop).
# Indentation below is reconstructed; the original was whitespace-mangled.
loss.view(-1).data.tolist()[0])
pbar.update(1)
# keep the model with best f1 on development set, if the flag is True
if _config.use_f1:
    # Switch to inference mode before decoding the dev set.
    model.eval()
    # Accumulate per-sentence tag-string sequences for predictions and gold.
    pred_dev_ins, golden_dev_ins = [], []
    for batch_sentence_len_list, batch_word_index_lists, batch_word_mask, batch_char_index_matrices, batch_char_mask, batch_word_len_lists, batch_tag_index_list in dev:
        # Decode one batch to tag-index sequences (decoder internals not visible here).
        pred_batch_tag = model.decode(batch_word_index_lists, batch_sentence_len_list, batch_char_index_matrices, batch_word_len_lists, batch_char_mask)
        # Trim padding to each sentence's true length and map tag indices
        # back to tag strings via reversed_tag_dict.
        pred_dev_ins += [[reversed_tag_dict[t] for t in tag[:l]] for tag, l in zip(pred_batch_tag.data.tolist(), batch_sentence_len_list.data.tolist())]
        golden_dev_ins += [[reversed_tag_dict[t] for t in tag[:l]] for tag, l in zip(batch_tag_index_list.data.tolist(), batch_sentence_len_list.data.tolist())]
    # print(golden_dev_ins)
    new_f1 = evaluate(golden_dev_ins, pred_dev_ins)
    # Checkpoint only when dev F1 improves on the best seen so far.
    if new_f1 > best_f1:
        model_state = model.state_dict()
        torch.save(model_state, _config.model_file)
        best_f1 = new_f1
# else we just keep the newest model
else:
    model_state = model.state_dict()
    torch.save(model_state, _config.model_file)
from todo import evaluate

# Quick manual smoke test: score one hand-built gold/prediction pair and
# print the resulting F1.
gold_tags = [['B-TAR', 'I-TAR', 'I-TAR', 'B-HYP'],
             ['B-TAR', 'O', 'O', 'B-HYP']]
pred_tags = [['B-TAR', 'B-TAR', 'I-HYP', 'O'],
             ['I-TAR', 'O', 'O', 'O']]
print(evaluate(gold_tags, pred_tags))
def test_evaluation(self):
    """Exercise todo.evaluate on hand-built gold/prediction tag lists.

    Cases with an assert pin the expected (3-decimal) F1; cases without
    one only print the score for manual inspection.

    NOTE(review): the asserts use exact equality on 3-decimal floats,
    which assumes evaluate() rounds its result — assertAlmostEqual with
    places=3 would be safer; confirm evaluate()'s contract.

    Fixes vs. the original: one case that was repeated verbatim four
    times is now run once; a triple-quoted string misused as a comment
    was converted to real comments; typo "shuld" in the printed message
    corrected.
    """
    # One missed TAR span plus a misplaced HYP span.
    golden_list = [['B-TAR', 'I-TAR', 'O', 'B-HYP'], ['B-TAR', 'O', 'O', 'B-HYP']]
    predict_list = [['B-TAR', 'O', 'O', 'O'], ['B-TAR', 'O', 'B-HYP', 'I-HYP']]
    result = todo.evaluate(golden_list, predict_list)
    print("answers should be this " + str(result))
    self.assertEqual(result, 0.286)

    # Perfect match (auto-generated case) — printed only, no assert.
    golden_list = [['B-TAR', 'I-TAR', 'O', 'B-HYP'], ['B-TAR', 'I-TAR', 'O', 'B-HYP']]
    predict_list = [['B-TAR', 'I-TAR', 'O', 'B-HYP'], ['B-TAR', 'I-TAR', 'O', 'B-HYP']]
    result = todo.evaluate(golden_list, predict_list)
    print("answers should be this " + str(result))

    # Truncated I-TAR inside an otherwise correct sentence — printed only.
    golden_list = [['B-TAR', 'I-TAR', 'I-TAR', 'B-HYP', 'I-HYP', 'I-HYP', 'O']]
    predict_list = [['B-TAR', 'I-TAR', 'O', 'B-HYP', 'I-HYP', 'I-HYP', 'O']]
    result = todo.evaluate(golden_list, predict_list)
    print("answers should be this " + str(result))

    # No spans at all — printed only.
    golden_list = [['O', 'O']]
    predict_list = [['O', 'O']]
    result = todo.evaluate(golden_list, predict_list)
    print("answers should be this " + str(result))

    # B-HYP with I-HYP, simple cases: 2 true positive, 2 false negative,
    # 2 false positive.
    golden_list = [['B-TAR', 'O', 'B-HYP', 'I-HYP'], ['B-TAR', 'O', 'O', 'B-HYP']]
    predict_list = [['B-TAR', 'O', 'O', 'B-HYP'], ['B-TAR', 'O', 'B-HYP', 'I-HYP']]
    result = todo.evaluate(golden_list, predict_list)
    print("answers should be this " + str(result))
    self.assertEqual(result, 0.5)

    # Two different ways of a simple B-HYP prediction mistake:
    # 2 true positive, 2 false negative.
    golden_list = [['B-TAR', 'O', 'O', 'B-HYP'], ['B-TAR', 'O', 'O', 'B-HYP', 'O']]
    predict_list = [['B-TAR', 'O', 'O', 'B-HYP'], ['B-TAR', 'O', 'O', 'O', 'O']]
    result = todo.evaluate(golden_list, predict_list)
    print("answers should be this " + str(result))
    self.assertEqual(result, 0.857)

    # B-TAR with I-TAR mismatches across three sentences.
    golden_list = [['B-TAR', 'I-TAR', 'O', 'B-HYP'], ['B-TAR', 'O', 'O', 'B-HYP'],
                   ['B-TAR', 'I-TAR', 'O', 'B-HYP']]
    predict_list = [['B-TAR', 'O', 'O', 'B-HYP'], ['B-TAR', 'I-TAR', 'O', 'B-HYP'],
                    ['B-TAR', 'I-TAR', 'O', 'B-HYP']]
    result = todo.evaluate(golden_list, predict_list)
    print("answers should be this " + str(result))
    self.assertEqual(result, 0.667)

    # Simple B-TAR and B-HYP with one spurious I-TAR in the prediction.
    golden_list = [['B-TAR', 'O', 'O', 'B-HYP'], ['B-TAR', 'O', 'O', 'B-HYP']]
    predict_list = [['B-TAR', 'O', 'O', 'B-HYP'], ['B-TAR', 'I-TAR', 'O', 'B-HYP']]
    result = todo.evaluate(golden_list, predict_list)
    print("answers should be this " + str(result))
    self.assertEqual(result, 0.75)

    # Missed B-HYP plus a misplaced B-HYP.  (The original repeated this
    # exact case four times; the duplicates added no coverage and were
    # removed.)
    golden_list = [['B-TAR', 'O', 'O', 'B-HYP'], ['B-TAR', 'O', 'O', 'B-HYP']]
    predict_list = [['B-TAR', 'O', 'O', 'O'], ['B-TAR', 'O', 'B-HYP', 'O']]
    result = todo.evaluate(golden_list, predict_list)
    print("answers should be this " + str(result))
    self.assertEqual(result, 0.571)

    # Multi-token TAR span broken up into a spurious HYP prediction.
    golden_list = [['B-TAR', 'I-TAR', 'I-TAR', 'B-HYP'], ['B-TAR', 'O', 'O', 'B-HYP']]
    predict_list = [['B-TAR', 'O', 'B-HYP', 'O'], ['B-TAR', 'O', 'B-HYP', 'O']]
    result = todo.evaluate(golden_list, predict_list)
    print("answers should be this " + str(result))
    self.assertEqual(result, 0.571)