# NOTE(review): this assignment is the tail of a loop whose header lies above
# this excerpt -- re-indent it to that loop's level when merging.  It removes
# the answer at index 1 from the question's answer list and records that
# answer's text as the prediction for the question id.
predictions[qa['id']] = qa['answers'].pop(1)['text']

# Score the collected predictions against the articles.
# Every print below passes a single pre-formatted string, so the output is
# identical under the Python 2 print statement and the Python 3 print function.
evaluator = Evaluator(articles=articles)
print('Exact match: %s' % round(evaluator.ExactMatch(predictions), 1))
print('F1: %s' % round(evaluator.F1(predictions), 1))

# Percentage share of every key in num_same_counts, in sorted key order.
total_num_same_count = sum(num_same_counts.values())
for num_same, count in sorted(num_same_counts.items()):
    print('%s same: %s' % (
        num_same, round(100.0 * count / total_num_same_count, 1)))

# Mapping of question id -> answer-type tag.
with open('dataset/dev-answertypetags.json') as fileobj:
    tags = json.load(fileobj)
print('%s tagged questions' % len(tags))

# Bucket the question ids by tag in one pass instead of re-filtering the whole
# mapping once per tag.  Ids keep the order in which tags.items() yields them,
# so the per-tag F1 sum is accumulated in the same order as before.
question_ids_by_tag = {}
for question_id, question_tag in tags.items():
    question_ids_by_tag.setdefault(question_tag, []).append(question_id)

# Report tags from most to least frequent.  The count supplied by
# most_common() is the number of questions carrying the tag, so it is used as
# the denominator directly (it is always >= 1).
for tag, num_total in Counter(tags.values()).most_common():
    num_correct = 0
    total_f1 = 0
    for question_id in question_ids_by_tag[tag]:
        predicted_answer = predictions.get(question_id)
        if predicted_answer is None:
            # No prediction: the question still counts in num_total but
            # contributes nothing to exact match or F1.
            continue
        if evaluator.ExactMatchSingle(question_id, predicted_answer):
            num_correct += 1
        total_f1 += evaluator.F1Single(question_id, predicted_answer)
    print('%s%% %s questions, exact match %s%% , F1 %s' % (
        round(100.0 * num_total / len(tags), 1),
        tag,
        round(100.0 * num_correct / num_total, 1),
        round(100.0 * total_f1 / num_total, 1)))
# Load the articles used for the human baseline.
with open(jsonDataFile, "r") as fp:
    human_articles = json.load(fp)['data']

# For every question with more than one answer, take the answer at index 1 out
# of the list and use its text as the human prediction.  Because the answer is
# popped, the articles handed to Evaluator below no longer contain it.
for article in human_articles:
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            if len(qa['answers']) > 1:
                human_predictions[qa['id']] = qa['answers'].pop(1)['text']
human_evaluator = Evaluator(articles=human_articles)

# Average model F1 and human F1 (both scaled by 100) per edit-distance group,
# visiting the groups in ascending key order.  One value is appended to each
# result list per group, so the lists stay aligned with the sorted keys.
for dist in sorted(editDistGroup):
    total_f1 = 0
    total_human_f1 = 0
    num_q = 0
    num_human = 0
    for qaId, _ in editDistGroup[dist]:
        total_f1 += 100.0 * evaluator.F1Single(qaId, predDict[qaId])
        num_q += 1
        # Only questions that received a human prediction above take part in
        # the human average.
        if qaId in human_predictions:
            total_human_f1 += 100.0 * human_evaluator.F1Single(
                qaId, human_predictions[qaId])
            num_human += 1

    # NOTE(review): exact match is never computed here; the rate is recorded
    # and printed as a constant 0 -- confirm whether that is intentional.
    exactMatchRate = 0

    # An empty group, or a group in which no question has a human prediction,
    # used to raise ZeroDivisionError.  NaN marks "no data" while keeping the
    # result lists the same length.
    F1 = total_f1 / num_q if num_q else float('nan')
    human_f1 = total_human_f1 / num_human if num_human else float('nan')

    exactMatchRateList.append(exactMatchRate)
    F1List.append(F1)
    HumanF1List.append(human_f1)

    # Single-argument print calls produce the same text on Python 2 and 3;
    # the double spaces reproduce the original output exactly.
    print(human_f1)
    print("edit dist  %s" % (dist,))
    print("number of sample  %s" % len(editDistGroup[dist]))
    print("exact match  %s" % exactMatchRate)
    print("F1  %s" % F1)