def evaluate(ckp_dir):
    """Print top-1 / top-5 accuracy statistics for a pickled checkpoint.

    Loads the "guessdev" fold, indexes its questions by ``qnum``, unpickles
    the checkpoint found at ``ckp_dir`` (a file path, despite the name) and
    prints five rates, one per line, each preceded by its description.

    Every rate is divided by the number of questions in the fold, not by
    the number of entries in the checkpoint.
    """
    dataset = QuizBowlDataset(guesser_train=True, buzzer_train=True)
    by_qnum = {q.qnum: q for q in dataset.questions_in_folds(["guessdev"])}

    # NOTE: pickle executes arbitrary code on load; only open trusted files.
    with open(ckp_dir, "rb") as handle:
        checkpoint = pickle.load(handle)

    def ranked(guesses):
        # Ascending stable sort by value, then reversed, so that ties come
        # out in the opposite of their insertion order.
        ordered = sorted(guesses.items(), key=lambda pair: pair[1])
        ordered.reverse()
        return [name for name, _ in ordered]

    labels = [
        "accuracy before",
        "accuracy after",
        "before after match",
        "top 5 accuracy before",
        "top 5 accuracy after",
    ]
    totals = [0] * len(labels)

    for qnum, entry in checkpoint.items():
        answer = by_qnum[qnum].page
        before = ranked(entry["guesses_before"])
        after = ranked(entry["guesses_after"])
        hits = (
            before[0] == answer,  # accuracy before
            after[0] == answer,  # accuracy after
            after[0] == before[0],  # top 1 match before / after
            answer in before[:5],  # top 5 accuracy before
            answer in after[:5],  # top 5 accuracy after
        )
        totals = [total + hit for total, hit in zip(totals, hits)]

    n_questions = len(by_qnum)
    rates = [total / n_questions for total in totals]
    for label, rate in zip(labels, rates):
        print(label, rate)
# Zero-argument `main`: builds QuizBowlDataset(1, guesser_train=True,
# buzzer_train=True), fetches the questions of the 'guessdev' fold, and prints
# guesser.guess_single(...) for the first question's text values joined by
# single spaces. `first_n` is assigned but never used in this snippet.
# NOTE(review): the trailing ''' looks like the closing delimiter of a
# triple-quoted string opened outside this view, i.e. this is presumably
# disabled code - confirm before relying on it.
def main(): fold = 'guessdev' db = QuizBowlDataset(1, guesser_train=True, buzzer_train=True) questions = db.questions_in_folds([fold]) first_n = lambda x: len(x) print(guesser.guess_single(' '.join(questions[0].text.values()))) '''
# Zero-argument `main`: builds QuizBowlDataset(1, guesser_train=True,
# buzzer_train=True), fetches the questions of the 'guessdev' fold, and prints
# guesser.guess_single(...) for the first question's text values joined by
# single spaces. `first_n` is assigned but never used in this snippet.
# NOTE(review): the trailing ''' looks like the closing delimiter of a
# triple-quoted string opened outside this view, i.e. this is presumably
# disabled code - confirm before relying on it.
def main(): fold = 'guessdev' db = QuizBowlDataset(1, guesser_train=True, buzzer_train=True) questions = db.questions_in_folds([fold]) first_n = lambda x: len(x) print(guesser.guess_single(' '.join(questions[0].text.values()))) '''
def compute_question_stats(question_db_path: str):
    """Pickle the (mean, std) of question word counts to SENTENCE_STATS.

    Questions come from the 'train' and 'dev' folds of a QuizBowlDataset
    built on ``question_db_path``. A question's length is the number of
    whitespace-separated tokens in its flattened text.
    """
    dataset = QuizBowlDataset(5, qb_question_db=question_db_path)

    word_counts = []
    for question in dataset.questions_in_folds(('train', 'dev')):
        word_counts.append(len(question.flatten_text().split()))

    stats = (np.mean(word_counts), np.std(word_counts))
    with safe_open(SENTENCE_STATS, 'wb') as out:
        pickle.dump(stats, out)
def main(questions, n_keep, ckp_dir):
    """Run greedy removal over the 'guessdev' fold, checkpoint, and evaluate.

    For each question the flattened text is guessed on, passed through
    ``greedy_remove`` together with ``n_keep``, and the before/after texts,
    guesses and removed items are recorded under the question's ``qnum``.
    The resulting dict is pickled to ``safe_path(ckp_dir)`` and then
    ``evaluate(ckp_dir)`` is called.

    The ``questions`` argument is ignored: the name is rebound to the
    'guessdev' questions before it is ever read.
    """
    dataset = QuizBowlDataset(guesser_train=True, buzzer_train=True)
    questions = {q.qnum: q for q in dataset.questions_in_folds(['guessdev'])}

    checkpoint = {}
    for qnum, question in questions.items():
        original_text = question.flatten_text()
        original_guesses = guesser.guess_single(original_text)
        reduced_text, reduced_guesses, removed = greedy_remove(
            original_text, original_guesses, n_keep)
        checkpoint[qnum] = {
            'text_before': original_text,
            'text_after': reduced_text,
            'guesses_before': original_guesses,
            'guesses_after': reduced_guesses,
            'removed': removed,
        }

    with open(safe_path(ckp_dir), 'wb') as out:
        pickle.dump(checkpoint, out)

    evaluate(ckp_dir)
def main(questions, n_keep, ckp_dir):
    """Build, save and evaluate a greedy-removal checkpoint for "guessdev".

    Each question in the fold contributes one record keyed by its ``qnum``:
    the flattened text, the guesses for it, and the text, guesses and
    removed items returned by ``greedy_remove`` for the given ``n_keep``.
    The records are pickled to ``safe_path(ckp_dir)``, after which
    ``evaluate(ckp_dir)`` is run.

    The incoming ``questions`` value is never read; the name is rebound to
    the fold's questions on entry.
    """
    dataset = QuizBowlDataset(guesser_train=True, buzzer_train=True)
    fold_questions = dataset.questions_in_folds(["guessdev"])
    questions = {q.qnum: q for q in fold_questions}

    def build_record(question):
        # One checkpoint entry; key order matches the stored layout.
        source_text = question.flatten_text()
        source_guesses = guesser.guess_single(source_text)
        kept_text, kept_guesses, dropped = greedy_remove(
            source_text, source_guesses, n_keep
        )
        return {
            "text_before": source_text,
            "text_after": kept_text,
            "guesses_before": source_guesses,
            "guesses_after": kept_guesses,
            "removed": dropped,
        }

    checkpoint = {}
    for qnum, question in questions.items():
        checkpoint[qnum] = build_record(question)

    with open(safe_path(ckp_dir), "wb") as sink:
        pickle.dump(checkpoint, sink)

    evaluate(ckp_dir)
def evaluate(ckp_dir):
    """Report before/after guess accuracy for the checkpoint at ``ckp_dir``.

    ``ckp_dir`` is opened directly as a pickle file. Questions of the
    'guessdev' fold are looked up by ``qnum`` to obtain the reference page,
    and five statistics are printed as ``description value`` lines. Each
    count is normalised by the size of the fold, not of the checkpoint.
    """
    dataset = QuizBowlDataset(guesser_train=True, buzzer_train=True)
    fold_questions = dataset.questions_in_folds(['guessdev'])
    questions = {q.qnum: q for q in fold_questions}

    # NOTE: pickle executes arbitrary code on load; only open trusted files.
    with open(ckp_dir, 'rb') as source:
        checkpoint = pickle.load(source)

    def best_first(guesses):
        # Reverse of an ascending stable sort by value (tie order is also
        # reversed, which a plain reverse=True sort would not do).
        ascending = sorted(guesses.items(), key=lambda item: item[1])
        return [guess for guess, _ in reversed(ascending)]

    top1_before = 0
    top1_after = 0
    top1_same = 0
    top5_before = 0
    top5_after = 0

    for qnum, record in checkpoint.items():
        page = questions[qnum].page
        before = best_first(record['guesses_before'])
        after = best_first(record['guesses_after'])
        top1_before += before[0] == page
        top1_after += after[0] == page
        top1_same += after[0] == before[0]
        top5_before += page in before[:5]
        top5_after += page in after[:5]

    denominator = len(questions)
    report = [
        ('accuracy before', top1_before / denominator),
        ('accuracy after', top1_after / denominator),
        ('before after match', top1_same / denominator),
        ('top 5 accuracy before', top5_before / denominator),
        ('top 5 accuracy after', top5_after / denominator),
    ]
    for description, score in report:
        print(description, score)