def output(self):
    """Return one ``LocalTarget`` per fold in ``c.BUZZER_INPUT_FOLDS``.

    Each target wraps the path that ``AbstractGuesser.guess_path`` yields for
    ``bc.GUESSES_DIR`` and the fold; the list follows the fold order.
    """
    targets = []
    for fold in c.BUZZER_INPUT_FOLDS:
        guess_file = AbstractGuesser.guess_path(bc.GUESSES_DIR, fold)
        targets.append(LocalTarget(guess_file))
    return targets
def stack(model_dir, model_name, fold=BUZZER_DEV_FOLD):
    """Compute how often the buzzer and the oracle agree at each position.

    Loads the char-level guesses of ``RnnGuesser`` (config 0) for ``fold`` and
    the ``{fold}_buzzes.pkl`` file stored in ``model_dir``. Every recorded
    char index of every question is labelled with one of ``'Only optimal'``,
    ``'Only buzzer'``, ``'Both'`` or ``'Neither'``, depending on whether the
    first listed guess equals ``question.page`` (oracle) and whether
    ``scores[i][1] > scores[i][0]`` (buzzer).

    The result has the columns ``Position`` (char index divided by the
    question length, rounded to one decimal), ``Buzzing``, ``Frequency``
    (share of that label among all entries at the position) and ``Model``.
    It is pickled to ``{fold}_stack.pkl`` inside ``model_dir`` and returned.
    """
    guesser_dir = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(guesser_dir, fold, 'char')
    with open(guesses_path, 'rb') as f:
        guesses_by_question = pickle.load(f).groupby('qanta_id')

    buzzes_path = os.path.join(model_dir, '{}_buzzes.pkl'.format(fold))
    with open(buzzes_path, 'rb') as f:
        buzzes = pickle.load(f)

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    positions = []
    labels = []
    position_totals = defaultdict(lambda: 0)
    for qid, (char_indices, scores) in buzzes.items():
        per_index = guesses_by_question.get_group(qid).groupby('char_index')
        first_guess = per_index.aggregate(lambda x: x.head(1)).to_dict()['guess']
        question = questions[qid]
        q_len = len(question.text)
        for i, char_index in enumerate(char_indices):
            score = scores[i]
            oracle_correct = first_guess[char_index] == question.page
            buzzer_fires = score[1] > score[0]
            # Exactly one of the four labels applies to each entry.
            if oracle_correct:
                label = 'Both' if buzzer_fires else 'Only optimal'
            else:
                label = 'Only buzzer' if buzzer_fires else 'Neither'
            rel_position = np.round(char_index / q_len, decimals=1)
            position_totals[rel_position] += 1
            positions.append(rel_position)
            labels.append(label)

    raw = pd.DataFrame({'Position': positions, 'Buzzing': labels})
    sizes = raw.groupby(['Position', 'Buzzing']).size()
    df = sizes.reset_index().rename(columns={0: 'Frequency'})
    # Normalise each label count by the number of entries at that position.
    df['Frequency'] = df.apply(
        lambda row: row['Frequency'] / position_totals[row['Position']],
        axis=1)
    df['Model'] = pd.Series([model_name] * len(df))

    stack_path = os.path.join(model_dir, '{}_stack.pkl'.format(fold))
    with open(stack_path, 'wb') as f:
        pickle.dump(df, f)
    return df
import pickle

from qanta.guesser.abstract import AbstractGuesser
from qanta.guesser.dan import DanGuesser
from qanta.util.constants import BUZZER_TRAIN_FOLD, BUZZER_DEV_FOLD

# Script: load the saved DanGuesser (config 0), generate guesses for the
# buzzer train and dev folds, and pickle each fold's DataFrame to the path
# given by AbstractGuesser.guess_path.
guesser_directory = AbstractGuesser.output_path(
    'qanta.guesser.dan', 'DanGuesser', 0, '')
guesser = DanGuesser.load(guesser_directory)  # type: AbstractGuesser
# Shrink the batch size to an eighth. Floor division keeps it an integer
# (`/=` would silently turn it into a float) and it never drops below 1.
guesser.batch_size = max(1, guesser.batch_size // 8)
word_skip = 2
folds = [BUZZER_TRAIN_FOLD, BUZZER_DEV_FOLD]
for fold in folds:
    df = guesser.generate_guesses(1, [fold], word_skip=word_skip)
    output_path = AbstractGuesser.guess_path(guesser_directory, fold)
    with open(output_path, 'wb') as f:
        pickle.dump(df, f)
def merge_dfs():
    """Merge the guesses of all enabled guessers into one file per fold.

    For every fold in ``c.BUZZER_INPUT_FOLDS`` the guesses of each enabled
    guesser are loaded from ``c.GUESSER_TARGET_PREFIX/<module>.<class>``,
    concatenated, and saved under ``c.GUESSER_TARGET_PREFIX/merged``. A fold
    whose merged file already exists is skipped.
    """
    guesser_names = [
        "{0}.{1}".format(x.guesser_module, x.guesser_class)
        for x in AbstractGuesser.list_enabled_guessers()
    ]

    log.info("Merging guesser DataFrames.")
    merged_dir = os.path.join(c.GUESSER_TARGET_PREFIX, 'merged')
    os.makedirs(merged_dir, exist_ok=True)

    for fold in c.BUZZER_INPUT_FOLDS:
        if os.path.exists(AbstractGuesser.guess_path(merged_dir, fold)):
            log.info("Merged {0} exists, skipping.".format(fold))
            continue

        # The empty frame guarantees the expected columns exist even when no
        # guesser is enabled. Frames are collected and concatenated once:
        # DataFrame.append was removed in pandas 2.0 and copied the
        # accumulated frame on every call.
        frames = [pd.DataFrame(columns=[
            'fold', 'guess', 'guesser', 'qnum',
            'score', 'sentence', 'token',
        ], dtype='object')]
        for guesser in guesser_names:
            guesser_dir = os.path.join(c.GUESSER_TARGET_PREFIX, guesser)
            frames.append(
                AbstractGuesser.load_guesses(guesser_dir, folds=[fold]))
        new_guesses = pd.concat(frames)

        # The object-typed seed frame leaves numeric columns as objects;
        # convert them back to numeric dtypes.
        for col in ['qnum', 'sentence', 'token', 'score']:
            new_guesses[col] = pd.to_numeric(
                new_guesses[col], downcast='integer')

        AbstractGuesser.save_guesses(new_guesses, merged_dir, folds=[fold])
        log.info("Merging: {0} finished.".format(fold))
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Run ``simulate_game`` on every question of ``fold`` and pickle the result.

    Uses the buzzes from ``get_buzzes(model, fold)``, the char-level guesses
    of ``RnnGuesser`` (config 0) grouped by ``qanta_id`` and the Protobowl
    records grouped by ``qid``. The two lists returned per question are
    concatenated into the ``Possibility`` and ``Outcome`` columns of a
    DataFrame, which is written to ``{fold}_protobowl.pkl`` in
    ``model.model_dir``. Nothing is returned.
    """
    buzzes = get_buzzes(model, fold)

    # eval
    guesser_dir = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(guesser_dir, fold, 'char')
    with open(guesses_path, 'rb') as f:
        guesses = pickle.load(f).groupby('qanta_id')

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    records = load_protobowl().groupby('qid')

    possibility = []
    outcome = []
    for question in tqdm(fold_questions):
        pos, out = simulate_game(guesses, buzzes, records, question)
        possibility.extend(pos)
        outcome.extend(out)

    result_df = pd.DataFrame({'Possibility': possibility, 'Outcome': outcome})
    result_path = os.path.join(model.model_dir, '{}_protobowl.pkl'.format(fold))
    with open(result_path, 'wb') as f:
        pickle.dump(result_df, f)
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Run ``simulate_game`` on every question of ``fold`` and pickle the result.

    Uses the buzzes from ``get_buzzes(model, fold)``, the char-level guesses
    of ``RnnGuesser`` (config 0) grouped by ``qanta_id`` and the Protobowl
    records grouped by ``qid``. The two lists returned per question are
    concatenated into the ``Possibility`` and ``Outcome`` columns of a
    DataFrame, which is written to ``{fold}_protobowl.pkl`` in
    ``model.model_dir``. Nothing is returned.
    """
    buzzes = get_buzzes(model, fold)

    # eval
    guesser_dir = AbstractGuesser.output_path(
        "qanta.guesser.rnn", "RnnGuesser", 0, "")
    guesses_path = AbstractGuesser.guess_path(guesser_dir, fold, "char")
    with open(guesses_path, "rb") as f:
        guesses = pickle.load(f).groupby("qanta_id")

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    records = load_protobowl().groupby("qid")

    possibility = []
    outcome = []
    for question in tqdm(fold_questions):
        pos, out = simulate_game(guesses, buzzes, records, question)
        possibility.extend(pos)
        outcome.extend(out)

    result_df = pd.DataFrame({"Possibility": possibility, "Outcome": outcome})
    result_path = os.path.join(model.model_dir, "{}_protobowl.pkl".format(fold))
    with open(result_path, "wb") as f:
        pickle.dump(result_df, f)
def read_data(fold,
              output_type='char',
              guesser_module='qanta.guesser.dan',
              guesser_class='DanGuesser',
              guesser_config_num=0,
              vector_converter=vector_converter_0):
    """Load the buzzer dataset for ``fold``, building and caching it if needed.

    If the cache file ``dataset_dir.format(fold)`` exists, its unpickled
    content is returned. Otherwise the guesser's pickled guesses are loaded,
    grouped by ``qanta_id``, and each group is passed to ``process_question``
    (together with the fold's questions and ``vector_converter``) in a pool
    of 8 worker processes. The resulting list is pickled to the cache file
    and returned.

    Args:
        fold: name of the fold to load.
        output_type: third argument of ``AbstractGuesser.guess_path``.
        guesser_module: module path of the guesser whose guesses are read.
        guesser_class: class name of that guesser.
        guesser_config_num: configuration number of that guesser.
        vector_converter: forwarded to ``process_question``.

    Returns:
        The dataset: one ``process_question`` result per question group.
    """
    cache_path = dataset_dir.format(fold)
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    g_dir = AbstractGuesser.output_path(
        guesser_module, guesser_class, guesser_config_num, '')
    g_path = AbstractGuesser.guess_path(g_dir, fold, output_type)
    with open(g_path, 'rb') as f:
        df = pickle.load(f)
    df_groups = df.groupby('qanta_id')

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions[fold]}

    worker = partial(process_question, questions, vector_converter)
    # The context manager shuts the worker processes down once map() returns.
    with Pool(8) as pool:
        dataset = pool.map(worker, df_groups)

    with open(cache_path, 'wb') as f:
        # pickle.dump returns None, so its result must not be returned;
        # otherwise the first (cache-miss) call would hand back None.
        pickle.dump(dataset, f)
    return dataset
def ew(model, fold=BUZZER_DEV_FOLD):
    """Score the buzzer on ``fold`` with ``CurveScore`` and report the totals.

    For each question in ``get_buzzes(model, fold)`` a list of steps is built,
    one per char index, holding the first listed ``RnnGuesser`` guess at that
    index and whether ``scores[0] < scores[1]`` (the buzz decision). Each
    list is scored with ``CurveScore.score`` and ``CurveScore.score_optimal``.

    Returns:
        dict with ``expected_wins``, ``n_examples`` and
        ``expected_wins_optimal``; the same dict is printed as JSON.
    """
    buzzes = get_buzzes(model, fold)

    guesser_dir = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(guesser_dir, fold, 'char')
    with open(guesses_path, 'rb') as f:
        guesses = pickle.load(f).groupby('qanta_id')

    answers = dict()
    for qid, bs in buzzes.items():
        steps = answers[qid] = []
        by_char = guesses.get_group(qid).groupby('char_index')
        for char_index, scores in zip(*bs):
            first_row = by_char.get_group(char_index).head(1)
            steps.append({
                'char_index': char_index,
                'guess': first_row['guess'].values[0],
                'buzz': scores[0] < scores[1],
            })

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    curve_score = CurveScore()
    wins = []
    wins_optimal = []
    for qid, answer in answers.items():
        question = questions[qid]
        q = {'text': question.text, 'page': question.page}
        wins.append(curve_score.score(answer, q))
        wins_optimal.append(curve_score.score_optimal(answer, q))

    eval_out = {
        'expected_wins': sum(wins),
        'n_examples': len(wins),
        'expected_wins_optimal': sum(wins_optimal),
    }
    print(json.dumps(eval_out))
    return eval_out
def ew(model, fold=BUZZER_DEV_FOLD):
    """Score the buzzer on ``fold`` with ``CurveScore`` and report the totals.

    For each question in ``get_buzzes(model, fold)`` a list of steps is built,
    one per char index, holding the first listed ``RnnGuesser`` guess at that
    index and whether ``scores[0] < scores[1]`` (the buzz decision). Each
    list is scored with ``CurveScore.score`` and ``CurveScore.score_optimal``.

    Returns:
        dict with ``expected_wins``, ``n_examples`` and
        ``expected_wins_optimal``; the same dict is printed as JSON.
    """
    buzzes = get_buzzes(model, fold)

    guesser_dir = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(guesser_dir, fold, 'char')
    with open(guesses_path, 'rb') as f:
        guesses = pickle.load(f).groupby('qanta_id')

    answers = dict()
    for qid, bs in buzzes.items():
        steps = answers[qid] = []
        by_char = guesses.get_group(qid).groupby('char_index')
        for char_index, scores in zip(*bs):
            first_row = by_char.get_group(char_index).head(1)
            steps.append({
                'char_index': char_index,
                'guess': first_row['guess'].values[0],
                'buzz': scores[0] < scores[1],
            })

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    curve_score = CurveScore()
    wins = []
    wins_optimal = []
    for qid, answer in answers.items():
        question = questions[qid]
        q = {'text': question.text, 'page': question.page}
        wins.append(curve_score.score(answer, q))
        wins_optimal.append(curve_score.score_optimal(answer, q))

    eval_out = {
        'expected_wins': sum(wins),
        'n_examples': len(wins),
        'expected_wins_optimal': sum(wins_optimal),
    }
    print(json.dumps(eval_out))
    return eval_out
def ew(model, fold=BUZZER_DEV_FOLD):
    """Score the buzzer on ``fold`` with ``CurveScore`` and report the totals.

    For each question in ``get_buzzes(model, fold)`` a list of steps is built,
    one per char index, holding the first listed ``RnnGuesser`` guess at that
    index and whether ``scores[0] < scores[1]`` (the buzz decision). Each
    list is scored with ``CurveScore.score`` and ``CurveScore.score_optimal``.

    Returns:
        dict with ``expected_wins``, ``n_examples`` and
        ``expected_wins_optimal``; the same dict is printed as JSON.
    """
    buzzes = get_buzzes(model, fold)

    guesser_dir = AbstractGuesser.output_path(
        "qanta.guesser.rnn", "RnnGuesser", 0, "")
    guesses_path = AbstractGuesser.guess_path(guesser_dir, fold, "char")
    with open(guesses_path, "rb") as f:
        guesses = pickle.load(f).groupby("qanta_id")

    answers = dict()
    for qid, bs in buzzes.items():
        steps = answers[qid] = []
        by_char = guesses.get_group(qid).groupby("char_index")
        for char_index, scores in zip(*bs):
            first_row = by_char.get_group(char_index).head(1)
            steps.append({
                "char_index": char_index,
                "guess": first_row["guess"].values[0],
                "buzz": scores[0] < scores[1],
            })

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    curve_score = CurveScore()
    wins = []
    wins_optimal = []
    for qid, answer in answers.items():
        question = questions[qid]
        q = {"text": question.text, "page": question.page}
        wins.append(curve_score.score(answer, q))
        wins_optimal.append(curve_score.score_optimal(answer, q))

    eval_out = {
        "expected_wins": sum(wins),
        "n_examples": len(wins),
        "expected_wins_optimal": sum(wins_optimal),
    }
    print(json.dumps(eval_out))
    return eval_out
def export(output_file: str, fold: str = "buzztest"):
    """Export per-question buzz positions of several buzzers as JSON.

    Loads the char-level guesses of ``RnnGuesser`` (config 0) for ``fold``
    and the ``{fold}_buzzes.pkl`` files of ``RNNBuzzer``, ``ThresholdBuzzer``
    and ``MLPBuzzer`` from ``output/buzzer/<name>``. For every question and
    buzzer it records the earliest char index at which the first listed guess
    equals ``question.page`` (oracle) and the earliest char index at which
    ``scores[i][1] > scores[i][0]`` (model buzz). ``-1`` marks "never".

    Args:
        output_file: destination passed to ``write_json``.
        fold: fold to export. The argument is now honoured; it used to be
            overwritten with ``"buzztest"`` unconditionally.
    """
    guesses_dir = AbstractGuesser.output_path(
        "qanta.guesser.rnn", "RnnGuesser", 0, "")
    guesses_dir = AbstractGuesser.guess_path(guesses_dir, fold, "char")
    with open(guesses_dir, "rb") as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby("qanta_id")

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions[fold]}

    buzzers = {}
    for name in ["RNNBuzzer", "ThresholdBuzzer", "MLPBuzzer"]:
        model_dir = f"output/buzzer/{name}"
        buzzes_dir = os.path.join(model_dir, "{}_buzzes.pkl".format(fold))
        with open(buzzes_dir, "rb") as f:
            buzzers[name] = pickle.load(f)

    qid_to_buzzes = defaultdict(dict)
    for name, buzzes in track(buzzers.items()):
        for qid, (char_indices, scores) in buzzes.items():
            # Map char_index -> first listed guess at that index.
            gs = (
                guesses.get_group(qid)
                .groupby("char_index")
                .aggregate(lambda x: x.head(1))
                .to_dict()["guess"]
            )
            question = questions[qid]
            q_len = len(question.text)

            buzz_oracle_position = -1
            buzz_model_position = -1
            oracle_guess = None
            buzz_guess = None
            for i, char_index in enumerate(char_indices):
                # Oracle: earliest index whose guess is the answer page.
                if gs[char_index] == question.page:
                    if (buzz_oracle_position == -1
                            or char_index <= buzz_oracle_position):
                        oracle_guess = question.page
                        buzz_oracle_position = char_index
                # Model: earliest index where the buzz score wins.
                if scores[i][1] > scores[i][0]:
                    if (buzz_model_position == -1
                            or char_index < buzz_model_position):
                        buzz_guess = gs[char_index]
                        buzz_model_position = char_index

            qid_to_buzzes[qid][name] = {
                "oracle": buzz_oracle_position,
                "oracle_fraction": buzz_oracle_position / q_len
                if buzz_oracle_position != -1 else -1,
                "position": buzz_model_position,
                "position_fraction": buzz_model_position / q_len
                if buzz_model_position != -1 else -1,
                "q_len": q_len,
                "oracle_guess": oracle_guess,
                "buzz_guess": buzz_guess,
                "answer": question.page,
                # No guess ever matched the answer page.
                "impossible": oracle_guess is None,
            }
    write_json(output_file, qid_to_buzzes)
def output(self):
    """Return one ``LocalTarget`` per fold in ``c.BUZZER_INPUT_FOLDS``.

    Each target wraps the path that ``AbstractGuesser.guess_path`` yields for
    ``bc.GUESSES_DIR`` and the fold; the list follows the fold order.
    """
    targets = []
    for fold in c.BUZZER_INPUT_FOLDS:
        guess_file = AbstractGuesser.guess_path(bc.GUESSES_DIR, fold)
        targets.append(LocalTarget(guess_file))
    return targets
def stack(model_dir, model_name, fold=BUZZER_DEV_FOLD):
    """Compute how often the buzzer and the oracle agree at each position.

    Loads the char-level guesses of ``RnnGuesser`` (config 0) for ``fold`` and
    the ``{fold}_buzzes.pkl`` file stored in ``model_dir``. Every recorded
    char index of every question is labelled with one of ``'Only optimal'``,
    ``'Only buzzer'``, ``'Both'`` or ``'Neither'``, depending on whether the
    first listed guess equals ``question.page`` (oracle) and whether
    ``scores[i][1] > scores[i][0]`` (buzzer).

    The result has the columns ``Position`` (char index divided by the
    question length, rounded to one decimal), ``Buzzing``, ``Frequency``
    (share of that label among all entries at the position) and ``Model``.
    It is pickled to ``{fold}_stack.pkl`` inside ``model_dir`` and returned.
    """
    guesser_dir = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(guesser_dir, fold, 'char')
    with open(guesses_path, 'rb') as f:
        guesses_by_question = pickle.load(f).groupby('qanta_id')

    buzzes_path = os.path.join(model_dir, '{}_buzzes.pkl'.format(fold))
    with open(buzzes_path, 'rb') as f:
        buzzes = pickle.load(f)

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    positions = []
    labels = []
    position_totals = defaultdict(lambda: 0)
    for qid, (char_indices, scores) in buzzes.items():
        per_index = guesses_by_question.get_group(qid).groupby('char_index')
        first_guess = per_index.aggregate(lambda x: x.head(1)).to_dict()['guess']
        question = questions[qid]
        q_len = len(question.text)
        for i, char_index in enumerate(char_indices):
            score = scores[i]
            oracle_correct = first_guess[char_index] == question.page
            buzzer_fires = score[1] > score[0]
            # Exactly one of the four labels applies to each entry.
            if oracle_correct:
                label = 'Both' if buzzer_fires else 'Only optimal'
            else:
                label = 'Only buzzer' if buzzer_fires else 'Neither'
            rel_position = np.round(char_index / q_len, decimals=1)
            position_totals[rel_position] += 1
            positions.append(rel_position)
            labels.append(label)

    raw = pd.DataFrame({'Position': positions, 'Buzzing': labels})
    sizes = raw.groupby(['Position', 'Buzzing']).size()
    df = sizes.reset_index().rename(columns={0: 'Frequency'})
    # Normalise each label count by the number of entries at that position.
    df['Frequency'] = df.apply(
        lambda row: row['Frequency'] / position_totals[row['Position']],
        axis=1)
    df['Model'] = pd.Series([model_name] * len(df))

    stack_path = os.path.join(model_dir, '{}_stack.pkl'.format(fold))
    with open(stack_path, 'wb') as f:
        pickle.dump(df, f)
    return df
def stack(model_dir, model_name, fold=BUZZER_DEV_FOLD):
    """Compute how often the buzzer and the oracle agree at each position.

    Loads the char-level guesses of ``RnnGuesser`` (config 0) for ``fold`` and
    the ``{fold}_buzzes.pkl`` file stored in ``model_dir``. Every recorded
    char index of every question is labelled with one of ``"Only optimal"``,
    ``"Only buzzer"``, ``"Both"`` or ``"Neither"``, depending on whether the
    first listed guess equals ``question.page`` (oracle) and whether
    ``scores[i][1] > scores[i][0]`` (buzzer).

    The result has the columns ``Position`` (char index divided by the
    question length, rounded to one decimal), ``Buzzing``, ``Frequency``
    (share of that label among all entries at the position) and ``Model``.
    It is pickled to ``{fold}_stack.pkl`` inside ``model_dir`` and returned.
    """
    guesser_dir = AbstractGuesser.output_path(
        "qanta.guesser.rnn", "RnnGuesser", 0, "")
    guesses_path = AbstractGuesser.guess_path(guesser_dir, fold, "char")
    with open(guesses_path, "rb") as f:
        guesses_by_question = pickle.load(f).groupby("qanta_id")

    buzzes_path = os.path.join(model_dir, "{}_buzzes.pkl".format(fold))
    with open(buzzes_path, "rb") as f:
        buzzes = pickle.load(f)

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    positions = []
    labels = []
    position_totals = defaultdict(lambda: 0)
    for qid, (char_indices, scores) in buzzes.items():
        per_index = guesses_by_question.get_group(qid).groupby("char_index")
        first_guess = per_index.aggregate(lambda x: x.head(1)).to_dict()["guess"]
        question = questions[qid]
        q_len = len(question.text)
        for i, char_index in enumerate(char_indices):
            score = scores[i]
            oracle_correct = first_guess[char_index] == question.page
            buzzer_fires = score[1] > score[0]
            # Exactly one of the four labels applies to each entry.
            if oracle_correct:
                label = "Both" if buzzer_fires else "Only optimal"
            else:
                label = "Only buzzer" if buzzer_fires else "Neither"
            rel_position = np.round(char_index / q_len, decimals=1)
            position_totals[rel_position] += 1
            positions.append(rel_position)
            labels.append(label)

    raw = pd.DataFrame({"Position": positions, "Buzzing": labels})
    sizes = raw.groupby(["Position", "Buzzing"]).size()
    df = sizes.reset_index().rename(columns={0: "Frequency"})
    # Normalise each label count by the number of entries at that position.
    df["Frequency"] = df.apply(
        lambda row: row["Frequency"] / position_totals[row["Position"]],
        axis=1)
    df["Model"] = pd.Series([model_name] * len(df))

    stack_path = os.path.join(model_dir, "{}_stack.pkl".format(fold))
    with open(stack_path, "wb") as f:
        pickle.dump(df, f)
    return df
import pickle

from qanta.guesser.abstract import AbstractGuesser
from qanta.guesser.dan import DanGuesser
from qanta.util.constants import BUZZER_TRAIN_FOLD, BUZZER_DEV_FOLD

# Script: load the saved DanGuesser (config 0), generate guesses for the
# buzzer train and dev folds, and pickle each fold's DataFrame to the path
# given by AbstractGuesser.guess_path.
guesser_directory = AbstractGuesser.output_path(
    "qanta.guesser.dan", "DanGuesser", 0, ""
)
guesser = DanGuesser.load(guesser_directory)  # type: AbstractGuesser
# Shrink the batch size to an eighth. Floor division keeps it an integer
# (`/=` would silently turn it into a float) and it never drops below 1.
guesser.batch_size = max(1, guesser.batch_size // 8)
word_skip = 2
folds = [BUZZER_TRAIN_FOLD, BUZZER_DEV_FOLD]
for fold in folds:
    df = guesser.generate_guesses(1, [fold], word_skip=word_skip)
    output_path = AbstractGuesser.guess_path(guesser_directory, fold)
    with open(output_path, "wb") as f:
        pickle.dump(df, f)
def read_data(
        fold,
        output_type='char',
        guesser_module='qanta.guesser.rnn',
        guesser_class='RnnGuesser',
        guesser_config_num=0,
        vector_converter=vector_converter_0):
    """Load the buzzer dataset for ``fold``, building and caching it if needed.

    If the cache file ``dataset_dir.format(fold)`` exists, its unpickled
    content is returned. Otherwise the guesser's pickled guesses are loaded,
    grouped by ``qanta_id``, and each group is passed to ``process_question``
    (together with the fold's questions and ``vector_converter``) in a pool
    of 8 worker processes. The resulting list is pickled to the cache file
    and returned.

    Args:
        fold: name of the fold to load.
        output_type: third argument of ``AbstractGuesser.guess_path``.
        guesser_module: module path of the guesser whose guesses are read.
        guesser_class: class name of that guesser.
        guesser_config_num: configuration number of that guesser.
        vector_converter: forwarded to ``process_question``.

    Returns:
        The dataset: one ``process_question`` result per question group.
    """
    cache_path = dataset_dir.format(fold)
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    g_dir = AbstractGuesser.output_path(
        guesser_module, guesser_class, guesser_config_num, '')
    g_path = AbstractGuesser.guess_path(g_dir, fold, output_type)
    with open(g_path, 'rb') as f:
        df = pickle.load(f)
    df_groups = df.groupby('qanta_id')

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions[fold]}

    worker = partial(process_question, questions, vector_converter)
    # The context manager shuts the worker processes down once map() returns;
    # previously the pool was never closed and its workers leaked.
    with Pool(8) as pool:
        dataset = pool.map(worker, df_groups)

    with open(cache_path, 'wb') as f:
        pickle.dump(dataset, f)
    return dataset
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Run ``simulate_game`` on every question of ``fold`` and pickle the result.

    Uses the buzzes from ``get_buzzes(model, fold)``, the char-level guesses
    of ``RnnGuesser`` (config 0) grouped by ``qanta_id`` and the Protobowl
    records grouped by ``qid``. The two lists returned per question are
    concatenated into the ``Possibility`` and ``Outcome`` columns of a
    DataFrame, which is written to ``{fold}_protobowl.pkl`` in
    ``model.model_dir``. Nothing is returned.
    """
    buzzes = get_buzzes(model, fold)

    # eval
    guesser_dir = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(guesser_dir, fold, 'char')
    with open(guesses_path, 'rb') as f:
        guesses = pickle.load(f).groupby('qanta_id')

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    records = load_protobowl().groupby('qid')

    possibility = []
    outcome = []
    for question in tqdm(fold_questions):
        pos, out = simulate_game(guesses, buzzes, records, question)
        possibility.extend(pos)
        outcome.extend(out)

    result_df = pd.DataFrame({'Possibility': possibility, 'Outcome': outcome})
    result_path = os.path.join(model.model_dir, '{}_protobowl.pkl'.format(fold))
    with open(result_path, 'wb') as f:
        pickle.dump(result_df, f)