def output(self):
    """Return one ``LocalTarget`` per fold listed in ``c.BUZZER_INPUT_FOLDS``.

    Each target wraps the path that ``AbstractGuesser.guess_path`` builds
    from ``bc.GUESSES_DIR`` and the fold.
    """
    targets = []
    for fold in c.BUZZER_INPUT_FOLDS:
        guess_file = AbstractGuesser.guess_path(bc.GUESSES_DIR, fold)
        targets.append(LocalTarget(guess_file))
    return targets
def stack(model_dir, model_name, fold=BUZZER_DEV_FOLD):
    """Tabulate buzzer/oracle agreement by relative position in the question.

    Loads the RnnGuesser 'char' guesses for ``fold`` and the pickled buzzes
    stored in ``model_dir``. For every buzzing position it records whether
    only the oracle (top guess equals the question page), only the buzzer
    (``scores[i][1] > scores[i][0]``), both, or neither would buzz.

    Args:
        model_dir: directory holding ``{fold}_buzzes.pkl``; the result is
            written there as ``{fold}_stack.pkl``.
        model_name: value stored in the ``Model`` column of the result.
        fold: fold to analyse.

    Returns:
        DataFrame with columns Position, Buzzing, Frequency and Model, where
        Frequency is the share of each label among all entries recorded at
        that rounded position.
    """
    guesser_dir = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(guesser_dir, fold, 'char')
    with open(guesses_path, 'rb') as f:
        guesses = pickle.load(f)
    guesses_by_question = guesses.groupby('qanta_id')

    buzzes_path = os.path.join(model_dir, '{}_buzzes.pkl'.format(fold))
    with open(buzzes_path, 'rb') as f:
        buzzes = pickle.load(f)

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    positions = []
    labels = []
    totals = defaultdict(int)  # number of entries seen per rounded position
    for qid, (char_indices, scores) in buzzes.items():
        per_index = guesses_by_question.get_group(qid).groupby('char_index')
        # First guess at each char_index, keyed by char_index.
        top_guess = per_index.aggregate(lambda x: x.head(1)).to_dict()['guess']
        question = questions[qid]
        q_len = len(question.text)
        for i, char_index in enumerate(char_indices):
            score = scores[i]
            buzz_oracle = top_guess[char_index] == question.page
            buzz_buzzer = score[1] > score[0]

            rel_position = np.round(char_index / q_len, decimals=1)
            totals[rel_position] += 1

            # Exactly one of the four labels applies to each position.
            if buzz_oracle and buzz_buzzer:
                label = 'Both'
            elif buzz_oracle:
                label = 'Only optimal'
            elif buzz_buzzer:
                label = 'Only buzzer'
            else:
                label = 'Neither'
            positions.append(rel_position)
            labels.append(label)

    grouped = pd.DataFrame(
        {'Position': positions, 'Buzzing': labels}
    ).groupby(['Position', 'Buzzing'])
    df = grouped.size().reset_index().rename(columns={0: 'Frequency'})
    # Turn raw counts into a fraction of all entries at that position.
    df['Frequency'] = df.apply(
        lambda row: row['Frequency'] / totals[row['Position']], axis=1)
    df['Model'] = pd.Series([model_name] * len(df))

    stack_path = os.path.join(model_dir, '{}_stack.pkl'.format(fold))
    with open(stack_path, 'wb') as f:
        pickle.dump(df, f)
    return df
import pickle

from qanta.guesser.abstract import AbstractGuesser
from qanta.guesser.dan import DanGuesser
from qanta.util.constants import BUZZER_TRAIN_FOLD, BUZZER_DEV_FOLD

# Script: load the saved DanGuesser (config 0) and pickle its guesses for the
# buzzer train and dev folds next to the guesser's own output directory.
guesser_directory = AbstractGuesser.output_path(
    'qanta.guesser.dan', 'DanGuesser', 0, '')
guesser = DanGuesser.load(guesser_directory)  # type: AbstractGuesser

# Use an eighth of the configured batch size. Floor division keeps the value
# an integer (plain `/=` produces a float in Python 3), and the clamp keeps a
# small configured batch from collapsing to zero.
# NOTE(review): assumes batch_size is consumed as an integer count -- confirm.
guesser.batch_size = max(1, guesser.batch_size // 8)

word_skip = 2
folds = [BUZZER_TRAIN_FOLD, BUZZER_DEV_FOLD]
for fold in folds:
    df = guesser.generate_guesses(1, [fold], word_skip=word_skip)
    output_path = AbstractGuesser.guess_path(guesser_directory, fold)
    with open(output_path, 'wb') as f:
        pickle.dump(df, f)
def output(self):
    """List the report files this task produces in the reporting directory.

    The first target is ``guesser_params.pickle``; it is followed by one
    ``guesser_report_<fold>.pickle`` per fold. The expo fold is included
    only when ``c.QANTA_EXPO_DATASET_PATH`` exists on disk.
    """
    folds = [c.GUESSER_DEV_FOLD, c.GUESSER_TEST_FOLD]
    if os.path.exists(c.QANTA_EXPO_DATASET_PATH):
        folds.append(c.EXPO_FOLD)

    report_files = ["guesser_params.pickle"]
    report_files.extend(f"guesser_report_{fold}.pickle" for fold in folds)

    return [
        LocalTarget(
            AbstractGuesser.reporting_path(
                self.guesser_module,
                self.guesser_class,
                self.config_num,
                name,
            ))
        for name in report_files
    ]
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Run ``simulate_game`` on every question of ``fold`` and pickle the result.

    Uses the buzzes from ``get_buzzes(model, fold)``, the RnnGuesser 'char'
    guesses grouped by ``qanta_id`` and the protobowl log grouped by ``qid``.
    The collected possibilities and outcomes are written as a two-column
    DataFrame to ``<model.model_dir>/<fold>_protobowl.pkl``. Returns nothing.
    """
    buzzes = get_buzzes(model, fold)

    guesser_dir = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(guesser_dir, fold, 'char')
    with open(guesses_path, 'rb') as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby('qanta_id')

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    records = load_protobowl().groupby('qid')

    possibility, outcome = [], []
    for question in tqdm(fold_questions):
        pos, out = simulate_game(guesses, buzzes, records, question)
        possibility.extend(pos)
        outcome.extend(out)

    result_df = pd.DataFrame({
        'Possibility': possibility,
        'Outcome': outcome,
    })
    result_path = os.path.join(model.model_dir,
                               '{}_protobowl.pkl'.format(fold))
    with open(result_path, 'wb') as f:
        pickle.dump(result_df, f)
def read_data(fold,
              output_type='char',
              guesser_module='qanta.guesser.dan',
              guesser_class='DanGuesser',
              guesser_config_num=0,
              vector_converter=vector_converter_0):
    """Return the processed dataset for ``fold``, building and caching it if needed.

    If ``dataset_dir.format(fold)`` already exists, its pickled content is
    returned. Otherwise the guesser's pickled guesses are loaded, grouped by
    ``qanta_id``, mapped through ``process_question`` in a pool of 8 worker
    processes, written to the cache file and returned.

    Args:
        fold: fold name, used for the cache path, the guess path and to
            select the questions.
        output_type: guess variant passed to ``AbstractGuesser.guess_path``.
        guesser_module: module of the guesser whose guesses are read.
        guesser_class: class name of that guesser.
        guesser_config_num: configuration number of that guesser.
        vector_converter: callable handed to ``process_question``.

    Returns:
        The list produced by mapping ``process_question`` over the
        per-question guess groups (or the cached copy of it).
    """
    cache_path = dataset_dir.format(fold)
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    g_dir = AbstractGuesser.output_path(guesser_module, guesser_class,
                                        guesser_config_num, '')
    g_path = AbstractGuesser.guess_path(g_dir, fold, output_type)
    with open(g_path, 'rb') as f:
        df = pickle.load(f)
    df_groups = df.groupby('qanta_id')

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions[fold]}

    worker = partial(process_question, questions, vector_converter)
    # The context manager releases the worker processes once map() has
    # returned; previously the pool was never closed.
    with Pool(8) as pool:
        dataset = pool.map(worker, df_groups)

    with open(cache_path, 'wb') as f:
        pickle.dump(dataset, f)
    # Return the data itself: pickle.dump() returns None, so returning its
    # result made the first (cache-miss) call yield None.
    return dataset
def merge_dfs():
    """Merge the guesses of all enabled guessers into one DataFrame per fold.

    For every fold in ``c.BUZZER_INPUT_FOLDS`` the guesses of each enabled
    guesser are loaded from ``<GUESSER_TARGET_PREFIX>/<module>.<class>``,
    concatenated, and saved under ``<GUESSER_TARGET_PREFIX>/merged``. Folds
    whose merged file already exists are skipped. The columns qnum, sentence,
    token and score are converted with ``pd.to_numeric`` before saving.
    """
    guesser_names = [
        "{0}.{1}".format(x.guesser_module, x.guesser_class)
        for x in AbstractGuesser.list_enabled_guessers()
    ]

    log.info("Merging guesser DataFrames.")
    merged_dir = os.path.join(c.GUESSER_TARGET_PREFIX, 'merged')
    os.makedirs(merged_dir, exist_ok=True)

    for fold in c.BUZZER_INPUT_FOLDS:
        if os.path.exists(AbstractGuesser.guess_path(merged_dir, fold)):
            log.info("Merged {0} exists, skipping.".format(fold))
            continue

        # Start from an empty frame so the expected columns exist even when
        # no guesser is enabled, then concatenate everything in one call.
        # DataFrame.append in a loop re-copies the accumulated data every
        # iteration and was removed in pandas 2.0.
        frames = [
            pd.DataFrame(columns=[
                'fold', 'guess', 'guesser', 'qnum', 'score', 'sentence',
                'token'
            ], dtype='object')
        ]
        for guesser in guesser_names:
            guesser_dir = os.path.join(c.GUESSER_TARGET_PREFIX, guesser)
            frames.append(
                AbstractGuesser.load_guesses(guesser_dir, folds=[fold]))
        new_guesses = pd.concat(frames)

        for col in ['qnum', 'sentence', 'token', 'score']:
            new_guesses[col] = pd.to_numeric(new_guesses[col],
                                             downcast='integer')

        AbstractGuesser.save_guesses(new_guesses, merged_dir, folds=[fold])
        log.info("Merging: {0} finished.".format(fold))
def test():
    """Collect reader predictions and guesser guesses for bonus-pair examples.

    For every ``BonusPairsDataset`` example whose ``start`` is not -1, the
    top prediction of ``Predictor`` and the highest-scoring guess of the
    loaded ``ElasticSearchWikidataGuesser`` are stored together with the
    example's answer. The list of triples is pickled to ``results.pkl``.
    """
    gspec = AbstractGuesser.list_enabled_guessers()[0]
    guesser = ElasticSearchWikidataGuesser.load(
        AbstractGuesser.output_path(gspec.guesser_module,
                                    gspec.guesser_class, ''))

    torch.cuda.set_device(0)
    predictor = Predictor()
    predictor.cuda()

    dataset = BonusPairsDataset()
    examples = [x for x in dataset.examples if x['start'] != -1]

    guesses = []
    for example in tqdm(examples):
        document = example['content']
        question = example['query']
        answer = example['answer']

        predictions = predictor.predict(document, question, top_n=1)
        prediction = predictions[0][0]

        scored = guesser.guess_single(example['query'])
        # Ascending sort by score; the last entry is the best-scoring guess.
        ranked = sorted(scored.items(), key=lambda x: x[1])
        guess = ranked[-1][0].replace('_', ' ')

        guesses.append((prediction, guess, answer))

    with open('results.pkl', 'wb') as f:
        pickle.dump(guesses, f)
def scores(self):
    """Return the guess-score map, loading it on the first call.

    While ``self.initialized`` is false, all guesses are loaded from
    ``self.directory_prefix`` and converted with
    ``AbstractGuesser.load_guess_score_map``; the result is kept in
    ``self.map`` and returned directly on later calls.
    """
    if self.initialized:
        return self.map

    guess_df = AbstractGuesser.load_all_guesses(
        directory_prefix=self.directory_prefix)
    self.map = AbstractGuesser.load_guess_score_map(guess_df)
    self.initialized = True
    return self.map
def main():
    """Play a 10-round game between a human and the guesser/buzzer agent.

    The questions are those whose ``fold`` is ``'dev'``; the machine agent
    combines the first enabled guesser (loaded as an
    ``ElasticSearchWikidataGuesser``) with an ``RNNBuzzer``.
    """
    buzzer = RNNBuzzer()

    # Questions: keep only the dev fold.
    all_questions = list(QuestionDatabase().all_questions().values())
    dev_questions = [q for q in all_questions if q.fold == 'dev']

    # Machine agent.
    gspec = AbstractGuesser.list_enabled_guessers()[0]
    guesser_dir = AbstractGuesser.output_path(gspec.guesser_module,
                                              gspec.guesser_class, '')
    guesser = ESGuesserWrapper(ElasticSearchWikidataGuesser.load(guesser_dir))
    machine_agent = GuesserBuzzerAgent(guesser, buzzer)

    # Human agent.
    human_agent = HumanAgent()

    # Hooks, in the order they are registered with the game.
    hooks = [
        hook.NotifyBuzzingHook,
        hook.GameInterfaceHook,
        hook.VisualizeGuesserBuzzerHook(machine_agent),
        hook.HighlightHook,
    ]

    game = Game(dev_questions, [human_agent, machine_agent], hooks)
    game.run(10)
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Run ``simulate_game`` on every question of ``fold`` and pickle the result.

    Uses the buzzes from ``get_buzzes(model, fold)``, the RnnGuesser "char"
    guesses grouped by ``qanta_id`` and the protobowl log grouped by ``qid``.
    The collected possibilities and outcomes are written as a two-column
    DataFrame to ``<model.model_dir>/<fold>_protobowl.pkl``. Returns nothing.
    """
    buzzes = get_buzzes(model, fold)

    guesser_dir = AbstractGuesser.output_path(
        "qanta.guesser.rnn", "RnnGuesser", 0, "")
    guesses_path = AbstractGuesser.guess_path(guesser_dir, fold, "char")
    with open(guesses_path, "rb") as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby("qanta_id")

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    records = load_protobowl().groupby("qid")

    possibility, outcome = [], []
    for question in tqdm(fold_questions):
        pos, out = simulate_game(guesses, buzzes, records, question)
        possibility.extend(pos)
        outcome.extend(out)

    result_df = pd.DataFrame({
        "Possibility": possibility,
        "Outcome": outcome,
    })
    result_path = os.path.join(model.model_dir,
                               "{}_protobowl.pkl".format(fold))
    with open(result_path, "wb") as f:
        pickle.dump(result_df, f)
def guesser_api(host, port, debug, guessers):
    """Start the multi-guesser web API, asking for confirmation in debug mode.

    Args:
        host: host passed to ``AbstractGuesser.multi_guesser_web_api``.
        port: port passed to ``AbstractGuesser.multi_guesser_web_api``.
        debug: when truthy, the user must type ``yes`` on stdin before the
            server is started in debug mode.
        guessers: guessers passed to ``AbstractGuesser.multi_guesser_web_api``.

    Raises:
        ValueError: if debug mode is requested but not confirmed with ``yes``.
    """
    if debug:
        log.warning(
            'WARNING: debug mode can expose environment variables (AWS keys), NEVER use when API is exposed to web')
        log.warning('Confirm that you would like to enable flask debugging')
        confirmation = input('yes/no:\n').strip()
        if confirmation != 'yes':
            # Message previously read "Most confirm ..." (typo).
            raise ValueError('Must confirm enabling debug mode')

    AbstractGuesser.multi_guesser_web_api(guessers,
                                          host=host,
                                          port=port,
                                          debug=debug)
def output(self):
    """Return targets for the guesser report in PDF and pickle form.

    Both paths come from ``AbstractGuesser.output_path`` for this task's
    guesser module and class.
    """
    report_files = ('guesser_report.pdf', 'guesser_report.pickle')
    targets = []
    for name in report_files:
        report_path = AbstractGuesser.output_path(
            self.guesser_module, self.guesser_class, name)
        targets.append(LocalTarget(report_path))
    return targets
def run(self):
    """Copy results to the reporting directory, build reports, delete the model.

    Steps, in order:
      1. ``cp`` the parameter pickle and every char/full/first guess file of
         each fold into the reporting directory.
      2. Instantiate the guesser for ``self.config_num`` and call
         ``create_report`` once per fold.
      3. ``rm -rf`` the guesser's output directory, then ``rm -f`` each
         guess file.

    The expo fold is included only when ``c.QANTA_EXPO_DATASET_PATH`` exists.
    """
    guesser_class = get_class(self.guesser_module, self.guesser_class)
    spec = (self.guesser_module, self.guesser_class, self.config_num)

    reporting_directory = AbstractGuesser.reporting_path(*spec, "")

    # In the cases of huge parameter sweeps on SLURM its easy to accidentally
    # run out of /fs/ storage. Since we only care about the results we can
    # get them, then delete the models. We can use the regular GuesserReport
    # to preserve the model
    guesser_directory = AbstractGuesser.output_path(*spec, "")
    param_path = AbstractGuesser.output_path(*spec, "guesser_params.pickle")

    folds = [c.GUESSER_DEV_FOLD, c.GUESSER_TEST_FOLD]
    if os.path.exists(c.QANTA_EXPO_DATASET_PATH):
        folds.append(c.EXPO_FOLD)

    # Per fold, in this order: char, full, first.
    guesses_paths = [
        AbstractGuesser.output_path(*spec, f"guesses_{kind}_{fold}.pickle")
        for fold in folds
        for kind in ("char", "full", "first")
    ]

    for src in [param_path, *guesses_paths]:
        log.info(f'Running: "cp {src} {reporting_directory}"')
        shell(f"cp {src} {reporting_directory}")

    guesser_instance = guesser_class(self.config_num)
    for fold in folds:
        guesser_instance.create_report(reporting_directory, fold)

    log.info(f'Running: "rm -rf {guesser_directory}"')
    shell(f"rm -rf {guesser_directory}")
    for g_path in guesses_paths:
        shell(f"rm -f {g_path}")
def guesser_api(host, port, debug, guessers):
    """Start the multi-guesser web API, asking for confirmation in debug mode.

    Args:
        host: host passed to ``AbstractGuesser.multi_guesser_web_api``.
        port: port passed to ``AbstractGuesser.multi_guesser_web_api``.
        debug: when truthy, the user must type ``yes`` on stdin before the
            server is started in debug mode.
        guessers: guessers passed to ``AbstractGuesser.multi_guesser_web_api``.

    Raises:
        ValueError: if debug mode is requested but not confirmed with ``yes``.
    """
    if debug:
        log.warning(
            'WARNING: debug mode can expose environment variables (AWS keys), NEVER use when API is exposed to web'
        )
        log.warning('Confirm that you would like to enable flask debugging')
        confirmation = input('yes/no:\n').strip()
        if confirmation != 'yes':
            # Message previously read "Most confirm ..." (typo).
            raise ValueError('Must confirm enabling debug mode')

    AbstractGuesser.multi_guesser_web_api(guessers,
                                          host=host,
                                          port=port,
                                          debug=debug)
def output(self):
    """Return one ``best.touch`` marker target per enabled guesser.

    The marker lives at ``output/guesser/best/<module>.<class>/best.touch``.
    """
    return [
        LocalTarget(
            f"output/guesser/best/{g_spec.guesser_module}.{g_spec.guesser_class}/best.touch"
        )
        for g_spec in AbstractGuesser.list_enabled_guessers()
    ]
def requires(self):
    """Yield a ``TrainGuesser`` task for every enabled guesser."""
    for spec in AbstractGuesser.list_enabled_guessers():
        task_args = {
            'guesser_module': spec.guesser_module,
            'guesser_class': spec.guesser_class,
            'dependency_module': spec.dependency_module,
            'dependency_class': spec.dependency_class,
        }
        yield TrainGuesser(**task_args)
def output(self):
    """Return the targets written by training this guesser.

    These are the guesser's output directory, its ``guesser_params.pickle``
    and every path reported by the guesser class's ``files`` method for that
    directory, in that order.
    """
    guesser_class = get_class(self.guesser_module, self.guesser_class)

    model_files = guesser_class.files(
        AbstractGuesser.output_path(self.guesser_module,
                                    self.guesser_class, ''))
    targets_for_files = [LocalTarget(path) for path in model_files]

    fixed_targets = [
        LocalTarget(
            AbstractGuesser.output_path(self.guesser_module,
                                        self.guesser_class, name))
        for name in ('', 'guesser_params.pickle')
    ]
    return fixed_targets + targets_for_files
def run(self):
    """Train the guesser, save it, and pickle its parameters.

    The wall-clock time spent in ``train`` (including the call to
    ``training_data``) is added to the parameters as ``training_time``
    before they are written to ``guesser_params.pickle``.
    """
    guesser_class = get_class(self.guesser_module, self.guesser_class)
    guesser_instance = guesser_class()  # type: AbstractGuesser
    qb_dataset = guesser_instance.qb_dataset()

    started = time.time()
    guesser_instance.train(qb_dataset.training_data())
    training_time = time.time() - started

    model_dir = AbstractGuesser.output_path(self.guesser_module,
                                            self.guesser_class, '')
    guesser_instance.save(model_dir)

    params = guesser_instance.parameters()
    params['training_time'] = training_time

    params_path = AbstractGuesser.output_path(
        self.guesser_module, self.guesser_class, 'guesser_params.pickle')
    with open(params_path, 'wb') as f:
        pickle.dump(params, f)
def requires(self):
    """Yield all single-guesser reports, then expo guesses for each guesser."""
    yield AllSingleGuesserReports()
    for spec in AbstractGuesser.list_enabled_guessers():
        task_args = {
            'guesser_module': spec.guesser_module,
            'guesser_class': spec.guesser_class,
            'dependency_module': spec.dependency_module,
            'dependency_class': spec.dependency_class,
            'fold': 'expo',
        }
        yield GenerateGuesses(**task_args)
def output(self):
    """Return the target for this fold's ``guesser_report_<fold>.pickle``."""
    report_name = f"guesser_report_{self.fold}.pickle"
    report_path = AbstractGuesser.output_path(
        self.guesser_module,
        self.guesser_class,
        self.config_num,
        report_name,
    )
    return LocalTarget(report_path)
def run(self):
    """Generate and save char, full-question and first-sentence guesses.

    The character skip is read from ``conf``: ``guesser_char_skip`` for the
    guesser train/dev folds, ``expo_char_skip`` for the expo fold and
    ``buzzer_char_skip`` for any other fold. Each of the three passes is
    timed and logged, and its guesses are saved under the matching output
    type ("char", "full", "first").
    """
    guesser_class = get_class(self.guesser_module, self.guesser_class)
    guesser_directory = AbstractGuesser.output_path(
        self.guesser_module, self.guesser_class, self.config_num, "")
    guesser_instance = guesser_class.load(
        guesser_directory)  # type: AbstractGuesser

    if self.fold in {c.GUESSER_TRAIN_FOLD, c.GUESSER_DEV_FOLD}:
        skip_key = "guesser_char_skip"
    elif self.fold == c.EXPO_FOLD:
        skip_key = "expo_char_skip"
    else:
        skip_key = "buzzer_char_skip"
    char_skip = conf[skip_key]

    # (output type, generate_guesses keyword arguments, start-of-pass message)
    passes = [
        (
            "char",
            {"char_skip": char_skip},
            f"Generating and saving guesses for {self.fold} fold with char_skip={char_skip}...",
        ),
        (
            "full",
            {"full_question": True},
            f"Generating and saving guesses for {self.fold} fold with full question...",
        ),
        (
            "first",
            {"first_sentence": True},
            f"Generating and saving guesses for {self.fold} fold with first sentence",
        ),
    ]

    for output_type, options, announcement in passes:
        log.info(announcement)
        start_time = time.time()
        guess_df = guesser_instance.generate_guesses(
            self.n_guesses, [self.fold], **options)
        elapsed = time.time() - start_time
        log.info(
            f"Guessing on {self.fold} fold took {elapsed}s, saving guesses...")
        guesser_class.save_guesses(guess_df, guesser_directory, [self.fold],
                                   output_type)
        log.info("Done saving guesses")
def requires(self):
    """Yield a ``GenerateGuesses`` task per enabled guesser and generation fold."""
    for spec in AbstractGuesser.list_enabled_guessers():
        shared_args = {
            'guesser_module': spec.guesser_module,
            'guesser_class': spec.guesser_class,
            'dependency_module': spec.dependency_module,
            'dependency_class': spec.dependency_class,
        }
        for fold in c.GUESSER_GENERATION_FOLDS:
            yield GenerateGuesses(fold=fold, **shared_args)
def requires(self):
    """Yield a ``GuesserPerformance`` task for every enabled guesser."""
    for spec in AbstractGuesser.list_enabled_guessers():
        task_args = {
            'guesser_module': spec.guesser_module,
            'guesser_class': spec.guesser_class,
            'dependency_module': spec.dependency_module,
            'dependency_class': spec.dependency_class,
            'config_num': spec.config_num,
        }
        yield GuesserPerformance(**task_args)
def test_buzzer():
    """Print the buzzer's decision for growing prefixes of one question.

    Takes the question stored under the fifth key of the question database,
    splits its flattened text into words and, for each prefix length from 0
    up to (but not including) the full length, prints what the ``RNNBuzzer``
    returns for the guesser's guesses on that prefix.
    """
    questions = QuestionDatabase().all_questions()
    buzzer = RNNBuzzer(word_skip=conf['buzzer_word_skip'])

    # Machine agent: first enabled guesser, wrapped for the buzzer.
    gspec = AbstractGuesser.list_enabled_guessers()[0]
    guesser_dir = AbstractGuesser.output_path(gspec.guesser_module,
                                              gspec.guesser_class, '')
    guesser = ESGuesserWrapper(ElasticSearchWikidataGuesser.load(guesser_dir))

    key = list(questions.keys())[4]
    words = questions[key].flatten_text().split()

    for n_words, _ in enumerate(words):
        clue = ' '.join(words[:n_words])
        print(buzzer.buzz(guesser.guess(clue)))
def run(self):
    """Generate and save char, full-question and first-sentence guesses.

    The character skip is ``conf['guesser_char_skip']`` for the guesser
    train/dev folds and ``conf['buzzer_char_skip']`` for any other fold.
    Each of the three passes is timed and logged, and its guesses are saved
    under the matching output type ('char', 'full', 'first').
    """
    guesser_class = get_class(self.guesser_module, self.guesser_class)
    guesser_directory = AbstractGuesser.output_path(
        self.guesser_module, self.guesser_class, self.config_num, '')
    guesser_instance = guesser_class.load(
        guesser_directory)  # type: AbstractGuesser

    if self.fold in {c.GUESSER_TRAIN_FOLD, c.GUESSER_DEV_FOLD}:
        skip_key = 'guesser_char_skip'
    else:
        skip_key = 'buzzer_char_skip'
    char_skip = conf[skip_key]

    # (output type, generate_guesses keyword arguments, start-of-pass message)
    passes = [
        (
            'char',
            {'char_skip': char_skip},
            f'Generating and saving guesses for {self.fold} fold with char_skip={char_skip}...',
        ),
        (
            'full',
            {'full_question': True},
            f'Generating and saving guesses for {self.fold} fold with full question...',
        ),
        (
            'first',
            {'first_sentence': True},
            f'Generating and saving guesses for {self.fold} fold with first sentence',
        ),
    ]

    for output_type, options, announcement in passes:
        log.info(announcement)
        start_time = time.time()
        guess_df = guesser_instance.generate_guesses(
            self.n_guesses, [self.fold], **options)
        elapsed = time.time() - start_time
        log.info(
            f'Guessing on {self.fold} fold took {elapsed}s, saving guesses...')
        guesser_class.save_guesses(guess_df, guesser_directory, [self.fold],
                                   output_type)
        log.info('Done saving guesses')
def ew(model, fold=BUZZER_DEV_FOLD):
    """Score the buzzer's answers on ``fold`` with ``CurveScore``.

    For every question in the buzzes, the first RnnGuesser 'char' guess at
    each buzzing position is paired with the buzz decision
    (``scores[0] < scores[1]``). The per-question answer lists are scored
    with ``CurveScore.score`` and ``CurveScore.score_optimal``.

    Returns:
        dict with ``expected_wins`` (sum of scores), ``n_examples`` and
        ``expected_wins_optimal`` (sum of optimal scores). The same dict is
        printed as JSON.
    """
    buzzes = get_buzzes(model, fold)

    guesser_dir = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(guesser_dir, fold, 'char')
    with open(guesses_path, 'rb') as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby('qanta_id')

    answers = {}
    for qid, bs in buzzes.items():
        by_index = guesses.get_group(qid).groupby('char_index')
        answers[qid] = [
            {
                'char_index': char_index,
                'guess': by_index.get_group(char_index).head(1)['guess'].values[0],
                'buzz': scores[0] < scores[1],
            }
            for char_index, scores in zip(*bs)
        ]

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    curve_score = CurveScore()
    wins = []
    optimal_wins = []
    for qid, answer in answers.items():
        question = questions[qid]
        q = {'text': question.text, 'page': question.page}
        wins.append(curve_score.score(answer, q))
        optimal_wins.append(curve_score.score_optimal(answer, q))

    eval_out = {
        'expected_wins': sum(wins),
        'n_examples': len(wins),
        'expected_wins_optimal': sum(optimal_wins),
    }
    print(json.dumps(eval_out))
    return eval_out
def ew(model, fold=BUZZER_DEV_FOLD):
    """Score the buzzer's answers on ``fold`` with ``CurveScore``.

    For every question in the buzzes, the first RnnGuesser 'char' guess at
    each buzzing position is paired with the buzz decision
    (``scores[0] < scores[1]``). The per-question answer lists are scored
    with ``CurveScore.score`` and ``CurveScore.score_optimal``.

    Returns:
        dict with ``expected_wins`` (sum of scores), ``n_examples`` and
        ``expected_wins_optimal`` (sum of optimal scores). The same dict is
        printed as JSON.
    """
    buzzes = get_buzzes(model, fold)

    guesser_dir = AbstractGuesser.output_path('qanta.guesser.rnn',
                                              'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(guesser_dir, fold, 'char')
    with open(guesses_path, 'rb') as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby('qanta_id')

    answers = {}
    for qid, bs in buzzes.items():
        by_index = guesses.get_group(qid).groupby('char_index')
        answers[qid] = [
            {
                'char_index': char_index,
                'guess': by_index.get_group(char_index).head(1)['guess'].values[0],
                'buzz': scores[0] < scores[1],
            }
            for char_index, scores in zip(*bs)
        ]

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    curve_score = CurveScore()
    wins = []
    optimal_wins = []
    for qid, answer in answers.items():
        question = questions[qid]
        q = {'text': question.text, 'page': question.page}
        wins.append(curve_score.score(answer, q))
        optimal_wins.append(curve_score.score_optimal(answer, q))

    eval_out = {
        'expected_wins': sum(wins),
        'n_examples': len(wins),
        'expected_wins_optimal': sum(optimal_wins),
    }
    print(json.dumps(eval_out))
    return eval_out
def output(self):
    """Return targets for this fold's char, full and first guess pickles."""
    targets = []
    for kind in ('char', 'full', 'first'):
        guess_path = AbstractGuesser.output_path(
            self.guesser_module,
            self.guesser_class,
            self.config_num,
            f'guesses_{kind}_{self.fold}.pickle',
        )
        targets.append(LocalTarget(guess_path))
    return targets
def load_quizbowl(folds=c.BUZZER_INPUT_FOLDS) \
        -> Tuple[Dict[str, int], Dict[str, list]]:
    """Load the answer options and the processed guesses for each fold.

    The option list is read from ``bc.OPTIONS_DIR`` when that file exists;
    otherwise it is built from the second element of the dataset's training
    data and written there. For each fold, processed guesses are read from
    ``<bc.GUESSES_DIR>/<fold>_processed.pickle`` when present; otherwise the
    raw guesses are grouped by ``qnum``, run through ``_process_question``,
    stripped of ``None`` results and cached to that file.

    Returns:
        ``(option2id, guesses_by_fold)``: the option-to-index mapping and a
        dict from fold name to its list of processed guesses.
    """
    # merge_dfs()
    log.info('Loading data')
    question_db = QuestionDatabase()
    quizbowl_db = QuizBowlDataset(bc.MIN_ANSWERS,
                                  guesser_train=True,
                                  buzzer_train=True)
    all_questions = question_db.all_questions()

    if os.path.isfile(bc.OPTIONS_DIR):
        with open(safe_path(bc.OPTIONS_DIR), 'rb') as infile:
            id2option = pickle.load(infile)
    else:
        log.info('Loading the set of options')
        id2option = list(set(quizbowl_db.training_data()[1]))
        with open(safe_path(bc.OPTIONS_DIR), 'wb') as outfile:
            pickle.dump(id2option, outfile)

    option2id = {option: index for index, option in enumerate(id2option)}
    log.info('Number of options {0}'.format(len(id2option)))

    guesses_by_fold = {}
    for fold in folds:
        save_dir = os.path.join(bc.GUESSES_DIR, fold) + '_processed.pickle'

        if os.path.isfile(save_dir):
            with open(safe_path(save_dir), 'rb') as infile:
                guesses_by_fold[fold] = pickle.load(infile)
            log.info('Loading {0} guesses'.format(fold))
            continue

        log.info('Processing {0} guesses'.format(fold))
        guesses = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[fold])
        worker = partial(_process_question, option2id, all_questions)
        processed = _multiprocess(worker,
                                  guesses.groupby('qnum'),
                                  info='df data',
                                  multi=True)
        # Drop the entries for which the worker returned None.
        processed = [x for x in processed if x is not None]
        guesses_by_fold[fold] = processed
        print(len(processed))

        with open(safe_path(save_dir), 'wb') as outfile:
            pickle.dump(processed, outfile)
        log.info('Processed {0} guesses saved to {1}'.format(fold, save_dir))

    return option2id, guesses_by_fold
def run(self):
    """Mark and copy the best configuration of every guesser type.

    The best config per guesser comes from ``find_best_guessers`` applied to
    the third element returned by ``merge_reports``. For each one, a
    ``best.touch`` file is created in its output directory and the
    directory's contents are copied to ``output/guesser/best/<guesser>/``.
    """
    guesser_types = {
        f"{g_spec.guesser_module}.{g_spec.guesser_class}"
        for g_spec in AbstractGuesser.list_enabled_guessers()
    }
    _, _, all_dfs, _ = merge_reports(guesser_types)

    for g, config_num in find_best_guessers(all_dfs).items():
        inp = f"output/guesser/{g}/{config_num}"
        out = f"output/guesser/best/{g}/"
        commands = (
            f"touch {inp}/best.touch",
            f"mkdir -p {out}",
            f"cp -r {inp}/* {out}",
        )
        for command in commands:
            shell(command)
def run(self):
    """Train the guesser for ``self.config_num``, save it and its parameters.

    The wall-clock time spent in ``train`` (including the call to
    ``training_data``) is added to the parameters as ``training_time``
    before they are written to ``guesser_params.pickle``.
    """
    guesser_class = get_class(self.guesser_module, self.guesser_class)
    guesser_instance = guesser_class(
        self.config_num)  # type: AbstractGuesser
    qb_dataset = guesser_instance.qb_dataset()

    started = time.time()
    guesser_instance.train(qb_dataset.training_data())
    training_time = time.time() - started

    model_dir = AbstractGuesser.output_path(
        self.guesser_module, self.guesser_class, self.config_num, "")
    guesser_instance.save(model_dir)

    params = guesser_instance.parameters()
    params["training_time"] = training_time

    params_path = AbstractGuesser.output_path(
        self.guesser_module,
        self.guesser_class,
        self.config_num,
        "guesser_params.pickle",
    )
    with open(params_path, "wb") as f:
        pickle.dump(params, f)
def ew(model, fold=BUZZER_DEV_FOLD):
    """Score the buzzer's answers on ``fold`` with ``CurveScore``.

    For every question in the buzzes, the first RnnGuesser "char" guess at
    each buzzing position is paired with the buzz decision
    (``scores[0] < scores[1]``). The per-question answer lists are scored
    with ``CurveScore.score`` and ``CurveScore.score_optimal``.

    Returns:
        dict with ``expected_wins`` (sum of scores), ``n_examples`` and
        ``expected_wins_optimal`` (sum of optimal scores). The same dict is
        printed as JSON.
    """
    buzzes = get_buzzes(model, fold)

    guesser_dir = AbstractGuesser.output_path(
        "qanta.guesser.rnn", "RnnGuesser", 0, "")
    guesses_path = AbstractGuesser.guess_path(guesser_dir, fold, "char")
    with open(guesses_path, "rb") as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby("qanta_id")

    answers = {}
    for qid, bs in buzzes.items():
        by_index = guesses.get_group(qid).groupby("char_index")
        answers[qid] = [
            {
                "char_index": char_index,
                "guess": by_index.get_group(char_index).head(1)["guess"].values[0],
                "buzz": scores[0] < scores[1],
            }
            for char_index, scores in zip(*bs)
        ]

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    curve_score = CurveScore()
    wins = []
    optimal_wins = []
    for qid, answer in answers.items():
        question = questions[qid]
        q = {"text": question.text, "page": question.page}
        wins.append(curve_score.score(answer, q))
        optimal_wins.append(curve_score.score_optimal(answer, q))

    eval_out = {
        "expected_wins": sum(wins),
        "n_examples": len(wins),
        "expected_wins_optimal": sum(optimal_wins),
    }
    print(json.dumps(eval_out))
    return eval_out
def task_list():
    """Build one ``Task`` per (qnum, sentence, token) group of guesses.

    All guesses are loaded, reduced to the columns qnum, sentence, token,
    guess and fold, and de-duplicated on the first four. Each group is
    paired with the question looked up by its ``qnum``.
    """
    position_cols = ['qnum', 'sentence', 'token']

    guess_df = AbstractGuesser.load_all_guesses()
    question_map = QuestionDatabase().all_questions()

    guess_df = guess_df[position_cols + ['guess', 'fold']]
    guess_df = guess_df.drop_duplicates(position_cols + ['guess'])

    # The group name is the (qnum, sentence, token) tuple; qnum comes first.
    return [
        Task(question_map[name[0]], guesses)
        for name, guesses in guess_df.groupby(position_cols)
    ]
def __init__(self, buzzer_model_dir='data/neo_0.npz'):
    """Load the guesser from ``data/guesser`` and choose a buzzer.

    An ``RNNBuzzer`` (loaded from ``buzzer_model_dir``) is used when
    ``chainer.cuda.available`` is truthy; otherwise a ``StupidBuzzer``.
    """
    # The spec itself is not used below; the lookup is kept so that its
    # side effects (including failing when the list is empty) are unchanged.
    _first_spec = AbstractGuesser.list_enabled_guessers()[0]

    self.guesser = ElasticSearchWikidataGuesser.load('data/guesser')

    if not chainer.cuda.available:
        self.buzzer = StupidBuzzer()
    else:
        self.buzzer = RNNBuzzer(model_dir=buzzer_model_dir,
                                word_skip=conf['buzzer_word_skip'])

    self.ok_to_buzz = True
    self.answer = ''
    self.guesses = []
    self.evidence = {}
def main(folds, model_name):
    """Compute per-fold buzzer statistics and write the summary report.

    For every fold the merged guesses and the pickled buzzes of
    ``model_name`` are loaded, the top guesses per question are extracted,
    and ``get_his_stats`` and ``get_protobowl`` are run. Both receive the
    shared ``variables`` mapping (feature -> fold -> value), which is
    converted to plain dicts and passed to ``report`` at the end. Output
    goes to ``output/summary/new_performance/``.
    """
    all_questions = QuestionDatabase().all_questions()
    answers = {}
    question_texts = {}
    protobowl_ids = {}
    for qnum, question in all_questions.items():
        answers[qnum] = question.page
        question_texts[qnum] = question.text
        if question.protobowl != '':
            protobowl_ids[qnum] = question.protobowl

    protobowl_df = load_protobowl().groupby('qid')

    save_dir = 'output/summary/new_performance/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # feature -> fold -> value
    variables = defaultdict(lambda: defaultdict())

    for fold in folds:
        guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR,
                                                  folds=[fold])
        questions = guesses_df.groupby('qnum')

        buzzes_dir = bc.BUZZES_DIR.format(fold, model_name)
        with open(buzzes_dir, 'rb') as infile:
            buzzes = pickle.load(infile)
        log.info('Buzzes loaded from {}.'.format(buzzes_dir))

        # qnum -> n_guessers * length
        top_guess_pairs = _multiprocess(_get_top_guesses,
                                        questions,
                                        info='Top guesses',
                                        multi=True)
        top_guesses = dict(top_guess_pairs)

        inputs = [top_guesses, buzzes, answers, variables, fold, save_dir]

        # get_eop_stats(*inputs)
        get_his_stats(*inputs)
        # get_hyper_search(*inputs)

        p_inputs = [question_texts, protobowl_ids, protobowl_df, questions]
        p_inputs = p_inputs + inputs
        # NOTE(review): the list is passed as a single argument here, while
        # get_his_stats above is called with its inputs unpacked -- confirm
        # that get_protobowl expects one list.
        get_protobowl(p_inputs)

    variables = {key: dict(value) for key, value in variables.items()}
    report(variables, save_dir, folds)
def generate_guesser_slurm(slurm_config_file, task, output_dir):
    """Render one slurm script per enabled guesser plus a master script.

    Each guesser gets ``slurm-<i>.sh`` in ``output_dir``, rendered from
    ``guesser-luigi-template.sh`` with slurm settings resolved by
    ``get_slurm_config_value`` from the ``default`` section of the YAML
    config and, when present, the section named after the guesser class.
    The singleton script is copied next to them and ``slurm-master.sh`` is
    rendered with the list of all generated scripts.

    Args:
        slurm_config_file: path of the YAML file with the slurm settings.
        task: task name given to the template; ``'GuesserReport'``
            additionally passes ``GUESSER_GENERATION_FOLDS`` as ``folds``.
        output_dir: directory the scripts are written to.

    Raises:
        ValueError: if an enabled guesser's class is ``ElasticSearchGuesser``.
    """
    with open(slurm_config_file) as f:
        # safe_load only builds plain Python data, which is all this config
        # is indexed as below; yaml.load without an explicit Loader is
        # deprecated since PyYAML 5.1 and a TypeError in PyYAML 6.
        slurm_config = yaml.safe_load(f)
    default_slurm_config = slurm_config['default']

    env = Environment(loader=PackageLoader('qanta', 'slurm/templates'))
    template = env.get_template('guesser-luigi-template.sh')
    enabled_guessers = list(AbstractGuesser.list_enabled_guessers())

    for i, gs in enumerate(enabled_guessers):
        if gs.guesser_class == 'ElasticSearchGuesser':
            raise ValueError('ElasticSearchGuesser is not compatible with slurm')
        elif gs.guesser_class in slurm_config:
            guesser_slurm_config = slurm_config[gs.guesser_class]
        else:
            guesser_slurm_config = None

        partition = get_slurm_config_value('partition', default_slurm_config,
                                           guesser_slurm_config)
        qos = get_slurm_config_value('qos', default_slurm_config,
                                     guesser_slurm_config)
        mem_per_cpu = get_slurm_config_value('mem_per_cpu',
                                             default_slurm_config,
                                             guesser_slurm_config)
        gres = get_slurm_config_value('gres', default_slurm_config,
                                      guesser_slurm_config)
        max_time = get_slurm_config_value('max_time', default_slurm_config,
                                          guesser_slurm_config)
        cpus_per_task = get_slurm_config_value('cpus_per_task',
                                               default_slurm_config,
                                               guesser_slurm_config)
        account = get_slurm_config_value('account', default_slurm_config,
                                         guesser_slurm_config)

        if task == 'GuesserReport':
            folds = GUESSER_GENERATION_FOLDS
        else:
            folds = []

        script = template.render({
            'task': task,
            'guesser_module': gs.guesser_module,
            'guesser_class': gs.guesser_class,
            'dependency_module': gs.dependency_module,
            'dependency_class': gs.dependency_class,
            'config_num': gs.config_num,
            'partition': partition,
            'qos': qos,
            'mem_per_cpu': mem_per_cpu,
            'max_time': max_time,
            'gres': gres,
            'cpus_per_task': cpus_per_task,
            'account': account,
            'folds': folds
        })
        slurm_file = path.join(output_dir, f'slurm-{i}.sh')
        with safe_open(slurm_file, 'w') as f:
            f.write(script)

    singleton_path = 'qanta/slurm/templates/guesser-singleton.sh'
    singleton_output = path.join(output_dir, 'guesser-singleton.sh')
    shell(f'cp {singleton_path} {singleton_output}')

    master_template = env.get_template('guesser-master-template.sh')
    # The slurm settings used here are the ones left over from the last loop
    # iteration, i.e. those of the last enabled guesser.
    # NOTE(review): with no enabled guessers these names are unbound and the
    # render below raises NameError -- confirm whether that case can occur.
    master_script = master_template.render({
        'script_list': [
            path.join(output_dir, f'slurm-{i}.sh')
            for i in range(len(enabled_guessers))
        ] + [singleton_output],
        'gres': gres,
        'partition': partition,
        'qos': qos,
        'mem_per_cpu': mem_per_cpu,
        'max_time': max_time,
        'cpus_per_task': cpus_per_task,
        'account': account
    })
    with safe_open(path.join(output_dir, 'slurm-master.sh'), 'w') as f:
        f.write(master_script)
from tqdm import tqdm
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl import DocType, Text, Keyword, Search, Index
from qanta.util.constants import GUESSER_DEV_FOLD
from qanta.guesser.abstract import AbstractGuesser
from qanta.datasets.quiz_bowl import QuizBowlDataset
from qanta.guesser.experimental.elasticsearch_instance_of import ElasticSearchWikidataGuesser
from qanta.guesser.experimental.elasticsearch_instance_of import ElasticSearchIndex

INDEX_NAME = 'qb_ir_instance_of'

# Module-level setup: load the first enabled guesser as an
# ElasticSearchWikidataGuesser and create the index wrapper used below.
gspec = AbstractGuesser.list_enabled_guessers()[0]
guesser_dir = AbstractGuesser.output_path(gspec.guesser_module, gspec.guesser_class, '')
guesser = ElasticSearchWikidataGuesser.load(guesser_dir)
es_index = ElasticSearchIndex()


def recursive_guess(question, k=0):
    """Print first-round guesses, then re-query with the top ``k`` appended.

    NOTE(review): the visible code stops after the second-round search;
    ``new_guesses`` is never filled and nothing is returned, so the function
    may be cut off here. ``search_not`` is not defined in the visible code.

    Args:
        question: question text; each second-round iteration appends the
            words of one first-round guess to it.
        k: number of first-round guesses to fold into the question. With the
            default of 0 the second-round loop does not run.
    """
    # Predicted class and its probability for this single question.
    p_class, p_prob = guesser.test_instance_of([question])[0]
    first_guesses = search_not(question, p_class)
    print('First round')
    for x in first_guesses:
        print(x)
    print()
    print('Second round')
    new_guesses = []
    for i in range(k):
        guess = first_guesses[i][0]
        # Guess names use underscores; add them to the query as plain words.
        question += ' ' + ' '.join(guess.split('_'))
        guesses = es_index.search(question, p_class, p_prob, 0.6)
def read_data(
        fold,
        output_type='char',
        guesser_module='qanta.guesser.rnn',
        guesser_class='RnnGuesser',
        guesser_config_num=0,
        vector_converter=vector_converter_0):
    """Return the processed dataset for ``fold``, building and caching it if needed.

    If ``dataset_dir.format(fold)`` already exists, its pickled content is
    returned. Otherwise the guesser's pickled guesses are loaded, grouped by
    ``qanta_id``, mapped through ``process_question`` in a pool of 8 worker
    processes, written to the cache file and returned.

    Args:
        fold: fold name, used for the cache path, the guess path and to
            select the questions.
        output_type: guess variant passed to ``AbstractGuesser.guess_path``.
        guesser_module: module of the guesser whose guesses are read.
        guesser_class: class name of that guesser.
        guesser_config_num: configuration number of that guesser.
        vector_converter: callable handed to ``process_question``.

    Returns:
        The list produced by mapping ``process_question`` over the
        per-question guess groups (or the cached copy of it).
    """
    cache_path = dataset_dir.format(fold)
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    g_dir = AbstractGuesser.output_path(
        guesser_module, guesser_class, guesser_config_num, '')
    g_path = AbstractGuesser.guess_path(g_dir, fold, output_type)
    with open(g_path, 'rb') as f:
        df = pickle.load(f)
    df_groups = df.groupby('qanta_id')

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions[fold]}

    worker = partial(process_question, questions, vector_converter)
    # The context manager releases the worker processes once map() has
    # returned; previously the pool was never closed.
    with Pool(8) as pool:
        dataset = pool.map(worker, df_groups)

    with open(cache_path, 'wb') as f:
        pickle.dump(dataset, f)
    return dataset
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Run ``simulate_game`` on every question of ``fold`` and pickle the result.

    Uses the buzzes from ``get_buzzes(model, fold)``, the RnnGuesser 'char'
    guesses grouped by ``qanta_id`` and the protobowl log grouped by ``qid``.
    The collected possibilities and outcomes are written as a two-column
    DataFrame to ``<model.model_dir>/<fold>_protobowl.pkl``. Returns nothing.
    """
    buzzes = get_buzzes(model, fold)

    guesser_dir = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(guesser_dir, fold, 'char')
    with open(guesses_path, 'rb') as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby('qanta_id')

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    records = load_protobowl().groupby('qid')

    possibility, outcome = [], []
    for question in tqdm(fold_questions):
        pos, out = simulate_game(guesses, buzzes, records, question)
        possibility.extend(pos)
        outcome.extend(out)

    result_df = pd.DataFrame({
        'Possibility': possibility,
        'Outcome': outcome,
    })
    result_path = os.path.join(
        model.model_dir, '{}_protobowl.pkl'.format(fold))
    with open(result_path, 'wb') as f:
        pickle.dump(result_df, f)