Beispiel #1
1
def stack(model_dir, model_name, fold=BUZZER_DEV_FOLD):
    """Tabulate buzzer/oracle agreement by relative question position.

    For every buzzing position of every question, the top guess is compared
    with the question's page (the "oracle" buzz) and the buzzer's two scores
    are compared with each other (the model buzz). Each sample is labelled
    'Only optimal', 'Only buzzer', 'Both' or 'Neither' and bucketed by
    ``char_index / len(question.text)`` rounded to one decimal.

    Args:
        model_dir: Directory holding ``<fold>_buzzes.pkl``; the result is
            pickled there as ``<fold>_stack.pkl``.
        model_name: Value stored in the ``Model`` column of the result.
        fold: Fold whose guesses, buzzes and questions are used.

    Returns:
        DataFrame with columns ``Position``, ``Buzzing``, ``Frequency`` and
        ``Model``; ``Frequency`` is the label count divided by the number of
        samples that fell into the same position bucket.
    """
    guesser_root = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(guesser_root, fold, 'char')
    with open(guesses_path, 'rb') as guesses_file:
        guesses_by_question = pickle.load(guesses_file).groupby('qanta_id')

    buzzes_path = os.path.join(model_dir, '{}_buzzes.pkl'.format(fold))
    with open(buzzes_path, 'rb') as buzzes_file:
        buzzes = pickle.load(buzzes_file)

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    positions = []
    labels = []
    # Number of samples per position bucket; used to normalise the counts.
    count = defaultdict(int)
    for qid, (char_indices, scores) in buzzes.items():
        per_index = guesses_by_question.get_group(qid).groupby('char_index')
        # First row of each char_index group supplies the guess for that index.
        top_guess = per_index.aggregate(lambda x: x.head(1)).to_dict()['guess']
        question = questions[qid]
        q_len = len(question.text)
        for i, char_index in enumerate(char_indices):
            buzz_oracle = top_guess[char_index] == question.page
            buzz_buzzer = scores[i][1] > scores[i][0]

            rel_position = np.round(char_index / q_len, decimals=1)
            count[rel_position] += 1

            # Exactly one of the four labels applies to every sample.
            if buzz_oracle and buzz_buzzer:
                label = 'Both'
            elif buzz_oracle:
                label = 'Only optimal'
            elif buzz_buzzer:
                label = 'Only buzzer'
            else:
                label = 'Neither'
            positions.append(rel_position)
            labels.append(label)

    def normalise(row):
        """Turn a raw label count into a share of its position bucket."""
        return row['Frequency'] / count[row['Position']]

    df = pd.DataFrame({'Position': positions, 'Buzzing': labels})
    df = (df.groupby(['Position', 'Buzzing'])
            .size()
            .reset_index()
            .rename(columns={0: 'Frequency'}))
    df['Frequency'] = df.apply(normalise, axis=1)
    df['Model'] = pd.Series([model_name] * len(df))

    stack_path = os.path.join(model_dir, '{}_stack.pkl'.format(fold))
    with open(stack_path, 'wb') as stack_file:
        pickle.dump(df, stack_file)

    return df
Beispiel #2
1
import pickle
from qanta.guesser.abstract import AbstractGuesser
from qanta.guesser.dan import DanGuesser
from qanta.util.constants import BUZZER_TRAIN_FOLD, BUZZER_DEV_FOLD

# Load the DanGuesser stored under config number 0.
guesser_directory = AbstractGuesser.output_path(
    'qanta.guesser.dan', 'DanGuesser', 0, '')
guesser = DanGuesser.load(guesser_directory)  # type: AbstractGuesser
# Shrink the batch size to an eighth. Floor division keeps it an integer
# (`/=` would produce a float under Python 3), and the clamp stops a small
# batch size from collapsing to 0.
guesser.batch_size = max(1, guesser.batch_size // 8)

# Generate one guess per position for each buzzer fold and pickle the
# resulting frame to that fold's guess path.
word_skip = 2
folds = [BUZZER_TRAIN_FOLD, BUZZER_DEV_FOLD]
for fold in folds:
    df = guesser.generate_guesses(1, [fold], word_skip=word_skip)
    output_path = AbstractGuesser.guess_path(guesser_directory, fold)
    with open(output_path, 'wb') as f:
        pickle.dump(df, f)
Beispiel #3
0
 def output(self):
     """Declare the report files produced for this guesser.

     Returns:
         A list of two ``LocalTarget`` objects, one for
         ``guesser_report.pdf`` and one for ``guesser_report.pickle``,
         both resolved through ``AbstractGuesser.output_path``.
     """
     report_files = ('guesser_report.pdf', 'guesser_report.pickle')
     return [
         LocalTarget(
             AbstractGuesser.output_path(
                 self.guesser_module, self.guesser_class, report_file))
         for report_file in report_files
     ]
Beispiel #4
0
    def run(self):
        """Copy a guesser's results to the reporting directory, then delete the model.

        In huge parameter sweeps on SLURM it is easy to accidentally run out
        of /fs/ storage. Only the results matter there, so the parameter file
        and the guess files are copied out, the reports are created in the
        reporting directory, and the guesser directory is removed afterwards.
        The regular GuesserReport can be used to preserve the model.
        """
        guesser_class = get_class(self.guesser_module, self.guesser_class)
        reporting_directory = AbstractGuesser.reporting_path(
            self.guesser_module, self.guesser_class, self.config_num, "")

        def model_path(name):
            """Resolve ``name`` via ``output_path`` for this guesser/config."""
            return AbstractGuesser.output_path(
                self.guesser_module, self.guesser_class, self.config_num, name)

        guesser_directory = model_path("")
        param_path = model_path("guesser_params.pickle")

        # The expo fold is only included when its dataset file exists.
        folds = [c.GUESSER_DEV_FOLD, c.GUESSER_TEST_FOLD]
        if os.path.exists(c.QANTA_EXPO_DATASET_PATH):
            folds.append(c.EXPO_FOLD)

        guesses_paths = [
            model_path(f"guesses_{kind}_{fold}.pickle")
            for fold in folds
            for kind in ("char", "full", "first")
        ]

        # Copy the parameters first, then every guess file.
        for source in [param_path] + guesses_paths:
            copy_command = f"cp {source} {reporting_directory}"
            log.info(f'Running: "{copy_command}"')
            shell(copy_command)

        guesser_instance = guesser_class(self.config_num)
        for fold in folds:
            guesser_instance.create_report(reporting_directory, fold)

        cleanup_command = f"rm -rf {guesser_directory}"
        log.info(f'Running: "{cleanup_command}"')
        shell(cleanup_command)
        for guesses_path in guesses_paths:
            shell(f"rm -f {guesses_path}")
Beispiel #5
0
def read_data(fold,
              output_type='char',
              guesser_module='qanta.guesser.dan',
              guesser_class='DanGuesser',
              guesser_config_num=0,
              vector_converter=vector_converter_0):
    """Load the processed dataset for ``fold``, building and caching it if needed.

    If the cache file ``dataset_dir.format(fold)`` exists it is unpickled and
    returned. Otherwise the guesser's pickled guesses are loaded, grouped by
    ``qanta_id``, passed through ``process_question`` in a pool of 8 worker
    processes, written to the cache file and returned.

    Args:
        fold: Fold name; selects the guess file, the questions and the cache
            file.
        output_type: Guess type passed to ``AbstractGuesser.guess_path``.
        guesser_module: Module name of the guesser whose guesses are read.
        guesser_class: Class name of the guesser whose guesses are read.
        guesser_config_num: Config number of the guesser.
        vector_converter: Converter handed to ``process_question``.

    Returns:
        The dataset: either the cached object or the list produced by
        ``pool.map``. (Previously a cache miss returned ``None`` because the
        function returned the result of ``pickle.dump``.)
    """
    cache_path = dataset_dir.format(fold)
    if os.path.isfile(cache_path):
        # NOTE: unpickling is only safe for cache files this project wrote.
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    g_dir = AbstractGuesser.output_path(guesser_module, guesser_class,
                                        guesser_config_num, '')
    g_path = AbstractGuesser.guess_path(g_dir, fold, output_type)
    with open(g_path, 'rb') as f:
        df = pickle.load(f)
    df_groups = df.groupby('qanta_id')

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions[fold]}

    worker = partial(process_question, questions, vector_converter)
    # The context manager releases the worker processes once map() is done.
    with Pool(8) as pool:
        dataset = pool.map(worker, df_groups)

    with open(cache_path, 'wb') as f:
        pickle.dump(dataset, f)

    return dataset
Beispiel #6
0
def main():
    """Run a 10-round game between a human and a guesser+buzzer machine agent.

    The questions are those whose ``fold`` is ``'dev'``. The machine agent
    wraps the first enabled guesser (loaded as an
    ``ElasticSearchWikidataGuesser``) together with an ``RNNBuzzer``.
    """
    buzzer = RNNBuzzer()

    # Questions: keep only the dev fold.
    dev_questions = [
        question
        for question in QuestionDatabase().all_questions().values()
        if question.fold == 'dev'
    ]

    # Machine agent: first enabled guesser, wrapped for the game interface.
    gspec = AbstractGuesser.list_enabled_guessers()[0]
    guesser_dir = AbstractGuesser.output_path(
        gspec.guesser_module, gspec.guesser_class, '')
    wrapped_guesser = ESGuesserWrapper(
        ElasticSearchWikidataGuesser.load(guesser_dir))
    machine_agent = GuesserBuzzerAgent(wrapped_guesser, buzzer)

    # Human agent.
    human_agent = HumanAgent()

    # Hooks: three are passed as classes, one as an instance bound to the
    # machine agent.
    hooks = [
        hook.NotifyBuzzingHook,
        hook.GameInterfaceHook,
        hook.VisualizeGuesserBuzzerHook(machine_agent),
        hook.HighlightHook,
    ]

    Game(dev_questions, [human_agent, machine_agent], hooks).run(10)
Beispiel #7
0
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Simulate games against protobowl records and pickle the outcomes.

    Every question of ``fold`` is passed to ``simulate_game`` together with
    the RNN guesser's char-level guesses, the model's buzzes and the
    protobowl records grouped by ``qid``. The returned possibilities and
    outcomes are concatenated and written to
    ``<model.model_dir>/<fold>_protobowl.pkl`` as a two-column DataFrame.

    Args:
        model: Buzzer model; passed to ``get_buzzes`` and providing
            ``model_dir``.
        fold: Fold to evaluate.
    """
    buzzes = get_buzzes(model, fold)

    # eval
    guesser_root = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(guesser_root, fold, 'char')
    with open(guesses_path, 'rb') as guesses_file:
        guesses = pickle.load(guesses_file).groupby('qanta_id')

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]

    records = load_protobowl().groupby('qid')

    possibility = []
    outcome = []
    for question in tqdm(questions):
        pos, out = simulate_game(guesses, buzzes, records, question)
        possibility.extend(pos)
        outcome.extend(out)

    result_df = pd.DataFrame({
        'Possibility': possibility,
        'Outcome': outcome,
    })

    result_path = os.path.join(
        model.model_dir, '{}_protobowl.pkl'.format(fold))
    with open(result_path, 'wb') as result_file:
        pickle.dump(result_df, result_file)
Beispiel #8
0
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Simulate games against protobowl records and pickle the outcomes.

    Loads the RNN guesser's char-level guesses for ``fold``, runs
    ``simulate_game`` on every question of that fold, and stores the
    collected possibilities and outcomes as a DataFrame in
    ``<model.model_dir>/<fold>_protobowl.pkl``.

    Args:
        model: Buzzer model; passed to ``get_buzzes`` and providing
            ``model_dir``.
        fold: Fold to evaluate.
    """
    buzzes = get_buzzes(model, fold)

    # eval
    guesser_root = AbstractGuesser.output_path(
        "qanta.guesser.rnn", "RnnGuesser", 0, ""
    )
    guesses_path = AbstractGuesser.guess_path(guesser_root, fold, "char")
    with open(guesses_path, "rb") as guesses_file:
        guesses = pickle.load(guesses_file).groupby("qanta_id")

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]

    protobowl_by_qid = load_protobowl().groupby("qid")

    play = partial(simulate_game, guesses, buzzes, protobowl_by_qid)

    columns = {"Possibility": [], "Outcome": []}
    for question in tqdm(fold_questions):
        pos, out = play(question)
        columns["Possibility"].extend(pos)
        columns["Outcome"].extend(out)

    result_df = pd.DataFrame(columns)

    result_path = os.path.join(model.model_dir, "{}_protobowl.pkl".format(fold))
    with open(result_path, "wb") as result_file:
        pickle.dump(result_df, result_file)
Beispiel #9
0
def test():
    """Collect reader predictions and guesser guesses for bonus-pair examples.

    For every example of ``BonusPairsDataset`` whose ``start`` is not -1,
    the top ``Predictor`` answer and the highest-scoring guess of the first
    enabled guesser are recorded next to the gold answer. The resulting
    list of ``(prediction, guess, answer)`` tuples is pickled to
    ``results.pkl``.
    """
    gspec = AbstractGuesser.list_enabled_guessers()[0]
    guesser_dir = AbstractGuesser.output_path(
        gspec.guesser_module, gspec.guesser_class, '')
    guesser = ElasticSearchWikidataGuesser.load(guesser_dir)

    torch.cuda.set_device(0)
    predictor = Predictor()
    predictor.cuda()

    usable_examples = [
        ex for ex in BonusPairsDataset().examples if ex['start'] != -1
    ]

    guesses = []
    for example in tqdm(usable_examples):
        document = example['content']
        question = example['query']
        answer = example['answer']
        prediction = predictor.predict(document, question, top_n=1)[0][0]

        scored = guesser.guess_single(example['query'])
        # Last element of the ascending sort is the highest-scoring guess.
        best_page, _ = sorted(scored.items(), key=lambda kv: kv[1])[-1]
        guess = best_page.replace('_', ' ')

        guesses.append((prediction, guess, answer))

    with open('results.pkl', 'wb') as results_file:
        pickle.dump(guesses, results_file)
Beispiel #10
0
    def output(self):
        """Declare the targets produced by training this guesser.

        Returns:
            A list of ``LocalTarget`` objects: the guesser's output
            directory, its ``guesser_params.pickle``, and then one target
            per path reported by the guesser class's ``files`` method.
        """
        guesser_class = get_class(self.guesser_module, self.guesser_class)

        def resolve(name):
            """Resolve ``name`` via ``output_path`` for this guesser."""
            return AbstractGuesser.output_path(
                self.guesser_module, self.guesser_class, name)

        guesser_targets = [
            LocalTarget(path) for path in guesser_class.files(resolve(''))
        ]

        fixed_targets = [
            LocalTarget(resolve('')),
            LocalTarget(resolve('guesser_params.pickle')),
        ]
        return fixed_targets + guesser_targets
Beispiel #11
0
 def run(self):
     """Train the guesser, save it, and pickle its parameters.

     The guesser's parameters are extended with a ``training_time`` entry
     (wall-clock seconds spent in ``train``, including fetching the
     training data) and written to ``guesser_params.pickle``.
     """
     guesser_cls = get_class(self.guesser_module, self.guesser_class)
     guesser_instance = guesser_cls()  # type: AbstractGuesser
     qb_dataset = guesser_instance.qb_dataset()

     started = time.time()
     guesser_instance.train(qb_dataset.training_data())
     training_time = time.time() - started

     model_dir = AbstractGuesser.output_path(
         self.guesser_module, self.guesser_class, '')
     guesser_instance.save(model_dir)

     params = guesser_instance.parameters()
     params['training_time'] = training_time
     params_path = AbstractGuesser.output_path(
         self.guesser_module, self.guesser_class, 'guesser_params.pickle')
     with open(params_path, 'wb') as params_file:
         pickle.dump(params, params_file)
Beispiel #12
0
 def output(self):
     """Declare the per-fold report pickle as this task's target.

     Returns:
         A ``LocalTarget`` for ``guesser_report_<fold>.pickle`` resolved
         through ``AbstractGuesser.output_path`` for this guesser/config.
     """
     report_path = AbstractGuesser.output_path(
         self.guesser_module,
         self.guesser_class,
         self.config_num,
         f"guesser_report_{self.fold}.pickle",
     )
     return LocalTarget(report_path)
Beispiel #13
0
    def run(self):
        """Generate and save char, full-question and first-sentence guesses.

        The trained guesser is loaded from its output directory. Three
        guess passes are then run for ``self.fold`` and saved under the
        labels "char", "full" and "first". The character stride of the
        first pass comes from ``conf`` and depends on the fold.
        """
        guesser_class = get_class(self.guesser_module, self.guesser_class)
        guesser_directory = AbstractGuesser.output_path(
            self.guesser_module, self.guesser_class, self.config_num, "")
        guesser_instance = guesser_class.load(
            guesser_directory)  # type: AbstractGuesser

        # Pick the config key for the character stride based on the fold.
        if self.fold in {c.GUESSER_TRAIN_FOLD, c.GUESSER_DEV_FOLD}:
            skip_key = "guesser_char_skip"
        elif self.fold == c.EXPO_FOLD:
            skip_key = "expo_char_skip"
        else:
            skip_key = "buzzer_char_skip"
        char_skip = conf[skip_key]

        # (save label, tail of the announcement, generate_guesses options)
        passes = [
            ("char", f"with char_skip={char_skip}...",
             {"char_skip": char_skip}),
            ("full", "with full question...", {"full_question": True}),
            ("first", "with first sentence", {"first_sentence": True}),
        ]
        for label, announcement, options in passes:
            log.info(
                f"Generating and saving guesses for {self.fold} fold {announcement}"
            )
            started = time.time()
            guess_df = guesser_instance.generate_guesses(
                self.n_guesses, [self.fold], **options)
            elapsed = time.time() - started
            log.info(
                f"Guessing on {self.fold} fold took {elapsed}s, saving guesses...")
            guesser_class.save_guesses(
                guess_df, guesser_directory, [self.fold], label)
            log.info("Done saving guesses")
Beispiel #14
0
    def run(self):
        """Generate and save char, full-question and first-sentence guesses.

        Loads the trained guesser, then runs three guess passes for
        ``self.fold`` and saves them under the labels 'char', 'full' and
        'first'. The character stride of the first pass is read from
        ``conf``: the guesser setting for the guesser train/dev folds, the
        buzzer setting otherwise.
        """
        guesser_class = get_class(self.guesser_module, self.guesser_class)
        guesser_directory = AbstractGuesser.output_path(
            self.guesser_module, self.guesser_class, self.config_num, '')
        guesser_instance = guesser_class.load(
            guesser_directory)  # type: AbstractGuesser

        is_guesser_fold = self.fold in {c.GUESSER_TRAIN_FOLD,
                                        c.GUESSER_DEV_FOLD}
        if is_guesser_fold:
            char_skip = conf['guesser_char_skip']
        else:
            char_skip = conf['buzzer_char_skip']

        def generate_and_save(announcement, label, **options):
            """Run one guess pass, log its duration and save it as ``label``."""
            log.info(
                f'Generating and saving guesses for {self.fold} fold {announcement}'
            )
            started = time.time()
            guess_df = guesser_instance.generate_guesses(
                self.n_guesses, [self.fold], **options)
            elapsed = time.time() - started
            log.info(
                f'Guessing on {self.fold} fold took {elapsed}s, saving guesses...')
            guesser_class.save_guesses(
                guess_df, guesser_directory, [self.fold], label)
            log.info('Done saving guesses')

        generate_and_save(
            f'with char_skip={char_skip}...', 'char', char_skip=char_skip)
        generate_and_save('with full question...', 'full', full_question=True)
        generate_and_save('with first sentence', 'first', first_sentence=True)
Beispiel #15
0
 def output(self):
     """Declare the three guess pickles produced for this fold.

     Returns:
         A list of ``LocalTarget`` objects for the char, full and first
         guess files of ``self.fold``, in that order.
     """
     targets = []
     for kind in ('char', 'full', 'first'):
         guesses_path = AbstractGuesser.output_path(
             self.guesser_module,
             self.guesser_class,
             self.config_num,
             f'guesses_{kind}_{self.fold}.pickle')
         targets.append(LocalTarget(guesses_path))
     return targets
Beispiel #16
0
 def run(self):
     """Train the configured guesser, save it, and pickle its parameters.

     The parameters get an extra ``training_time`` entry (wall-clock
     seconds spent in ``train``, including fetching the training data)
     before being written to ``guesser_params.pickle``.
     """
     guesser_cls = get_class(self.guesser_module, self.guesser_class)
     guesser_instance = guesser_cls(self.config_num)  # type: AbstractGuesser
     qb_dataset = guesser_instance.qb_dataset()

     started = time.time()
     guesser_instance.train(qb_dataset.training_data())
     training_time = time.time() - started

     def resolve(name):
         """Resolve ``name`` via ``output_path`` for this guesser/config."""
         return AbstractGuesser.output_path(
             self.guesser_module, self.guesser_class, self.config_num, name)

     guesser_instance.save(resolve(""))

     params = guesser_instance.parameters()
     params["training_time"] = training_time
     with open(resolve("guesser_params.pickle"), "wb") as params_file:
         pickle.dump(params, params_file)
Beispiel #17
0
def test_buzzer():
    """Print the buzzer's decision for growing prefixes of one question.

    The fifth question of the database is split into words. For every
    prefix, from the empty prefix up to all words but the last, the wrapped
    guesser produces guesses and the buzzer's output for them is printed.
    """
    questions = QuestionDatabase().all_questions()
    buzzer = RNNBuzzer(word_skip=conf['buzzer_word_skip'])

    # Machine agent: first enabled guesser, wrapped for the buzzer.
    gspec = AbstractGuesser.list_enabled_guessers()[0]
    guesser_dir = AbstractGuesser.output_path(
        gspec.guesser_module, gspec.guesser_class, '')
    guesser = ESGuesserWrapper(ElasticSearchWikidataGuesser.load(guesser_dir))

    fifth_key = list(questions.keys())[4]
    words = questions[fifth_key].flatten_text().split()
    for n_words, _ in enumerate(words):
        clue = ' '.join(words[:n_words])
        print(buzzer.buzz(guesser.guess(clue)))
Beispiel #18
0
def ew(model, fold=BUZZER_DEV_FOLD):
    """Compute the buzzer's expected wins on ``fold`` and print them as JSON.

    For each question the buzzes are paired with the RNN guesser's top
    guess at the same character index; the resulting answer sequence is
    scored with ``CurveScore.score`` and ``CurveScore.score_optimal``.

    Args:
        model: Buzzer model passed to ``get_buzzes``.
        fold: Fold to evaluate.

    Returns:
        Dict with ``expected_wins`` and ``expected_wins_optimal`` (sums over
        all questions) and ``n_examples`` (number of questions scored).
    """
    buzzes = get_buzzes(model, fold)

    guesser_root = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(guesser_root, fold, 'char')
    with open(guesses_path, 'rb') as guesses_file:
        guesses = pickle.load(guesses_file).groupby('qanta_id')

    answers = {}
    for qid, bs in buzzes.items():
        by_index = guesses.get_group(qid).groupby('char_index')
        answers[qid] = [
            {
                'char_index': char_index,
                # First row of the group supplies the guess for this index.
                'guess': by_index.get_group(char_index).head(1)['guess'].values[0],
                'buzz': scores[0] < scores[1],
            }
            for char_index, scores in zip(*bs)
        ]

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    curve_score = CurveScore()
    wins = []
    wins_optimal = []
    for qid, answer in answers.items():
        question = questions[qid]
        q = {'text': question.text, 'page': question.page}
        wins.append(curve_score.score(answer, q))
        wins_optimal.append(curve_score.score_optimal(answer, q))

    eval_out = {
        'expected_wins': sum(wins),
        'n_examples': len(wins),
        'expected_wins_optimal': sum(wins_optimal),
    }
    print(json.dumps(eval_out))
    return eval_out
Beispiel #19
0
def ew(model, fold=BUZZER_DEV_FOLD):
    """Compute the buzzer's expected wins on ``fold`` and print them as JSON.

    The model's buzzes are joined with the RNN guesser's top guess at each
    character index, and every question's answer sequence is scored with
    ``CurveScore.score`` and ``CurveScore.score_optimal``.

    Args:
        model: Buzzer model passed to ``get_buzzes``.
        fold: Fold to evaluate.

    Returns:
        Dict with the summed ``expected_wins``, the summed
        ``expected_wins_optimal`` and ``n_examples``.
    """
    buzzes = get_buzzes(model, fold)

    guesser_root = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(guesser_root, fold, 'char')
    with open(guesses_path, 'rb') as guesses_file:
        guesses = pickle.load(guesses_file)
    guesses = guesses.groupby('qanta_id')

    def answer_at(index_groups, char_index, scores):
        """Build one answer record from the top guess and the buzz scores."""
        top_guess = index_groups.get_group(char_index).head(1)['guess']
        return {
            'char_index': char_index,
            'guess': top_guess.values[0],
            'buzz': scores[0] < scores[1],
        }

    answers = dict()
    for qid, bs in buzzes.items():
        answers[qid] = []
        index_groups = guesses.get_group(qid).groupby('char_index')
        for char_index, scores in zip(*bs):
            answers[qid].append(answer_at(index_groups, char_index, scores))

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions[fold]}

    curve_score = CurveScore()
    scores_actual = []
    scores_optimal = []
    for qid, answer in answers.items():
        question = questions[qid]
        q = {'text': question.text, 'page': question.page}
        scores_actual.append(curve_score.score(answer, q))
        scores_optimal.append(curve_score.score_optimal(answer, q))

    eval_out = {
        'expected_wins': sum(scores_actual),
        'n_examples': len(scores_actual),
        'expected_wins_optimal': sum(scores_optimal),
    }
    print(json.dumps(eval_out))
    return eval_out
Beispiel #20
0
def ew(model, fold=BUZZER_DEV_FOLD):
    """Compute the buzzer's expected wins on ``fold`` and print them as JSON.

    Each buzz is paired with the RNN guesser's top guess at the same
    character index; the per-question answer lists are then scored with
    ``CurveScore.score`` and ``CurveScore.score_optimal``.

    Args:
        model: Buzzer model passed to ``get_buzzes``.
        fold: Fold to evaluate.

    Returns:
        Dict with keys ``expected_wins``, ``n_examples`` and
        ``expected_wins_optimal``.
    """
    buzzes = get_buzzes(model, fold)

    guesser_root = AbstractGuesser.output_path(
        "qanta.guesser.rnn", "RnnGuesser", 0, ""
    )
    guesses_path = AbstractGuesser.guess_path(guesser_root, fold, "char")
    with open(guesses_path, "rb") as guesses_file:
        guesses_by_qid = pickle.load(guesses_file).groupby("qanta_id")

    answers = {}
    for qid, bs in buzzes.items():
        per_question = []
        answers[qid] = per_question
        groups = guesses_by_qid.get_group(qid).groupby("char_index")
        for char_index, scores in zip(*bs):
            first_row = groups.get_group(char_index).head(1)
            per_question.append(
                {
                    "char_index": char_index,
                    "guess": first_row["guess"].values[0],
                    "buzz": scores[0] < scores[1],
                }
            )

    by_fold = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in by_fold[fold]}

    curve_score = CurveScore()
    expected = []
    expected_optimal = []
    for qid, answer in answers.items():
        question = questions[qid]
        q = {"text": question.text, "page": question.page}
        expected.append(curve_score.score(answer, q))
        expected_optimal.append(curve_score.score_optimal(answer, q))

    eval_out = {
        "expected_wins": sum(expected),
        "n_examples": len(expected),
        "expected_wins_optimal": sum(expected_optimal),
    }
    print(json.dumps(eval_out))
    return eval_out
Beispiel #21
0
    def run(self):
        """Generate guesses for ``self.fold`` and save them.

        The word stride is read from ``conf``: the guesser setting for the
        guesser train/dev folds, the buzzer setting for every other fold.
        """
        guesser_class = get_class(self.guesser_module, self.guesser_class)
        guesser_directory = AbstractGuesser.output_path(
            self.guesser_module, self.guesser_class, '')
        guesser_instance = guesser_class.load(
            guesser_directory)  # type: AbstractGuesser

        is_guesser_fold = self.fold in {c.GUESSER_TRAIN_FOLD,
                                        c.GUESSER_DEV_FOLD}
        if is_guesser_fold:
            skip_key = 'guesser_word_skip'
        else:
            skip_key = 'buzzer_word_skip'
        word_skip = conf[skip_key]

        announcement = (
            'Generating and saving guesses for {} fold with word_skip={}...'.
            format(self.fold, word_skip))
        log.info(announcement)

        started = time.time()
        guess_df = guesser_instance.generate_guesses(
            self.n_guesses, [self.fold], word_skip=word_skip)
        elapsed = time.time() - started

        log.info('Guessing on {} fold took {}s, saving guesses...'.format(
            self.fold, elapsed))
        guesser_class.save_guesses(guess_df, guesser_directory, [self.fold])
        log.info('Done saving guesses')
Beispiel #22
0
import pickle
from qanta.guesser.abstract import AbstractGuesser
from qanta.guesser.dan import DanGuesser
from qanta.util.constants import BUZZER_TRAIN_FOLD, BUZZER_DEV_FOLD

# Load the DanGuesser stored under config number 0.
guesser_directory = AbstractGuesser.output_path(
    "qanta.guesser.dan", "DanGuesser", 0, ""
)
guesser = DanGuesser.load(guesser_directory)  # type: AbstractGuesser
# Shrink the batch size to an eighth. Floor division keeps it an integer
# (`/=` would produce a float under Python 3), and the clamp stops a small
# batch size from collapsing to 0.
guesser.batch_size = max(1, guesser.batch_size // 8)

# Generate one guess per position for each buzzer fold and pickle the
# resulting frame to that fold's guess path.
word_skip = 2
folds = [BUZZER_TRAIN_FOLD, BUZZER_DEV_FOLD]
for fold in folds:
    df = guesser.generate_guesses(1, [fold], word_skip=word_skip)
    output_path = AbstractGuesser.guess_path(guesser_directory, fold)
    with open(output_path, "wb") as f:
        pickle.dump(df, f)
Beispiel #23
0
 def output(self):
     """Declare the fold's guess pickle as this task's target.

     Returns:
         A ``LocalTarget`` for ``guesses_<fold>.pickle`` resolved through
         ``AbstractGuesser.output_path`` for this guesser.
     """
     guesses_path = AbstractGuesser.output_path(
         self.guesser_module,
         self.guesser_class,
         'guesses_{}.pickle'.format(self.fold))
     return LocalTarget(guesses_path)
Beispiel #24
0
Datei: plot.py Projekt: NPSDC/qb
def stack(model_dir, model_name, fold=BUZZER_DEV_FOLD):
    """Tabulate buzzer/oracle agreement by relative question position.

    Each buzzing position of each question yields one sample. The oracle
    "buzzes" when the top guess equals the question's page; the model
    buzzes when its second score exceeds its first. Samples are labelled
    "Only optimal", "Only buzzer", "Both" or "Neither" and bucketed by
    ``char_index / len(question.text)`` rounded to one decimal.

    Args:
        model_dir: Directory holding ``<fold>_buzzes.pkl``; the result is
            pickled there as ``<fold>_stack.pkl``.
        model_name: Value stored in the ``Model`` column of the result.
        fold: Fold whose guesses, buzzes and questions are used.

    Returns:
        DataFrame with columns ``Position``, ``Buzzing``, ``Frequency`` and
        ``Model``; ``Frequency`` is the label count divided by the number of
        samples in the same position bucket.
    """
    guesser_root = AbstractGuesser.output_path(
        "qanta.guesser.rnn", "RnnGuesser", 0, ""
    )
    guesses_path = AbstractGuesser.guess_path(guesser_root, fold, "char")
    with open(guesses_path, "rb") as guesses_file:
        guesses_by_qid = pickle.load(guesses_file).groupby("qanta_id")

    buzzes_path = os.path.join(model_dir, "{}_buzzes.pkl".format(fold))
    with open(buzzes_path, "rb") as buzzes_file:
        buzzes = pickle.load(buzzes_file)

    by_fold = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in by_fold[fold]}

    # Label for every (oracle buzzes, model buzzes) combination.
    label_for = {
        (True, False): "Only optimal",
        (False, True): "Only buzzer",
        (True, True): "Both",
        (False, False): "Neither",
    }

    stack_freq = {"Position": [], "Buzzing": []}
    # Number of samples per position bucket; used to normalise the counts.
    count = defaultdict(int)
    for qid, (char_indices, scores) in buzzes.items():
        index_groups = guesses_by_qid.get_group(qid).groupby("char_index")
        # First row of each char_index group supplies the guess for that index.
        guess_at = index_groups.aggregate(lambda x: x.head(1)).to_dict()["guess"]
        question = questions[qid]
        q_len = len(question.text)
        for i, char_index in enumerate(char_indices):
            buzz_oracle = guess_at[char_index] == question.page
            buzz_buzzer = scores[i][1] > scores[i][0]

            rel_position = np.round(char_index / q_len, decimals=1)
            count[rel_position] += 1

            stack_freq["Position"].append(rel_position)
            stack_freq["Buzzing"].append(
                label_for[(bool(buzz_oracle), bool(buzz_buzzer))]
            )

    grouped = pd.DataFrame(stack_freq).groupby(["Position", "Buzzing"])
    df = grouped.size().reset_index().rename(columns={0: "Frequency"})
    df["Frequency"] = df.apply(
        lambda row: row["Frequency"] / count[row["Position"]], axis=1
    )
    df["Model"] = pd.Series([model_name] * len(df))

    stack_path = os.path.join(model_dir, "{}_stack.pkl".format(fold))
    with open(stack_path, "wb") as stack_file:
        pickle.dump(df, stack_file)

    return df
Beispiel #25
0
from tqdm import tqdm
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl import DocType, Text, Keyword, Search, Index
from qanta.util.constants import GUESSER_DEV_FOLD
from qanta.guesser.abstract import AbstractGuesser
from qanta.datasets.quiz_bowl import QuizBowlDataset
from qanta.guesser.experimental.elasticsearch_instance_of import ElasticSearchWikidataGuesser
from qanta.guesser.experimental.elasticsearch_instance_of import ElasticSearchIndex

INDEX_NAME = 'qb_ir_instance_of'

# Module-level setup: load the first enabled guesser from its output path and
# create the Elasticsearch index wrapper. `guesser` and `es_index` are read as
# globals by recursive_guess.
# NOTE(review): the guesser is always loaded as ElasticSearchWikidataGuesser,
# regardless of which class `gspec` names -- confirm that is intended.
gspec = AbstractGuesser.list_enabled_guessers()[0]
guesser_dir = AbstractGuesser.output_path(gspec.guesser_module,
        gspec.guesser_class, '')
guesser = ElasticSearchWikidataGuesser.load(guesser_dir)
es_index = ElasticSearchIndex()

def recursive_guess(question, k=0):
    """Print first-round guesses, then re-query with the top k guesses appended.

    Args:
        question: Question text; extended in place (locally) with the words
            of each first-round guess used in the second round.
        k: Number of first-round guesses to feed into the second round. With
            the default of 0 the second-round loop does not run.

    NOTE(review): the visible body never uses ``new_guesses`` or the
    second-round ``guesses`` and has no return statement -- the function
    appears to be cut off here; confirm against the full file.
    """
    # Predicted class and its probability for this question.
    p_class, p_prob = guesser.test_instance_of([question])[0]
    # search_not is defined outside this view.
    first_guesses = search_not(question, p_class)
    print('First round')
    for x in first_guesses:
        print(x)
    print()

    print('Second round')
    new_guesses = []
    for i in range(k):
        # Append the guess (underscores turned into spaces) to the query text;
        # the additions accumulate across iterations.
        guess = first_guesses[i][0]
        question += ' ' + ' '.join(guess.split('_'))
        guesses = es_index.search(question, p_class, p_prob, 0.6)
Beispiel #26
0
def export(output_file: str, fold: str = "buzztest"):
    """Export, per question and buzzer, where the model and the oracle buzz.

    For each of the RNNBuzzer, ThresholdBuzzer and MLPBuzzer outputs under
    ``output/buzzer/<name>``, and for each question, this records the
    earliest character index at which the top guess equals the question's
    page (the oracle position) and the earliest index at which the buzzer's
    second score exceeds its first (the model position), together with the
    corresponding guesses. The nested mapping ``qid -> buzzer name ->
    record`` is written with ``write_json``.

    Args:
        output_file: Destination passed to ``write_json``.
        fold: Fold whose guesses, questions and buzzes are exported. The
            argument is now honoured; previously it was unconditionally
            overwritten with ``"buzztest"``, which remains the default.

    In each record a position of -1 (and a fraction of -1) means "never";
    ``impossible`` is true when the guesser never produced the correct page.
    """
    guesses_dir = AbstractGuesser.output_path("qanta.guesser.rnn",
                                              "RnnGuesser", 0, "")
    guesses_dir = AbstractGuesser.guess_path(guesses_dir, fold, "char")
    # NOTE: unpickling is only safe for files this project wrote.
    with open(guesses_dir, "rb") as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby("qanta_id")

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions[fold]}
    buzzers = {}
    for name in ["RNNBuzzer", "ThresholdBuzzer", "MLPBuzzer"]:
        model_dir = f"output/buzzer/{name}"
        buzzes_dir = os.path.join(model_dir, "{}_buzzes.pkl".format(fold))
        with open(buzzes_dir, "rb") as f:
            buzzers[name] = pickle.load(f)

    qid_to_buzzes = defaultdict(dict)
    for name, buzzes in track(buzzers.items()):
        for qid, (char_indices, scores) in buzzes.items():
            # First row of each char_index group supplies the guess there.
            gs = (guesses.get_group(qid).groupby("char_index").aggregate(
                lambda x: x.head(1)).to_dict()["guess"])
            question = questions[qid]
            q_len = len(question.text)
            buzz_oracle_position = -1
            buzz_model_position = -1
            oracle_guess = None
            buzz_guess = None
            for i, char_index in enumerate(char_indices):
                buzz_oracle = gs[char_index] == question.page
                if buzz_oracle:
                    # Keep the smallest index at which the guess is correct.
                    if buzz_oracle_position == -1 or char_index <= buzz_oracle_position:
                        oracle_guess = question.page
                        buzz_oracle_position = char_index

                if scores[i][1] > scores[i][0]:
                    # Keep the smallest index at which the model buzzes.
                    if buzz_model_position == -1 or char_index < buzz_model_position:
                        buzz_guess = gs[char_index]
                        buzz_model_position = char_index

            if buzz_oracle_position != -1:
                oracle_fraction = buzz_oracle_position / q_len
            else:
                oracle_fraction = -1
            if buzz_model_position != -1:
                position_fraction = buzz_model_position / q_len
            else:
                position_fraction = -1

            qid_to_buzzes[qid][name] = {
                "oracle": buzz_oracle_position,
                "oracle_fraction": oracle_fraction,
                "position": buzz_model_position,
                "position_fraction": position_fraction,
                "q_len": q_len,
                "oracle_guess": oracle_guess,
                "buzz_guess": buzz_guess,
                "answer": question.page,
                "impossible": oracle_guess is None,
            }
    write_json(output_file, qid_to_buzzes)
Beispiel #27
0
import random
import pickle

from qanta.config import conf
from qanta.util.io import safe_path
from qanta.util.multiprocess import _multiprocess
from qanta.guesser.abstract import AbstractGuesser
from qanta.datasets.quiz_bowl import QuizBowlDataset, Question
from qanta.guesser.experimental.elasticsearch_instance_of import ElasticSearchWikidataGuesser
'''Randomly shuffle the word order and see if it changes the guesses.
'''

# Use the first entry returned by list_enabled_guessers() to locate the
# directory that holds the saved guesser.
gspec = AbstractGuesser.list_enabled_guessers()[0]
guesser_dir = AbstractGuesser.output_path(gspec.guesser_module,
                                          gspec.guesser_class, '')
# Loaded once at import time; main() reads this module-level global.
# NOTE(review): the class is hard-coded to ElasticSearchWikidataGuesser while
# the directory is derived from gspec -- presumably the first enabled guesser
# is expected to be this class; confirm against the configuration.
guesser = ElasticSearchWikidataGuesser.load(guesser_dir)


def main():
    fold = 'guessdev'
    db = QuizBowlDataset(1, guesser_train=True, buzzer_train=True)
    questions = db.questions_in_folds([fold])
    first_n = lambda x: len(x)

    print(guesser.guess_single(' '.join(questions[0].text.values())))
    '''
    s = [0, 0, 0, 0, 0]
    for q in questions:
        sents = list(q.text.values())
        text_before = ' '.join(sents[:first_n(sents)])
        words = text.split()
Beispiel #28
0
 def run(self):
     """Build the configured guesser and ask it to create its report.

     The class is resolved from ``self.guesser_module`` and
     ``self.guesser_class``, constructed with ``self.config_num``, and its
     ``create_report`` is called with the guesser output directory and
     ``self.fold``.
     """
     cls = get_class(self.guesser_module, self.guesser_class)
     report_dir = AbstractGuesser.output_path(
         self.guesser_module, self.guesser_class, self.config_num, '')
     cls(self.config_num).create_report(report_dir, self.fold)
Beispiel #29
0
def stack(model_dir, model_name, fold=BUZZER_DEV_FOLD):
    """Tabulate how often the buzzer and the oracle agree per position.

    For every (question, char_index) entry in the buzzer output, the first
    guess recorded at that character index is compared with
    ``question.page`` (the "oracle" decision) and the two buzzer scores are
    compared with each other (the buzzer decision: ``scores[i][1] >
    scores[i][0]``). Each entry is labelled ``'Only optimal'``,
    ``'Only buzzer'``, ``'Both'`` or ``'Neither'`` and bucketed by its
    relative position ``char_index / len(question.text)`` rounded to one
    decimal.

    Args:
        model_dir: Directory containing ``{fold}_buzzes.pkl``; the result is
            written there as ``{fold}_stack.pkl``.
        model_name: Value stored in the ``Model`` column of the result.
        fold: Fold name used to locate guesses, buzzes and questions.

    Returns:
        A DataFrame with columns ``Position``, ``Buzzing``, ``Frequency``
        (label count divided by the number of entries at that position) and
        ``Model``. The same frame is pickled to disk.
    """
    # Character-level guesses of RnnGuesser config 0, grouped per question.
    guess_root = AbstractGuesser.output_path('qanta.guesser.rnn',
                                             'RnnGuesser', 0, '')
    guess_file = AbstractGuesser.guess_path(guess_root, fold, 'char')
    with open(guess_file, 'rb') as fh:
        guess_frame = pickle.load(fh)
    guesses_by_qid = guess_frame.groupby('qanta_id')

    buzz_file = os.path.join(model_dir, '{}_buzzes.pkl'.format(fold))
    with open(buzz_file, 'rb') as fh:
        buzzes = pickle.load(fh)

    questions_by_fold = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    id_to_question = {q.qanta_id: q for q in questions_by_fold[fold]}

    positions = []
    labels = []
    # Number of entries observed at each rounded relative position; used
    # below to turn label counts into frequencies.
    totals = defaultdict(int)
    for qid, (char_indices, scores) in buzzes.items():
        # Map char_index -> first guess row recorded at that index.
        first_guess = (guesses_by_qid.get_group(qid)
                       .groupby('char_index')
                       .aggregate(lambda x: x.head(1))
                       .to_dict()['guess'])
        question = id_to_question[qid]
        text_len = len(question.text)
        for i, char_index in enumerate(char_indices):
            oracle_correct = first_guess[char_index] == question.page
            pair = scores[i]
            buzzer_fires = pair[1] > pair[0]

            # Exactly one of the four labels applies to every entry.
            if oracle_correct:
                label = 'Both' if buzzer_fires else 'Only optimal'
            else:
                label = 'Only buzzer' if buzzer_fires else 'Neither'

            bucket = np.round(char_index / text_len, decimals=1)
            totals[bucket] += 1
            positions.append(bucket)
            labels.append(label)

    freq = (pd.DataFrame({'Position': positions, 'Buzzing': labels})
            .groupby(['Position', 'Buzzing'])
            .size()
            .reset_index()
            .rename(columns={0: 'Frequency'}))
    # Normalise each (position, label) count by the entries at the position.
    freq['Frequency'] = freq.apply(
        lambda row: row['Frequency'] / totals[row['Position']], axis=1)
    freq['Model'] = pd.Series([model_name] * len(freq))

    out_file = os.path.join(model_dir, '{}_stack.pkl'.format(fold))
    with open(out_file, 'wb') as fh:
        pickle.dump(freq, fh)

    return freq
Beispiel #30
-1
def read_data(
        fold,
        output_type='char',
        guesser_module='qanta.guesser.rnn',
        guesser_class='RnnGuesser',
        guesser_config_num=0,
        vector_converter=vector_converter_0):
    """Load the buzzer dataset for ``fold``, building and caching it if needed.

    If the cache file ``dataset_dir.format(fold)`` exists, its unpickled
    content is returned directly. Otherwise the guesser's pickled guesses are
    loaded, grouped by ``qanta_id``, and every group is handed to
    ``process_question`` in a pool of 8 worker processes; the resulting list
    is pickled to the cache file and returned.

    Args:
        fold: Fold name used for the cache path, the guess file and the
            question lookup.
        output_type: Passed to ``AbstractGuesser.guess_path``.
        guesser_module: Module path of the guesser whose guesses are read.
        guesser_class: Class name of that guesser.
        guesser_config_num: Configuration number of that guesser.
        vector_converter: Forwarded as the second positional argument of
            ``process_question``.

    Returns:
        The list produced by mapping ``process_question`` over the
        per-question guess groups (or the previously cached object).
    """
    cache_path = dataset_dir.format(fold)
    if os.path.isfile(cache_path):
        # NOTE: pickle.load executes arbitrary code from the file; only use
        # cache and guess files produced by this project.
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    g_dir = AbstractGuesser.output_path(
        guesser_module, guesser_class, guesser_config_num, '')
    g_path = AbstractGuesser.guess_path(g_dir, fold, output_type)
    with open(g_path, 'rb') as f:
        df = pickle.load(f)
    # Iterating the groupby yields (qanta_id, group_frame) pairs, which is
    # what each worker call receives as its last argument.
    df_groups = df.groupby('qanta_id')

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions[fold]}

    worker = partial(process_question, questions, vector_converter)
    # The context manager shuts the worker processes down once map() has
    # returned; previously the pool was never closed and its processes leaked.
    with Pool(8) as pool:
        dataset = pool.map(worker, df_groups)

    with open(cache_path, 'wb') as f:
        pickle.dump(dataset, f)

    return dataset
Beispiel #31
-2
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Simulate games for every question in ``fold`` and pickle the results.

    Buzzes are obtained via ``get_buzzes(model, fold)``, character-level
    guesses are loaded from the RnnGuesser (config 0) output, and the frame
    returned by ``load_protobowl()`` is grouped by ``qid``. For each question
    ``simulate_game(guesses, buzzes, df, question)`` is called and its two
    results are accumulated into the ``Possibility`` and ``Outcome`` columns
    of a DataFrame, which is written to
    ``<model.model_dir>/{fold}_protobowl.pkl``.

    Args:
        model: Object passed to ``get_buzzes``; its ``model_dir`` attribute
            is the output directory.
        fold: Fold name used to locate guesses and select questions.
    """
    buzzes = get_buzzes(model, fold)

    # The bare string below is a no-op expression statement kept from the
    # original code (it is not the function's docstring).
    '''eval'''
    guesses_dir = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_dir = AbstractGuesser.guess_path(guesses_dir, fold, 'char')
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # guess files produced by this project.
    with open(guesses_dir, 'rb') as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby('qanta_id')

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = questions[fold]

    # presumably the human play logs, one group per question id -- confirm
    # against load_protobowl().
    df = load_protobowl()
    df = df.groupby('qid')

    # Bind the shared inputs so the worker only needs the question.
    worker = partial(simulate_game, guesses, buzzes, df)

    possibility = []
    outcome = []
    for question in tqdm(questions):
        # Each call returns two sequences that are appended element-wise to
        # the running lists (assumed to be of equal length -- the DataFrame
        # constructor below requires it).
        pos, out = worker(question)
        possibility += pos
        outcome += out

    result_df = pd.DataFrame({
        'Possibility': possibility,
        'Outcome': outcome,
    })

    result_dir = os.path.join(
        model.model_dir, '{}_protobowl.pkl'.format(fold))
    with open(result_dir, 'wb') as f:
        pickle.dump(result_df, f)