Ejemplo n.º 1
1
 def output(self):
     """Return one ``LocalTarget`` per fold in ``c.BUZZER_INPUT_FOLDS``.

     Each target wraps the path that ``AbstractGuesser.guess_path`` builds
     from ``bc.GUESSES_DIR`` and the fold.
     """
     targets = []
     for fold in c.BUZZER_INPUT_FOLDS:
         guess_file = AbstractGuesser.guess_path(bc.GUESSES_DIR, fold)
         targets.append(LocalTarget(guess_file))
     return targets
Ejemplo n.º 2
1
Archivo: plot.py Proyecto: Pinafore/qb
def stack(model_dir, model_name, fold=BUZZER_DEV_FOLD):
    """Tabulate how often the buzzer and the top guess agree, by position.

    For every (question, char_index) pair found in the pickled buzzes, two
    flags are computed: whether the first guess at that index equals the
    question's page, and whether ``scores[i][1] > scores[i][0]``. Each pair is
    labelled 'Only optimal', 'Only buzzer', 'Both' or 'Neither' and bucketed
    by ``char_index / len(question.text)`` rounded to one decimal.

    Args:
        model_dir: Directory holding ``{fold}_buzzes.pkl``; the result is
            written there as ``{fold}_stack.pkl``.
        model_name: Value stored in the ``Model`` column of the result.
        fold: Fold whose guesses, buzzes and questions are read.

    Returns:
        A DataFrame with columns ``Position``, ``Buzzing``, ``Frequency``
        (label count divided by the number of pairs at that position) and
        ``Model``.
    """
    rnn_dir = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(rnn_dir, fold, 'char')
    with open(guesses_path, 'rb') as f:
        guesses = pickle.load(f)
    guesses_by_question = guesses.groupby('qanta_id')

    buzzes_path = os.path.join(model_dir, '{}_buzzes.pkl'.format(fold))
    with open(buzzes_path, 'rb') as f:
        buzzes = pickle.load(f)

    questions_by_fold = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions_by_fold[fold]}

    positions = []
    labels = []
    # Number of (question, char_index) pairs seen at each relative position.
    count = defaultdict(int)
    for qid, (char_indices, scores) in buzzes.items():
        per_index = guesses_by_question.get_group(qid).groupby('char_index')
        first_guess = per_index.aggregate(
            lambda x: x.head(1)).to_dict()['guess']
        question = questions[qid]
        q_len = len(question.text)
        for i, char_index in enumerate(char_indices):
            oracle_buzzes = first_guess[char_index] == question.page
            buzzer_buzzes = scores[i][1] > scores[i][0]

            rel_position = np.round(char_index / q_len, decimals=1)
            count[rel_position] += 1

            # Exactly one of the four combinations applies to each pair.
            if oracle_buzzes and buzzer_buzzes:
                label = 'Both'
            elif oracle_buzzes:
                label = 'Only optimal'
            elif buzzer_buzzes:
                label = 'Only buzzer'
            else:
                label = 'Neither'
            positions.append(rel_position)
            labels.append(label)

    df = pd.DataFrame({'Position': positions, 'Buzzing': labels})
    sizes = df.groupby(['Position', 'Buzzing']).size()
    df = sizes.reset_index().rename(columns={0: 'Frequency'})
    # Normalise each label count by the total pairs at that position.
    df['Frequency'] = df.apply(
        lambda row: row['Frequency'] / count[row['Position']],
        axis=1)
    df['Model'] = pd.Series([model_name] * len(df))

    stack_path = os.path.join(model_dir, '{}_stack.pkl'.format(fold))
    with open(stack_path, 'wb') as f:
        pickle.dump(df, f)

    return df
Ejemplo n.º 3
1
import pickle
from qanta.guesser.abstract import AbstractGuesser
from qanta.guesser.dan import DanGuesser
from qanta.util.constants import BUZZER_TRAIN_FOLD, BUZZER_DEV_FOLD

# Load the DAN guesser saved under config 0 and dump its guesses for the
# buzzer train/dev folds.
guesser_directory = AbstractGuesser.output_path(
    'qanta.guesser.dan', 'DanGuesser', 0, '')
guesser = DanGuesser.load(guesser_directory)  # type: AbstractGuesser
# Shrink the batch size eightfold. Floor division keeps it an integer (true
# division would turn it into a float under Python 3); never go below 1.
guesser.batch_size = max(1, guesser.batch_size // 8)

word_skip = 2
folds = [BUZZER_TRAIN_FOLD, BUZZER_DEV_FOLD]
for fold in folds:
    # One guess per step, generated for a single fold at a time.
    df = guesser.generate_guesses(1, [fold], word_skip=word_skip)
    output_path = AbstractGuesser.guess_path(guesser_directory, fold)
    with open(output_path, 'wb') as f:
        pickle.dump(df, f)
Ejemplo n.º 4
0
Archivo: guesser.py Proyecto: NPSDC/qb
    def output(self):
        """Return targets for the params pickle and one report per fold.

        The expo fold is included only when ``c.QANTA_EXPO_DATASET_PATH``
        exists on disk.
        """
        if os.path.exists(c.QANTA_EXPO_DATASET_PATH):
            folds = [c.GUESSER_DEV_FOLD, c.GUESSER_TEST_FOLD, c.EXPO_FOLD]
        else:
            folds = [c.GUESSER_DEV_FOLD, c.GUESSER_TEST_FOLD]

        # Params file first, then one report file per fold, in fold order.
        report_files = ["guesser_params.pickle"]
        report_files.extend(f"guesser_report_{fold}.pickle" for fold in folds)

        return [
            LocalTarget(
                AbstractGuesser.reporting_path(
                    self.guesser_module,
                    self.guesser_class,
                    self.config_num,
                    file_name,
                ))
            for file_name in report_files
        ]
Ejemplo n.º 5
0
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Run ``simulate_game`` on every question of a fold and pickle results.

    The collected values are stored as a DataFrame with columns
    ``Possibility`` and ``Outcome`` in ``{fold}_protobowl.pkl`` under
    ``model.model_dir``. Nothing is returned.
    """
    buzzes = get_buzzes(model, fold)

    rnn_dir = AbstractGuesser.output_path('qanta.guesser.rnn',
                                          'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(rnn_dir, fold, 'char')
    with open(guesses_path, 'rb') as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby('qanta_id')

    fold_questions = QuizBowlDataset(
        buzzer_train=True).questions_by_fold()[fold]

    records = load_protobowl().groupby('qid')

    possibility = []
    outcome = []
    for question in tqdm(fold_questions):
        pos, out = simulate_game(guesses, buzzes, records, question)
        possibility.extend(pos)
        outcome.extend(out)

    result_df = pd.DataFrame({
        'Possibility': possibility,
        'Outcome': outcome,
    })

    result_path = os.path.join(model.model_dir,
                               '{}_protobowl.pkl'.format(fold))
    with open(result_path, 'wb') as f:
        pickle.dump(result_df, f)
Ejemplo n.º 6
0
def read_data(fold,
              output_type='char',
              guesser_module='qanta.guesser.dan',
              guesser_class='DanGuesser',
              guesser_config_num=0,
              vector_converter=vector_converter_0):
    """Load the processed dataset for ``fold``, building and caching it if needed.

    If the file at ``dataset_dir.format(fold)`` exists it is unpickled and
    returned. Otherwise the guesser's pickled guesses are grouped by
    ``qanta_id``, each group is passed to ``process_question`` in a pool of 8
    worker processes, and the result is pickled to that path.

    Args:
        fold: Fold name used for the cache path, the guess file and the
            question lookup.
        output_type: Guess file variant passed to ``AbstractGuesser.guess_path``.
        guesser_module: Module name of the guesser whose guesses are read.
        guesser_class: Class name of the guesser whose guesses are read.
        guesser_config_num: Config number of the guesser.
        vector_converter: Forwarded to ``process_question`` as its second
            argument.

    Returns:
        The dataset, as a list with one ``process_question`` result per
        question group (or whatever object the cache file contains).
    """
    cache_path = dataset_dir.format(fold)
    if os.path.isfile(cache_path):
        # NOTE(review): pickle is only safe for files this project wrote itself.
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    g_dir = AbstractGuesser.output_path(guesser_module, guesser_class,
                                        guesser_config_num, '')
    g_path = AbstractGuesser.guess_path(g_dir, fold, output_type)
    with open(g_path, 'rb') as f:
        df = pickle.load(f)
    df_groups = df.groupby('qanta_id')

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions[fold]}

    worker = partial(process_question, questions, vector_converter)
    # The context manager releases the worker processes once the map is done.
    with Pool(8) as pool:
        dataset = pool.map(worker, df_groups)

    with open(cache_path, 'wb') as f:
        pickle.dump(dataset, f)

    # Return the dataset itself; pickle.dump() returns None, which made the
    # cache-miss path yield None before.
    return dataset
Ejemplo n.º 7
0
Archivo: util.py Proyecto: Agnon1573/qb
def merge_dfs():
    """Merge the guesses of all enabled guessers into one DataFrame per fold.

    For each fold in ``c.BUZZER_INPUT_FOLDS`` the guesses of every enabled
    guesser are loaded, concatenated, and saved under the ``merged``
    directory inside ``c.GUESSER_TARGET_PREFIX``. Folds whose merged guess
    file already exists are skipped.
    """
    guesser_names = [
        "{0}.{1}".format(x.guesser_module, x.guesser_class)
        for x in AbstractGuesser.list_enabled_guessers()
    ]
    log.info("Merging guesser DataFrames.")
    merged_dir = os.path.join(c.GUESSER_TARGET_PREFIX, 'merged')
    if not os.path.exists(merged_dir):
        os.makedirs(merged_dir)

    columns = ['fold', 'guess', 'guesser', 'qnum', 'score', 'sentence',
               'token']
    for fold in c.BUZZER_INPUT_FOLDS:
        if os.path.exists(AbstractGuesser.guess_path(merged_dir, fold)):
            log.info("Merged {0} exists, skipping.".format(fold))
            continue

        # Start from an empty frame so the expected columns exist even when
        # no guesser contributes rows.
        frames = [pd.DataFrame(columns=columns, dtype='object')]
        for guesser in guesser_names:
            guesser_dir = os.path.join(c.GUESSER_TARGET_PREFIX, guesser)
            frames.append(
                AbstractGuesser.load_guesses(guesser_dir, folds=[fold]))
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # frame on every call; a single concat does the same merge.
        new_guesses = pd.concat(frames)

        for col in ['qnum', 'sentence', 'token', 'score']:
            new_guesses[col] = pd.to_numeric(new_guesses[col],
                                             downcast='integer')
        AbstractGuesser.save_guesses(new_guesses, merged_dir, folds=[fold])
        log.info("Merging: {0} finished.".format(fold))
Ejemplo n.º 8
0
Archivo: main.py Proyecto: ymedhat95/qb
def test():
    """Compare predictor output with the guesser's top guess on bonus pairs.

    Every example whose ``start`` is not -1 is run through both the
    ``Predictor`` and the loaded guesser; the resulting
    ``(prediction, guess, answer)`` tuples are pickled to ``results.pkl``.
    """
    first_spec = AbstractGuesser.list_enabled_guessers()[0]
    guesser_dir = AbstractGuesser.output_path(first_spec.guesser_module,
                                              first_spec.guesser_class, '')
    guesser = ElasticSearchWikidataGuesser.load(guesser_dir)

    torch.cuda.set_device(0)
    predictor = Predictor()
    predictor.cuda()

    dataset = BonusPairsDataset()
    usable_examples = [ex for ex in dataset.examples if ex['start'] != -1]

    results = []
    for example in tqdm(usable_examples):
        document = example['content']
        question = example['query']
        answer = example['answer']
        top_prediction = predictor.predict(document, question, top_n=1)[0][0]

        # Ascending sort by score, so the best-scoring guess is the last item.
        scored = guesser.guess_single(question)
        ranked = sorted(scored.items(), key=lambda item: item[1])
        best_guess = ranked[-1][0].replace('_', ' ')

        results.append((top_prediction, best_guess, answer))

    with open('results.pkl', 'wb') as f:
        pickle.dump(results, f)
Ejemplo n.º 9
0
 def scores(self):
     """Return the guess score map, loading it on the first call only."""
     if self.initialized:
         return self.map

     # First use: load all guesses and build the score map from them.
     all_guesses = AbstractGuesser.load_all_guesses(
         directory_prefix=self.directory_prefix)
     self.map = AbstractGuesser.load_guess_score_map(all_guesses)
     self.initialized = True
     return self.map
Ejemplo n.º 10
0
Archivo: test.py Proyecto: ymedhat95/qb
def main():
    """Run a 10-round game between a human agent and the guesser/buzzer agent.

    The game is played over the questions whose fold is 'dev'.
    """
    buzzer = RNNBuzzer()

    # Questions: keep only the dev fold.
    all_questions = list(QuestionDatabase().all_questions().values())
    dev_questions = [q for q in all_questions if q.fold == 'dev']

    # Machine agent: the first enabled guesser, wrapped, plus the buzzer.
    first_spec = AbstractGuesser.list_enabled_guessers()[0]
    guesser_dir = AbstractGuesser.output_path(first_spec.guesser_module,
                                              first_spec.guesser_class, '')
    wrapped_guesser = ESGuesserWrapper(
        ElasticSearchWikidataGuesser.load(guesser_dir))
    machine_agent = GuesserBuzzerAgent(wrapped_guesser, buzzer)

    # Human agent.
    human_agent = HumanAgent()

    # Hooks, in the order they are handed to the game.
    hooks = [
        hook.NotifyBuzzingHook,
        hook.GameInterfaceHook,
        hook.VisualizeGuesserBuzzerHook(machine_agent),
        hook.HighlightHook,
    ]

    game = Game(dev_questions, [human_agent, machine_agent], hooks)
    game.run(10)
Ejemplo n.º 11
0
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Simulate games for a fold's questions and pickle the results.

    ``simulate_game`` is called once per question with the grouped guesses,
    the model's buzzes and the grouped protobowl records. Its two returned
    sequences are accumulated into the ``Possibility`` and ``Outcome``
    columns of a DataFrame, which is written to ``{fold}_protobowl.pkl``
    inside ``model.model_dir``. Returns ``None``.
    """
    buzzes = get_buzzes(model, fold)

    guesser_dir = AbstractGuesser.output_path(
        "qanta.guesser.rnn", "RnnGuesser", 0, "")
    guess_file = AbstractGuesser.guess_path(guesser_dir, fold, "char")
    with open(guess_file, "rb") as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby("qanta_id")

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = questions[fold]

    protobowl_by_qid = load_protobowl().groupby("qid")

    simulate = partial(simulate_game, guesses, buzzes, protobowl_by_qid)

    possibility, outcome = [], []
    for question in tqdm(questions):
        question_possibility, question_outcome = simulate(question)
        possibility.extend(question_possibility)
        outcome.extend(question_outcome)

    columns = {"Possibility": possibility, "Outcome": outcome}
    result_df = pd.DataFrame(columns)

    output_file = os.path.join(
        model.model_dir, "{}_protobowl.pkl".format(fold))
    with open(output_file, "wb") as f:
        pickle.dump(result_df, f)
Ejemplo n.º 12
0
Archivo: cli.py Proyecto: Pinafore/qb
def guesser_api(host, port, debug, guessers):
    """Start the multi-guesser web API, confirming first if debug is on.

    Args:
        host: Host passed through to ``multi_guesser_web_api``.
        port: Port passed through to ``multi_guesser_web_api``.
        debug: When truthy, the user must type ``yes`` at the prompt.
        guessers: Guessers passed through to ``multi_guesser_web_api``.

    Raises:
        ValueError: If debug mode is requested and not confirmed with ``yes``.
    """
    if debug:
        log.warning(
            'WARNING: debug mode can expose environment variables (AWS keys), NEVER use when API is exposed to  web')
        log.warning('Confirm that you would like to enable flask debugging')
        confirmation = input('yes/no:\n').strip()
        if confirmation != 'yes':
            # The message previously read "Most confirm ..." (typo).
            raise ValueError('Must confirm enabling debug mode')

    AbstractGuesser.multi_guesser_web_api(guessers, host=host, port=port, debug=debug)
Ejemplo n.º 13
0
 def output(self):
     """Return targets for the guesser report in PDF and pickle form."""
     report_names = ('guesser_report.pdf', 'guesser_report.pickle')
     return [
         LocalTarget(
             AbstractGuesser.output_path(self.guesser_module,
                                         self.guesser_class,
                                         report_name))
         for report_name in report_names
     ]
Ejemplo n.º 14
0
Archivo: guesser.py Proyecto: NPSDC/qb
    def run(self):
        """Copy params and guesses to the reporting dir, report, then clean up.

        Steps, in order: copy the params pickle and every guesses pickle to
        the reporting directory, create a report per fold, then remove the
        guesser output directory and the individual guesses files.
        """
        guesser_class = get_class(self.guesser_module, self.guesser_class)
        reporting_directory = AbstractGuesser.reporting_path(
            self.guesser_module, self.guesser_class, self.config_num, "")

        # In huge parameter sweeps on SLURM it is easy to accidentally run
        # out of /fs/ storage. Since only the results matter here, fetch them
        # and then delete the models. The regular GuesserReport can be used
        # when the model should be preserved.
        guesser_directory = AbstractGuesser.output_path(
            self.guesser_module, self.guesser_class, self.config_num, "")

        param_path = AbstractGuesser.output_path(
            self.guesser_module,
            self.guesser_class,
            self.config_num,
            "guesser_params.pickle",
        )

        if os.path.exists(c.QANTA_EXPO_DATASET_PATH):
            folds = [c.GUESSER_DEV_FOLD, c.GUESSER_TEST_FOLD, c.EXPO_FOLD]
        else:
            folds = [c.GUESSER_DEV_FOLD, c.GUESSER_TEST_FOLD]

        # Three guesses files per fold: char, full and first, in that order.
        guesses_files = [
            f"guesses_{kind}_{fold}.pickle"
            for fold in folds
            for kind in ("char", "full", "first")
        ]
        guesses_paths = [
            AbstractGuesser.output_path(self.guesser_module,
                                        self.guesser_class, self.config_num,
                                        file_name)
            for file_name in guesses_files
        ]

        # Params file first, then every guesses file.
        for source_path in [param_path] + guesses_paths:
            log.info(f'Running: "cp {source_path} {reporting_directory}"')
            shell(f"cp {source_path} {reporting_directory}")

        guesser_instance = guesser_class(self.config_num)
        for fold in folds:
            guesser_instance.create_report(reporting_directory, fold)

        log.info(f'Running: "rm -rf {guesser_directory}"')
        shell(f"rm -rf {guesser_directory}")
        for stale_path in guesses_paths:
            shell(f"rm -f {stale_path}")
Ejemplo n.º 15
0
def guesser_api(host, port, debug, guessers):
    """Launch the multi-guesser web API, guarding debug mode with a prompt.

    Args:
        host: Host forwarded to ``AbstractGuesser.multi_guesser_web_api``.
        port: Port forwarded to ``AbstractGuesser.multi_guesser_web_api``.
        debug: If truthy, warnings are logged and the user must answer
            ``yes`` on stdin before the API is started in debug mode.
        guessers: Guessers forwarded to the web API.

    Raises:
        ValueError: If debug mode is requested but the answer is not ``yes``.
    """
    if debug:
        log.warning(
            'WARNING: debug mode can expose environment variables (AWS keys), NEVER use when API is exposed to  web'
        )
        log.warning('Confirm that you would like to enable flask debugging')
        confirmation = input('yes/no:\n').strip()
        if confirmation != 'yes':
            # Fixed typo in the message ("Most" -> "Must").
            raise ValueError('Must confirm enabling debug mode')

    AbstractGuesser.multi_guesser_web_api(guessers,
                                          host=host,
                                          port=port,
                                          debug=debug)
Ejemplo n.º 16
0
Archivo: guesser.py Proyecto: NPSDC/qb
 def output(self):
     """Return a ``best.touch`` target for every enabled guesser."""
     return [
         LocalTarget(
             "output/guesser/best/"
             f"{g_spec.guesser_module}.{g_spec.guesser_class}/best.touch")
         for g_spec in AbstractGuesser.list_enabled_guessers()
     ]
Ejemplo n.º 17
0
Archivo: guesser.py Proyecto: NPSDC/qb
 def requires(self):
     """Yield a ``TrainGuesser`` task for each enabled guesser spec."""
     enabled_specs = AbstractGuesser.list_enabled_guessers()
     for spec in enabled_specs:
         task_kwargs = dict(
             guesser_module=spec.guesser_module,
             guesser_class=spec.guesser_class,
             dependency_module=spec.dependency_module,
             dependency_class=spec.dependency_class,
         )
         yield TrainGuesser(**task_kwargs)
Ejemplo n.º 18
0
    def output(self):
        """Return targets for the guesser directory, params file and model files.

        The model files are whatever ``guesser_class.files`` returns for the
        guesser's output directory.
        """
        def path_for(file_name):
            # Path of ``file_name`` inside this guesser's output directory.
            return AbstractGuesser.output_path(self.guesser_module,
                                               self.guesser_class, file_name)

        guesser_class = get_class(self.guesser_module, self.guesser_class)
        model_targets = [
            LocalTarget(model_file)
            for model_file in guesser_class.files(path_for(''))
        ]

        fixed_targets = [
            LocalTarget(path_for('')),
            LocalTarget(path_for('guesser_params.pickle')),
        ]
        return fixed_targets + model_targets
Ejemplo n.º 19
0
 def run(self):
     """Train the guesser, save it, and pickle its parameters.

     The wall-clock time spent fetching the training data and training is
     stored in the parameters under ``training_time``.
     """
     guesser_class = get_class(self.guesser_module, self.guesser_class)
     guesser_instance = guesser_class()  # type: AbstractGuesser
     qb_dataset = guesser_instance.qb_dataset()

     started_at = time.time()
     guesser_instance.train(qb_dataset.training_data())
     training_time = time.time() - started_at

     model_dir = AbstractGuesser.output_path(self.guesser_module,
                                             self.guesser_class, '')
     guesser_instance.save(model_dir)

     params = guesser_instance.parameters()
     params['training_time'] = training_time
     params_path = AbstractGuesser.output_path(self.guesser_module,
                                               self.guesser_class,
                                               'guesser_params.pickle')
     with open(params_path, 'wb') as params_file:
         pickle.dump(params, params_file)
Ejemplo n.º 20
0
 def requires(self):
     """Yield the single-guesser reports task, then an 'expo' guess task per guesser."""
     yield AllSingleGuesserReports()
     yield from (
         GenerateGuesses(guesser_module=spec.guesser_module,
                         guesser_class=spec.guesser_class,
                         dependency_module=spec.dependency_module,
                         dependency_class=spec.dependency_class,
                         fold='expo')
         for spec in AbstractGuesser.list_enabled_guessers()
     )
Ejemplo n.º 21
0
Archivo: guesser.py Proyecto: NPSDC/qb
 def output(self):
     """Return the target for this fold's guesser report pickle."""
     report_name = f"guesser_report_{self.fold}.pickle"
     report_path = AbstractGuesser.output_path(
         self.guesser_module,
         self.guesser_class,
         self.config_num,
         report_name,
     )
     return LocalTarget(report_path)
Ejemplo n.º 22
0
Archivo: guesser.py Proyecto: NPSDC/qb
    def run(self):
        """Generate and save char, full-question and first-sentence guesses.

        The ``char_skip`` value comes from ``conf`` and depends on the fold:
        guesser train/dev folds, the expo fold, and everything else each use
        their own key. Every pass is timed and logged.
        """
        guesser_class = get_class(self.guesser_module, self.guesser_class)
        guesser_directory = AbstractGuesser.output_path(
            self.guesser_module, self.guesser_class, self.config_num, "")
        guesser_instance = guesser_class.load(
            guesser_directory)  # type: AbstractGuesser

        if self.fold in {c.GUESSER_TRAIN_FOLD, c.GUESSER_DEV_FOLD}:
            skip_key = "guesser_char_skip"
        elif self.fold == c.EXPO_FOLD:
            skip_key = "expo_char_skip"
        else:
            skip_key = "buzzer_char_skip"
        char_skip = conf[skip_key]

        # (output type, extra generate_guesses kwargs, announcement) per pass.
        passes = (
            (
                "char",
                {"char_skip": char_skip},
                f"Generating and saving guesses for {self.fold} fold with char_skip={char_skip}...",
            ),
            (
                "full",
                {"full_question": True},
                f"Generating and saving guesses for {self.fold} fold with full question...",
            ),
            (
                "first",
                {"first_sentence": True},
                f"Generating and saving guesses for {self.fold} fold with first sentence",
            ),
        )

        for output_type, guess_kwargs, announcement in passes:
            log.info(announcement)
            start_time = time.time()
            guess_df = guesser_instance.generate_guesses(
                self.n_guesses, [self.fold], **guess_kwargs)
            elapsed = time.time() - start_time
            log.info(
                f"Guessing on {self.fold} fold took {elapsed}s, saving guesses...")
            guesser_class.save_guesses(guess_df, guesser_directory,
                                       [self.fold], output_type)
            log.info("Done saving guesses")
Ejemplo n.º 23
0
 def requires(self):
     """Yield a ``GenerateGuesses`` task for every enabled guesser and fold."""
     for spec in AbstractGuesser.list_enabled_guessers():
         spec_kwargs = dict(
             guesser_module=spec.guesser_module,
             guesser_class=spec.guesser_class,
             dependency_module=spec.dependency_module,
             dependency_class=spec.dependency_class,
         )
         for generation_fold in c.GUESSER_GENERATION_FOLDS:
             yield GenerateGuesses(fold=generation_fold, **spec_kwargs)
Ejemplo n.º 24
0
 def requires(self):
     """Yield a ``GuesserPerformance`` task for each enabled guesser spec."""
     enabled_specs = AbstractGuesser.list_enabled_guessers()
     for spec in enabled_specs:
         task = GuesserPerformance(
             guesser_module=spec.guesser_module,
             guesser_class=spec.guesser_class,
             dependency_module=spec.dependency_module,
             dependency_class=spec.dependency_class,
             config_num=spec.config_num,
         )
         yield task
Ejemplo n.º 25
0
Archivo: test.py Proyecto: nadesai/qb
def test_buzzer():
    """Print the buzzer's output for each growing prefix of one question.

    The question is the fifth key of the question database; the clue at
    step ``i`` is its first ``i`` whitespace-separated words.
    """
    questions = QuestionDatabase().all_questions()
    buzzer = RNNBuzzer(word_skip=conf['buzzer_word_skip'])

    # Machine agent: first enabled guesser, loaded and wrapped.
    first_spec = AbstractGuesser.list_enabled_guessers()[0]
    guesser_dir = AbstractGuesser.output_path(first_spec.guesser_module,
                                              first_spec.guesser_class, '')
    guesser = ESGuesserWrapper(ElasticSearchWikidataGuesser.load(guesser_dir))

    chosen_key = list(questions.keys())[4]
    words = questions[chosen_key].flatten_text().split()
    for prefix_len in range(len(words)):
        # Prefix length starts at 0, so the first clue is the empty string.
        clue = ' '.join(words[:prefix_len])
        print(buzzer.buzz(guesser.guess(clue)))
Ejemplo n.º 26
0
    def run(self):
        """Generate and save guesses in char, full and first-sentence modes.

        ``char_skip`` is read from ``conf``: the guesser key for the guesser
        train/dev folds, the buzzer key for any other fold.
        """
        guesser_class = get_class(self.guesser_module, self.guesser_class)
        guesser_directory = AbstractGuesser.output_path(
            self.guesser_module, self.guesser_class, self.config_num, '')
        guesser_instance = guesser_class.load(
            guesser_directory)  # type: AbstractGuesser

        is_guesser_fold = self.fold in {c.GUESSER_TRAIN_FOLD,
                                        c.GUESSER_DEV_FOLD}
        if is_guesser_fold:
            char_skip = conf['guesser_char_skip']
        else:
            char_skip = conf['buzzer_char_skip']

        def generate_and_save(output_type, **guess_kwargs):
            # Time one generation pass, then save it under ``output_type``.
            start_time = time.time()
            guess_df = guesser_instance.generate_guesses(
                self.n_guesses, [self.fold], **guess_kwargs)
            elapsed = time.time() - start_time
            log.info(
                f'Guessing on {self.fold} fold took {elapsed}s, saving guesses...')
            guesser_class.save_guesses(guess_df, guesser_directory,
                                       [self.fold], output_type)
            log.info('Done saving guesses')

        log.info(
            f'Generating and saving guesses for {self.fold} fold with char_skip={char_skip}...'
        )
        generate_and_save('char', char_skip=char_skip)

        log.info(
            f'Generating and saving guesses for {self.fold} fold with full question...'
        )
        generate_and_save('full', full_question=True)

        log.info(
            f'Generating and saving guesses for {self.fold} fold with first sentence'
        )
        generate_and_save('first', first_sentence=True)
Ejemplo n.º 27
0
Archivo: eval.py Proyecto: Pinafore/qb
def ew(model, fold=BUZZER_DEV_FOLD):
    """Score the model's buzzes with ``CurveScore`` and report the totals.

    For every question in the model's buzzes, an answer list is built with
    one entry per char index: the first guess at that index and whether
    ``scores[0] < scores[1]``. Each list is scored with
    ``CurveScore.score`` and ``CurveScore.score_optimal``.

    Returns:
        A dict with ``expected_wins``, ``n_examples`` and
        ``expected_wins_optimal``; it is also printed as JSON.
    """
    buzzes = get_buzzes(model, fold)

    rnn_dir = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(rnn_dir, fold, 'char')
    with open(guesses_path, 'rb') as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby('qanta_id')

    answers = {}
    for qid, bs in buzzes.items():
        by_char_index = guesses.get_group(qid).groupby('char_index')
        answers[qid] = [
            {
                'char_index': char_index,
                'guess': by_char_index.get_group(char_index)
                .head(1)['guess'].values[0],
                'buzz': scores[0] < scores[1],
            }
            for char_index, scores in zip(*bs)
        ]

    fold_questions = QuizBowlDataset(
        buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    curve_score = CurveScore()
    wins = []
    wins_optimal = []
    for qid, answer in answers.items():
        question = questions[qid]
        q = {'text': question.text, 'page': question.page}
        wins.append(curve_score.score(answer, q))
        wins_optimal.append(curve_score.score_optimal(answer, q))

    eval_out = {
        'expected_wins': sum(wins),
        'n_examples': len(wins),
        'expected_wins_optimal': sum(wins_optimal),
    }
    print(json.dumps(eval_out))
    return eval_out
Ejemplo n.º 28
0
def ew(model, fold=BUZZER_DEV_FOLD):
    """Compute summed ``CurveScore`` results for the model's buzzes on a fold.

    Returns (and prints as JSON) a dict holding the summed score, the number
    of scored questions, and the summed optimal score.
    """
    buzzes = get_buzzes(model, fold)

    guesser_dir = AbstractGuesser.output_path('qanta.guesser.rnn',
                                              'RnnGuesser', 0, '')
    guess_file = AbstractGuesser.guess_path(guesser_dir, fold, 'char')
    with open(guess_file, 'rb') as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby('qanta_id')

    def first_guess(groups, char_index):
        # Guess in the first row recorded for this char index.
        return groups.get_group(char_index).head(1)['guess'].values[0]

    answers = dict()
    for qid, bs in buzzes.items():
        groups = guesses.get_group(qid).groupby('char_index')
        question_answers = []
        for char_index, scores in zip(*bs):
            entry = {
                'char_index': char_index,
                'guess': first_guess(groups, char_index),
                'buzz': scores[0] < scores[1],
            }
            question_answers.append(entry)
        answers[qid] = question_answers

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions[fold]}

    curve_score = CurveScore()
    scores_actual = []
    scores_optimal = []
    for qid, answer in answers.items():
        source = questions[qid]
        q = {'text': source.text, 'page': source.page}
        scores_actual.append(curve_score.score(answer, q))
        scores_optimal.append(curve_score.score_optimal(answer, q))

    eval_out = {
        'expected_wins': sum(scores_actual),
        'n_examples': len(scores_actual),
        'expected_wins_optimal': sum(scores_optimal),
    }
    print(json.dumps(eval_out))
    return eval_out
Ejemplo n.º 29
0
 def output(self):
     """Return targets for this fold's char, full and first guesses pickles."""
     return [
         LocalTarget(
             AbstractGuesser.output_path(
                 self.guesser_module,
                 self.guesser_class,
                 self.config_num,
                 f'guesses_{kind}_{self.fold}.pickle'))
         for kind in ('char', 'full', 'first')
     ]
Ejemplo n.º 30
0
Archivo: util.py Proyecto: Agnon1573/qb
def load_quizbowl(folds=c.BUZZER_INPUT_FOLDS) \
                    -> Tuple[Dict[str, int], Dict[str, list]]:
    """Load the option index and the processed guesses for each fold.

    The option list is read from ``bc.OPTIONS_DIR`` when that file exists;
    otherwise it is built from the second element of the training data and
    written there. Per fold, processed guesses are read from a
    ``<fold>_processed.pickle`` cache when present; otherwise the guesses
    are loaded, processed per ``qnum`` group, filtered of ``None`` results
    and cached.

    Returns:
        ``(option2id, guesses_by_fold)``: a mapping from option to its index
        in the option list, and a mapping from fold to its processed guesses.
    """
    # merge_dfs()
    log.info('Loading data')
    question_db = QuestionDatabase()
    quizbowl_db = QuizBowlDataset(bc.MIN_ANSWERS,
                                  guesser_train=True,
                                  buzzer_train=True)
    all_questions = question_db.all_questions()

    if os.path.isfile(bc.OPTIONS_DIR):
        with open(safe_path(bc.OPTIONS_DIR), 'rb') as infile:
            id2option = pickle.load(infile)
    else:
        log.info('Loading the set of options')
        id2option = list(set(quizbowl_db.training_data()[1]))
        with open(safe_path(bc.OPTIONS_DIR), 'wb') as outfile:
            pickle.dump(id2option, outfile)

    option2id = {option: index for index, option in enumerate(id2option)}
    num_options = len(id2option)
    log.info('Number of options {0}'.format(num_options))

    guesses_by_fold = dict()
    for fold in folds:
        save_dir = os.path.join(bc.GUESSES_DIR, fold) + '_processed.pickle'

        if os.path.isfile(save_dir):
            # Cached result from an earlier run.
            with open(safe_path(save_dir), 'rb') as infile:
                guesses_by_fold[fold] = pickle.load(infile)
            log.info('Loading {0} guesses'.format(fold))
        else:
            log.info('Processing {0} guesses'.format(fold))
            guesses = AbstractGuesser.load_guesses(bc.GUESSES_DIR,
                                                   folds=[fold])

            worker = partial(_process_question, option2id, all_questions)
            processed = _multiprocess(worker,
                                      guesses.groupby('qnum'),
                                      info='df data',
                                      multi=True)
            guesses_by_fold[fold] = [
                item for item in processed if item is not None
            ]
            print(len(guesses_by_fold[fold]))

            with open(safe_path(save_dir), 'wb') as outfile:
                pickle.dump(guesses_by_fold[fold], outfile)

            log.info('Processed {0} guesses saved to {1}'.format(
                fold, save_dir))

    return option2id, guesses_by_fold
Ejemplo n.º 31
0
Archivo: guesser.py Proyecto: NPSDC/qb
 def run(self):
     """Mark each guesser's best config and copy it into the ``best`` directory."""
     guesser_types = {
         f"{spec.guesser_module}.{spec.guesser_class}"
         for spec in AbstractGuesser.list_enabled_guessers()
     }
     # Only the third of the four merged results is used here.
     _, _, all_dfs, _ = merge_reports(guesser_types)
     best_guessers = find_best_guessers(all_dfs)
     for guesser_name, config_num in best_guessers.items():
         source_dir = f"output/guesser/{guesser_name}/{config_num}"
         best_dir = f"output/guesser/best/{guesser_name}/"
         commands = (
             f"touch {source_dir}/best.touch",
             f"mkdir -p {best_dir}",
             f"cp -r {source_dir}/* {best_dir}",
         )
         for command in commands:
             shell(command)
Ejemplo n.º 32
0
Archivo: guesser.py Proyecto: NPSDC/qb
 def run(self):
     """Train the configured guesser, save it, and pickle its parameters.

     The elapsed wall-clock time of fetching the training data and training
     is added to the parameters as ``training_time``.
     """
     def path_for(file_name):
         # Path of ``file_name`` inside this guesser config's output directory.
         return AbstractGuesser.output_path(self.guesser_module,
                                            self.guesser_class,
                                            self.config_num, file_name)

     guesser_class = get_class(self.guesser_module, self.guesser_class)
     guesser_instance = guesser_class(
         self.config_num)  # type: AbstractGuesser
     qb_dataset = guesser_instance.qb_dataset()

     started_at = time.time()
     guesser_instance.train(qb_dataset.training_data())
     finished_at = time.time()

     guesser_instance.save(path_for(""))

     params = guesser_instance.parameters()
     params["training_time"] = finished_at - started_at
     with open(path_for("guesser_params.pickle"), "wb") as params_file:
         pickle.dump(params, params_file)
Ejemplo n.º 33
0
def ew(model, fold=BUZZER_DEV_FOLD):
    """Score a buzzer's decisions on one fold with ``CurveScore``.

    For every question in the buzzes returned by ``get_buzzes`` the first
    guess at each character index of the RnnGuesser (config 0) char-level
    guesses is paired with the buzz decision ``scores[0] < scores[1]``.
    Each question's answer list is then scored with ``CurveScore.score``
    and ``CurveScore.score_optimal``.

    Returns:
        A dict with the summed ``expected_wins``, the summed
        ``expected_wins_optimal`` and ``n_examples``; the same dict is
        also printed as JSON.
    """
    buzzes = get_buzzes(model, fold)

    rnn_output_dir = AbstractGuesser.output_path(
        "qanta.guesser.rnn", "RnnGuesser", 0, "")
    guess_file = AbstractGuesser.guess_path(rnn_output_dir, fold, "char")
    with open(guess_file, "rb") as f:
        guess_df = pickle.load(f)
    guesses_by_qid = guess_df.groupby("qanta_id")

    answers = {}
    for qid, buzz_record in buzzes.items():
        by_position = guesses_by_qid.get_group(qid).groupby("char_index")
        answers[qid] = [
            {
                "char_index": char_index,
                # Only the first row of each position group is used.
                "guess": by_position.get_group(char_index).head(1)["guess"].values[0],
                "buzz": scores[0] < scores[1],
            }
            for char_index, scores in zip(*buzz_record)
        ]

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    scorer = CurveScore()
    wins = []
    optimal_wins = []
    for qid, answer in answers.items():
        question = questions[qid]
        q = {"text": question.text, "page": question.page}
        wins.append(scorer.score(answer, q))
        optimal_wins.append(scorer.score_optimal(answer, q))

    eval_out = {
        "expected_wins": sum(wins),
        "n_examples": len(wins),
        "expected_wins_optimal": sum(optimal_wins),
    }
    print(json.dumps(eval_out))
    return eval_out
Ejemplo n.º 34
0
def task_list():
    """Build one ``Task`` per (qnum, sentence, token) group of guesses.

    All guesses are loaded, reduced to the qnum/sentence/token/guess/fold
    columns and de-duplicated on everything except ``fold``. Each group is
    paired with its question looked up by ``qnum``.

    Returns:
        A list of ``Task`` objects, one per group.
    """
    all_guesses = AbstractGuesser.load_all_guesses()
    question_map = QuestionDatabase().all_questions()

    position_cols = ['qnum', 'sentence', 'token']
    deduped = all_guesses[position_cols + ['guess', 'fold']].drop_duplicates(
        position_cols + ['guess'])

    return [
        Task(question_map[position[0]], guesses)
        for position, guesses in deduped.groupby(position_cols)
    ]
    def __init__(self, buzzer_model_dir='data/neo_0.npz'):
        """Load the guesser, pick a buzzer, and reset the per-question state.

        Args:
            buzzer_model_dir: model path handed to ``RNNBuzzer``; only used
                when ``chainer.cuda.available`` is true, otherwise a
                ``StupidBuzzer`` is used instead.
        """
        # The first enabled guesser spec is looked up but not used below.
        _first_spec = AbstractGuesser.list_enabled_guessers()[0]
        self.guesser = ElasticSearchWikidataGuesser.load('data/guesser')

        self.buzzer = (
            RNNBuzzer(model_dir=buzzer_model_dir,
                      word_skip=conf['buzzer_word_skip'])
            if chainer.cuda.available
            else StupidBuzzer()
        )

        # Per-question state.
        self.ok_to_buzz = True
        self.answer = ''
        self.guesses = []
        self.evidence = {}
Ejemplo n.º 36
0
def main(folds, model_name):
    """Gather per-fold buzzer statistics and hand them to ``report``.

    Args:
        folds: iterable of fold names; each one is processed in turn.
        model_name: substituted, together with the fold, into
            ``bc.BUZZES_DIR`` to locate the pickled buzzes.

    For every fold the guesses and buzzes are loaded, the top guesses per
    question are computed with ``_get_top_guesses`` and passed to
    ``get_his_stats`` and ``get_protobowl``, which receive the shared
    ``variables`` mapping (feature -> fold -> value).
    """
    all_questions = QuestionDatabase().all_questions()
    answers = {}
    question_texts = {}
    protobowl_ids = {}
    for qnum, question in all_questions.items():
        answers[qnum] = question.page
        question_texts[qnum] = question.text
        # Only questions with a non-empty protobowl id are kept.
        if question.protobowl != '':
            protobowl_ids[qnum] = question.protobowl
    protobowl_df = load_protobowl().groupby('qid')

    save_dir = 'output/summary/new_performance/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # feature -> fold -> value
    variables = defaultdict(defaultdict)
    for fold in folds:
        guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[fold])
        questions = guesses_df.groupby('qnum')

        buzzes_path = bc.BUZZES_DIR.format(fold, model_name)
        with open(buzzes_path, 'rb') as infile:
            buzzes = pickle.load(infile)
        log.info('Buzzes loaded from {}.'.format(buzzes_path))

        # qnum -> n_guessers * length
        top_guesses = dict(_multiprocess(
            _get_top_guesses, questions, info='Top guesses', multi=True))
        inputs = [top_guesses, buzzes, answers, variables, fold, save_dir]

        # get_eop_stats and get_hyper_search take the same inputs but are
        # currently disabled.
        get_his_stats(*inputs)

        # get_protobowl receives its arguments as one list, not unpacked.
        get_protobowl(
            [question_texts, protobowl_ids, protobowl_df, questions] + inputs)

    # Convert the nested defaultdicts into plain dicts before reporting.
    variables = {key: dict(value) for key, value in variables.items()}

    report(variables, save_dir, folds)
Ejemplo n.º 37
0
Archivo: cli.py Proyecto: Pinafore/qb
# Settings resolved through get_slurm_config_value for every generated
# script, listed in the order the lookups are performed.
_SLURM_SETTING_KEYS = (
    'partition',
    'qos',
    'mem_per_cpu',
    'gres',
    'max_time',
    'cpus_per_task',
    'account',
)


def _resolve_slurm_settings(default_slurm_config, guesser_slurm_config):
    """Return a dict with one get_slurm_config_value lookup per slurm setting."""
    return {
        key: get_slurm_config_value(key, default_slurm_config, guesser_slurm_config)
        for key in _SLURM_SETTING_KEYS
    }


def generate_guesser_slurm(slurm_config_file, task, output_dir):
    """Render one slurm script per enabled guesser plus a master script.

    Args:
        slurm_config_file: path to a YAML file with a ``default`` section
            and optional sections keyed by guesser class name.
        task: task name passed to the per-guesser template; when it is
            ``'GuesserReport'`` the template also receives
            ``GUESSER_GENERATION_FOLDS`` as ``folds`` (otherwise an empty
            list).
        output_dir: directory that receives ``slurm-<i>.sh`` for each
            enabled guesser, a copy of ``guesser-singleton.sh`` and
            ``slurm-master.sh``.

    Raises:
        ValueError: if an enabled guesser is ``ElasticSearchGuesser``.

    The master script is rendered with the settings resolved for the last
    enabled guesser. When no guesser is enabled, the settings resolved
    from the default section alone are used.
    """
    with open(slurm_config_file) as f:
        # safe_load parses plain configuration data and, unlike a bare
        # yaml.load(f), does not depend on an explicit Loader argument.
        slurm_config = yaml.safe_load(f)
    default_slurm_config = slurm_config['default']

    env = Environment(loader=PackageLoader('qanta', 'slurm/templates'))
    template = env.get_template('guesser-luigi-template.sh')
    enabled_guessers = list(AbstractGuesser.list_enabled_guessers())

    settings = None
    slurm_files = []
    for i, gs in enumerate(enabled_guessers):
        if gs.guesser_class == 'ElasticSearchGuesser':
            raise ValueError('ElasticSearchGuesser is not compatible with slurm')
        # None when the config file has no section for this guesser class.
        guesser_slurm_config = slurm_config.get(gs.guesser_class)
        settings = _resolve_slurm_settings(default_slurm_config, guesser_slurm_config)

        folds = GUESSER_GENERATION_FOLDS if task == 'GuesserReport' else []
        script = template.render({
            'task': task,
            'guesser_module': gs.guesser_module,
            'guesser_class': gs.guesser_class,
            'dependency_module': gs.dependency_module,
            'dependency_class': gs.dependency_class,
            'config_num': gs.config_num,
            'folds': folds,
            **settings,
        })
        slurm_file = path.join(output_dir, f'slurm-{i}.sh')
        with safe_open(slurm_file, 'w') as f:
            f.write(script)
        slurm_files.append(slurm_file)

    if settings is None:
        # No guesser was enabled, so the loop never resolved any settings;
        # use the default section so the master script can still be rendered.
        settings = _resolve_slurm_settings(default_slurm_config, None)

    singleton_path = 'qanta/slurm/templates/guesser-singleton.sh'
    singleton_output = path.join(output_dir, 'guesser-singleton.sh')
    shell(f'cp {singleton_path} {singleton_output}')

    master_template = env.get_template('guesser-master-template.sh')
    master_script = master_template.render({
        'script_list': slurm_files + [singleton_output],
        **settings,
    })
    with safe_open(path.join(output_dir, 'slurm-master.sh'), 'w') as f:
        f.write(master_script)
Ejemplo n.º 38
0
from tqdm import tqdm
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl import DocType, Text, Keyword, Search, Index
from qanta.util.constants import GUESSER_DEV_FOLD
from qanta.guesser.abstract import AbstractGuesser
from qanta.datasets.quiz_bowl import QuizBowlDataset
from qanta.guesser.experimental.elasticsearch_instance_of import ElasticSearchWikidataGuesser
from qanta.guesser.experimental.elasticsearch_instance_of import ElasticSearchIndex

# Presumably the name of the Elasticsearch index used by this script; it is
# not referenced in the visible part of the file — TODO confirm.
INDEX_NAME = 'qb_ir_instance_of'

# Module-level setup: runs at import time. The first enabled guesser spec
# determines the directory from which the guesser is loaded.
gspec = AbstractGuesser.list_enabled_guessers()[0]
# NOTE(review): output_path is called here with three arguments
# (module, class, file name); confirm this matches the signature in use.
guesser_dir = AbstractGuesser.output_path(gspec.guesser_module,
        gspec.guesser_class, '')
guesser = ElasticSearchWikidataGuesser.load(guesser_dir)
es_index = ElasticSearchIndex()

def recursive_guess(question, k=0):
    """Print a first round of guesses, then re-query with guesses appended.

    Args:
        question: question text; it is extended locally during the second
            round.
        k: number of first-round guesses to fold back into the query. With
            the default of 0 the second-round loop does not run.

    NOTE(review): the visible body ends inside the second-round loop; it
    never uses ``new_guesses`` or ``guesses`` and returns nothing. The
    snippet looks truncated — confirm against the full source.
    """
    # Predicted class and its probability for this single question.
    p_class, p_prob = guesser.test_instance_of([question])[0]
    first_guesses = search_not(question, p_class)
    print('First round')
    for x in first_guesses:
        print(x)
    print()

    print('Second round')
    new_guesses = []
    for i in range(k):
        # First element of the i-th first-round result is the guess string.
        guess = first_guesses[i][0]
        # Append the guess with underscores turned into spaces; the query
        # grows cumulatively across iterations.
        question += ' ' + ' '.join(guess.split('_'))
        # NOTE(review): meaning of the 0.6 argument is not visible here.
        guesses = es_index.search(question, p_class, p_prob, 0.6)
Ejemplo n.º 39
-1
Archivo: util.py Proyecto: Pinafore/qb
def read_data(
        fold,
        output_type='char',
        guesser_module='qanta.guesser.rnn',
        guesser_class='RnnGuesser',
        guesser_config_num=0,
        vector_converter=vector_converter_0):
    """Load the processed dataset for ``fold``, building and caching it if needed.

    Args:
        fold: fold name; substituted into ``dataset_dir`` to locate the
            cache file and used to select guesses and questions.
        output_type: passed to ``AbstractGuesser.guess_path`` to select
            the guess file.
        guesser_module: module of the guesser whose guesses are read.
        guesser_class: class name of that guesser.
        guesser_config_num: config number of that guesser.
        vector_converter: passed through to ``process_question``.

    Returns:
        The unpickled cache content when the cache file exists; otherwise
        the list produced by mapping ``process_question`` over the guesses
        grouped by ``qanta_id``, which is also pickled to the cache file.
    """
    cache_path = dataset_dir.format(fold)
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    g_dir = AbstractGuesser.output_path(
        guesser_module, guesser_class, guesser_config_num, '')
    g_path = AbstractGuesser.guess_path(g_dir, fold, output_type)
    with open(g_path, 'rb') as f:
        df = pickle.load(f)
    df_groups = df.groupby('qanta_id')

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions[fold]}

    worker = partial(process_question, questions, vector_converter)
    # Use the pool as a context manager so its worker processes are shut
    # down once map() has returned instead of being left running.
    with Pool(8) as pool:
        dataset = pool.map(worker, df_groups)

    with open(cache_path, 'wb') as f:
        pickle.dump(dataset, f)

    return dataset
Ejemplo n.º 40
-2
Archivo: eval.py Proyecto: Pinafore/qb
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Simulate games for every question of a fold and pickle the outcomes.

    Args:
        model: buzzer model; passed to ``get_buzzes`` and its ``model_dir``
            attribute decides where the result file is written.
        fold: fold whose guesses, buzzes and questions are used.

    The per-question ``simulate_game`` results are concatenated into
    ``Possibility`` and ``Outcome`` columns of a DataFrame, which is
    written to ``<model.model_dir>/<fold>_protobowl.pkl``.
    """
    buzzes = get_buzzes(model, fold)

    '''eval'''
    # Char-level guesses of RnnGuesser config 0, grouped per question.
    guesses_dir = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_dir = AbstractGuesser.guess_path(guesses_dir, fold, 'char')
    with open(guesses_dir, 'rb') as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby('qanta_id')

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = questions[fold]

    df = load_protobowl()
    df = df.groupby('qid')

    # Bind the shared inputs so the worker only needs the question.
    worker = partial(simulate_game, guesses, buzzes, df)

    possibility = []
    outcome = []
    for question in tqdm(questions):
        # Each call returns two lists that are appended to the running totals.
        pos, out = worker(question)
        possibility += pos
        outcome += out

    result_df = pd.DataFrame({
        'Possibility': possibility,
        'Outcome': outcome,
    })

    result_dir = os.path.join(
        model.model_dir, '{}_protobowl.pkl'.format(fold))
    with open(result_dir, 'wb') as f:
        pickle.dump(result_df, f)