コード例 #1
0
ファイル: util.py プロジェクト: Agnon1573/qb
def merge_dfs():
    """Merge the per-guesser guess DataFrames into one DataFrame per fold.

    For every fold in ``c.BUZZER_INPUT_FOLDS`` the guesses of each enabled
    guesser are loaded from ``<c.GUESSER_TARGET_PREFIX>/<module>.<class>``,
    stacked, and saved under ``<c.GUESSER_TARGET_PREFIX>/merged``. A fold
    whose merged output already exists is skipped.
    """
    guesser_names = [
        "{0}.{1}".format(x.guesser_module, x.guesser_class)
        for x in AbstractGuesser.list_enabled_guessers()
    ]
    log.info("Merging guesser DataFrames.")
    merged_dir = os.path.join(c.GUESSER_TARGET_PREFIX, 'merged')
    os.makedirs(merged_dir, exist_ok=True)
    for fold in c.BUZZER_INPUT_FOLDS:
        if os.path.exists(AbstractGuesser.guess_path(merged_dir, fold)):
            log.info("Merged {0} exists, skipping.".format(fold))
            continue
        # Lead with an empty, typed frame so the merged output carries the
        # expected columns (in this order) even when no guesser is enabled.
        frames = [
            pd.DataFrame(columns=[
                'fold', 'guess', 'guesser', 'qnum', 'score', 'sentence',
                'token'
            ],
                         dtype='object')
        ]
        for guesser in guesser_names:
            guesser_dir = os.path.join(c.GUESSER_TARGET_PREFIX, guesser)
            frames.append(
                AbstractGuesser.load_guesses(guesser_dir, folds=[fold]))
        # One concat instead of repeated DataFrame.append: append was removed
        # in pandas 2.0 and re-copied the accumulated frame on every call.
        new_guesses = pd.concat(frames)
        for col in ['qnum', 'sentence', 'token', 'score']:
            new_guesses[col] = pd.to_numeric(new_guesses[col],
                                             downcast='integer')
        AbstractGuesser.save_guesses(new_guesses, merged_dir, folds=[fold])
        log.info("Merging: {0} finished.".format(fold))
コード例 #2
0
ファイル: util.py プロジェクト: Agnon1573/qb
def load_quizbowl(folds=c.BUZZER_INPUT_FOLDS) \
                    -> Tuple[Dict[str, int], Dict[str, list]]:
    """Load the option vocabulary and the processed guesses for each fold.

    The option list is read from ``bc.OPTIONS_DIR`` when that file exists;
    otherwise it is built from ``QuizBowlDataset.training_data()[1]`` and
    pickled there. Per-fold results are cached in
    ``<bc.GUESSES_DIR>/<fold>_processed.pickle`` and reused when present.

    Args:
        folds: fold names to load; defaults to ``c.BUZZER_INPUT_FOLDS``.

    Returns:
        A pair ``(option2id, guesses_by_fold)``: ``option2id`` maps each
        option to its index in the option list, and ``guesses_by_fold`` maps
        each fold to the list of non-``None`` results produced by
        ``_process_question`` for that fold's guesses grouped by ``qnum``.
    """
    log.info('Loading data')
    question_db = QuestionDatabase()
    quizbowl_db = QuizBowlDataset(bc.MIN_ANSWERS,
                                  guesser_train=True,
                                  buzzer_train=True)
    all_questions = question_db.all_questions()

    if os.path.isfile(bc.OPTIONS_DIR):
        with open(safe_path(bc.OPTIONS_DIR), 'rb') as infile:
            id2option = pickle.load(infile)
    else:
        log.info('Loading the set of options')
        # Set iteration order is arbitrary; pickling the resulting list is
        # what keeps the option ids stable across later runs.
        id2option = list(set(quizbowl_db.training_data()[1]))
        with open(safe_path(bc.OPTIONS_DIR), 'wb') as outfile:
            pickle.dump(id2option, outfile)
    option2id = {o: i for i, o in enumerate(id2option)}
    log.info('Number of options {0}'.format(len(id2option)))

    guesses_by_fold = dict()
    for fold in folds:
        cache_path = '%s_processed.pickle' % (os.path.join(
            bc.GUESSES_DIR, fold))
        if os.path.isfile(cache_path):
            with open(safe_path(cache_path), 'rb') as infile:
                guesses_by_fold[fold] = pickle.load(infile)
            log.info('Loading {0} guesses'.format(fold))
            continue

        log.info('Processing {0} guesses'.format(fold))
        guesses = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[fold])

        worker = partial(_process_question, option2id, all_questions)
        processed = _multiprocess(worker,
                                  guesses.groupby('qnum'),
                                  info='df data',
                                  multi=True)
        # Drop the questions for which the worker returned None.
        guesses_by_fold[fold] = [x for x in processed if x is not None]
        log.info('{0} questions kept for {1}'.format(
            len(guesses_by_fold[fold]), fold))

        with open(safe_path(cache_path), 'wb') as outfile:
            pickle.dump(guesses_by_fold[fold], outfile)

        log.info('Processed {0} guesses saved to {1}'.format(
            fold, cache_path))

    return option2id, guesses_by_fold
コード例 #3
0
ファイル: hyper_search.py プロジェクト: Agnon1573/qb
def hyper_search(fold):
    """Run the buzzer under every config from ``get_cfgs()`` and pickle the results.

    Args:
        fold: fold name handed to ``run``; also used in the output file name.

    Side effects:
        Writes a list of ``(cfg, buzzes)`` pairs to
        ``output/buzzer/cfg_buzzes_<fold>.pkl``.
    """
    import os

    option2id, all_guesses = load_quizbowl()

    out_path = 'output/buzzer/cfg_buzzes_{}.pkl'.format(fold)
    # Make sure the target directory exists before running the configs, so
    # the collected results are not lost to a failing open() at the end.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    cfg_buzzes = []
    for i, cfg in enumerate(get_cfgs()):
        print('**********{}**********'.format(i))
        buzzes = run(cfg, fold, all_guesses, option2id)
        cfg_buzzes.append((cfg, buzzes))

    with open(out_path, 'wb') as outfile:
        pickle.dump(cfg_buzzes, outfile)
コード例 #4
0
ファイル: new_performance.py プロジェクト: NPSDC/qb
def main(folds, model_name):
    """Collect buzzer statistics for each fold and hand them to ``report``.

    Args:
        folds: fold names to evaluate.
        model_name: buzzer model name; together with the fold it fills in
            ``bc.BUZZES_DIR`` to locate the pickled buzzes.
    """
    question_map = QuestionDatabase().all_questions()
    answers = {qid: q.page for qid, q in question_map.items()}
    question_texts = {qid: q.text for qid, q in question_map.items()}
    # Only questions that carry a non-empty protobowl id.
    protobowl_ids = {
        qid: q.protobowl
        for qid, q in question_map.items() if q.protobowl != ""
    }
    protobowl_df = load_protobowl().groupby("qid")

    save_dir = "output/summary/new_performance/"
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # feature -> fold -> value
    variables = defaultdict(defaultdict)
    for fold in folds:
        questions = AbstractGuesser.load_guesses(
            bc.GUESSES_DIR, folds=[fold]).groupby("qnum")

        buzzes_dir = bc.BUZZES_DIR.format(fold, model_name)
        with open(buzzes_dir, "rb") as infile:
            buzzes = pickle.load(infile)
        log.info("Buzzes loaded from {}.".format(buzzes_dir))

        # qnum -> n_guessers * length
        top_guesses = dict(
            _multiprocess(_get_top_guesses,
                          questions,
                          info="Top guesses",
                          multi=True))
        inputs = [top_guesses, buzzes, answers, variables, fold, save_dir]

        # get_eop_stats and get_hyper_search take the same inputs but are
        # currently disabled.
        get_his_stats(*inputs)

        # get_protobowl receives a single list: protobowl data, then inputs.
        get_protobowl(
            [question_texts, protobowl_ids, protobowl_df, questions] + inputs)

    # Convert the nested defaultdicts into plain dicts before reporting.
    variables = {key: dict(value) for key, value in variables.items()}

    report(variables, save_dir, folds)
コード例 #5
0
ファイル: new_performance.py プロジェクト: Pinafore/qb
def main(folds, model_name):
    """Gather per-fold buzzer statistics and pass them to ``report``.

    Args:
        folds: fold names to evaluate.
        model_name: buzzer model name; formatted into ``bc.BUZZES_DIR``
            along with the fold to find the pickled buzzes.
    """
    db_questions = QuestionDatabase().all_questions()
    answers = {}
    question_texts = {}
    for qid, question in db_questions.items():
        answers[qid] = question.page
        question_texts[qid] = question.text
    # Keep only the questions that have a non-empty protobowl id.
    protobowl_ids = {
        qid: question.protobowl
        for qid, question in db_questions.items()
        if question.protobowl != ''
    }
    protobowl_df = load_protobowl().groupby('qid')

    save_dir = 'output/summary/new_performance/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # feature -> fold -> value
    variables = defaultdict(defaultdict)
    for fold in folds:
        guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR,
                                                  folds=[fold])
        questions = guesses_df.groupby('qnum')

        buzzes_dir = bc.BUZZES_DIR.format(fold, model_name)
        with open(buzzes_dir, 'rb') as infile:
            buzzes = pickle.load(infile)
        log.info('Buzzes loaded from {}.'.format(buzzes_dir))

        # qnum -> n_guessers * length
        guess_pairs = _multiprocess(_get_top_guesses,
                                    questions,
                                    info='Top guesses',
                                    multi=True)
        top_guesses = dict(guess_pairs)
        inputs = [top_guesses, buzzes, answers, variables, fold, save_dir]

        # get_eop_stats / get_hyper_search accept the same inputs; both are
        # switched off here.
        get_his_stats(*inputs)

        # get_protobowl is called with one list argument.
        p_inputs = [question_texts, protobowl_ids, protobowl_df, questions]
        get_protobowl(p_inputs + inputs)

    # Plain dicts all the way down before reporting.
    variables = {key: dict(value) for key, value in variables.items()}

    report(variables, save_dir, folds)
コード例 #6
0
ファイル: test.py プロジェクト: nadesai/qb
def generate(config, folds):
    """Generate and save buzzes for each fold using a saved RNN model.

    Args:
        config: name of a config factory in ``configs``; it is called with
            no arguments to obtain the config object.
        folds: sequence of fold names. The iterator of the first fold
            supplies the model's input size.

    Raises:
        SystemExit: with status 0 when ``cfg.model_dir`` does not exist.
    """
    n_guessers = len(GUESSERS)
    option2id, all_guesses = load_quizbowl(folds)

    cfg = getattr(configs, config)()
    make_vector = getattr(iterator, cfg.make_vector)

    iterators = {
        fold: QuestionIterator(all_guesses[fold],
                               option2id,
                               batch_size=cfg.batch_size,
                               make_vector=make_vector)
        for fold in folds
    }

    if not os.path.exists(cfg.model_dir):
        log.info('Model {0} not available'.format(cfg.model_dir))
        # The builtin exit() is injected by the `site` module for interactive
        # use and may be missing; raise SystemExit directly (same status).
        raise SystemExit(0)

    model = RNN(iterators[folds[0]].n_input, cfg.n_hidden, n_guessers + 1)

    log.info('Loading model {0}'.format(cfg.model_dir))
    chainer.serializers.load_npz(cfg.model_dir, model)

    gpu = conf['buzzer']['gpu']
    # A gpu id of -1 means "stay on CPU".
    if gpu != -1 and chainer.cuda.available:
        log.info('Using gpu {0}'.format(gpu))
        chainer.cuda.get_device(gpu).use()
        model.to_gpu(gpu)

    trainer = Trainer(model, cfg.model_dir)

    for fold in folds:
        buzzes = trainer.test(iterators[fold])
        log.info('{0} buzzes generated. Size {1}.'.format(fold, len(buzzes)))
        buzzes_dir = bc.BUZZES_DIR.format(fold, cfg.model_name)
        with open(buzzes_dir, 'wb') as f:
            pickle.dump(buzzes, f)
        log.info('Buzzes saved to {0}.'.format(buzzes_dir))

        # The expo fold is additionally exported through buzzer2vwexpo.
        if fold == 'expo':
            guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR,
                                                      folds=[fold])
            buzzer2vwexpo(guesses_df, buzzes, fold)
コード例 #7
0
def report_ultimate():
    """Print protobowl statistics of ``ultimate_buzzer`` on the dev fold.

    Buzzes come from ``ultimate_buzzer`` run over ``c.BUZZER_DEV_FOLD``.
    ``get_protobowl`` is then evaluated once per threshold, each time on the
    protobowl rows whose ``user_answers`` exceeds that threshold.
    """
    dev_fold = c.BUZZER_DEV_FOLD

    question_map = QuestionDatabase().all_questions()
    answers = {qid: q.page for qid, q in question_map.items()}
    question_texts = {qid: q.text for qid, q in question_map.items()}
    # Only questions that carry a non-empty protobowl id.
    protobowl_ids = {
        qid: q.protobowl
        for qid, q in question_map.items() if q.protobowl != ''
    }
    # The second value returned by load_protobowl is not needed here.
    protobowl_df, _ = load_protobowl()

    questions = AbstractGuesser.load_guesses(
        bc.GUESSES_DIR, folds=[dev_fold]).groupby('qnum')
    top_guesses = dict(
        _multiprocess(_get_top_guesses,
                      questions,
                      info='Top guesses',
                      multi=True))

    option2id, all_guesses = load_quizbowl()
    test_iter = QuestionIterator(all_guesses[dev_fold],
                                 option2id,
                                 batch_size=128)
    buzzes = ultimate_buzzer(test_iter)

    save_dir = 'output/summary/new_performance/'
    # None fills the slot where a `variables` mapping would otherwise go.
    inputs = [top_guesses, buzzes, answers, None, dev_fold, save_dir]

    threshold_stats = []
    for threshold in (1, 10, 50, 100, 500, 1000, 2000):
        filtered = protobowl_df[protobowl_df.user_answers > threshold]
        pstats = get_protobowl([
            question_texts, protobowl_ids,
            filtered.groupby('qid'), questions
        ] + inputs)
        threshold_stats.append(pstats)
        print('ultimate', threshold, pstats)
    print('ultimate', [stats['reward'] for stats in threshold_stats])
コード例 #8
0
def report(buzzes_dir):
    """Evaluate pickled buzzes against protobowl data at several thresholds.

    Args:
        buzzes_dir: path of the pickle file holding the buzzes. The list of
            per-threshold statistics is pickled next to it, at
            ``buzzes_dir + '.pstats'``.
    """
    dev_fold = c.BUZZER_DEV_FOLD

    question_map = QuestionDatabase().all_questions()
    answers = {qid: q.page for qid, q in question_map.items()}
    question_texts = {qid: q.text for qid, q in question_map.items()}
    # Only questions that carry a non-empty protobowl id.
    protobowl_ids = {
        qid: q.protobowl
        for qid, q in question_map.items() if q.protobowl != ''
    }
    # The second value returned by load_protobowl is not needed here.
    protobowl_df, _ = load_protobowl()

    questions = AbstractGuesser.load_guesses(
        bc.GUESSES_DIR, folds=[dev_fold]).groupby('qnum')
    top_guesses = dict(
        _multiprocess(_get_top_guesses,
                      questions,
                      info='Top guesses',
                      multi=True))

    with open(buzzes_dir, 'rb') as infile:
        buzzes = pickle.load(infile)

    save_dir = 'output/summary/new_performance/'
    # None fills the slot where a `variables` mapping would otherwise go.
    inputs = [top_guesses, buzzes, answers, None, dev_fold, save_dir]

    threshold_stats = []
    for threshold in (1, 10, 50, 100, 500, 1000, 2000):
        # Keep the protobowl rows whose user_answers exceeds the threshold.
        filtered = protobowl_df[protobowl_df.user_answers > threshold]
        pstats = get_protobowl([
            question_texts, protobowl_ids,
            filtered.groupby('qid'), questions
        ] + inputs)
        threshold_stats.append(pstats)
        print(threshold, pstats)

    with open(buzzes_dir + '.pstats', 'wb') as f:
        pickle.dump(threshold_stats, f)
    print([stats['reward'] for stats in threshold_stats])
コード例 #9
0
ファイル: interface.py プロジェクト: Agnon1573/qb
        log.info('\n\n[buzzer2vwexpo] writing to files')

        buzz_template = '|'.join(['{}' for _ in range(7)])
        buzz_out = '\n'.join(buzz_template.format(*r) for r in
                itertools.chain(*buzzf))
        buzz_file.write(buzz_out)
        log.info('buzz file written')

        final_out = '\n'.join('{0},{1}'.format(*r) for r in
                itertools.chain(*finalf))
        final_file.write(final_out)
        log.info('final file written')

        pred_out = '\n'.join('{0} {1}_{2}_{3}'.format(*r) for r in
                itertools.chain(*predf))
        pred_file.write(pred_out)
        log.info('vw_pred file written')

        meta_out = '\n'.join('{0} {1} {2} {3}'.format(*r) for r in
                itertools.chain(*metaf))
        meta_file.write(meta_out)
        log.info('vw_meta file written')

# Script entry point: export the saved buzzes of one model on the 'expo'
# fold through buzzer2vwexpo.
if __name__ == '__main__':
    # Model whose pickled expo buzzes are exported.
    model_name = 'neo_0'
    guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=['expo'])
    # Despite the "_dir" suffix this is the path of a pickle file.
    expo_buzzes_dir = 'output/buzzer/neo/expo_buzzes.{}.pkl'.format(model_name)
    with open(expo_buzzes_dir, 'rb') as f:
        expo_buzzes = pickle.load(f)
    buzzer2vwexpo(guesses_df, expo_buzzes, 'expo')