Esempio n. 1
0
def sample_n_guesser_correct_questions(question_lookup,
                                       guess_lookup,
                                       n_correct_samples,
                                       n_samples=10):
    """Draw random example questions for each "number of correct guessers".

    For every ``(n_correct, keys)`` entry of ``n_correct_samples`` at most
    ``n_samples`` keys are sampled. Each key is unpacked as
    ``(qnum, sent, token)`` and used to look up the question text and the
    guesses made for that position.

    Returns a ``defaultdict(list)`` mapping ``n_correct`` to tuples
    ``(text, key, page, answer_count, correct_guessers, wrong_guessers)``,
    where ``answer_count`` is how often ``page`` occurs among the training
    answers.
    """
    # Frequency of every answer in the guesser training data.
    answer_frequency = {}
    for answer in QuizBowlDataset(guesser_train=True).training_data()[1]:
        answer_frequency[answer] = answer_frequency.get(answer, 0) + 1

    sampled_by_n_correct = defaultdict(list)
    for n_correct, keys in n_correct_samples.items():
        sample_size = min(n_samples, len(keys))
        for key in random.sample(keys, sample_size):
            qnum, sent, token = key
            question = question_lookup[qnum]
            page = question.page
            text = question.get_text(sent, token)
            guesses = guess_lookup[key]
            # Split the guessers by whether their guess equals the answer page.
            right = tuple(guesses[guesses.guess == page].guesser)
            wrong = tuple(guesses[guesses.guess != page].guesser)
            example = (text, key, page, answer_frequency.get(page, 0), right,
                       wrong)
            sampled_by_n_correct[n_correct].append(example)

    return sampled_by_n_correct
Esempio n. 2
0
def evaluate(ckp_dir):
    """Print accuracy statistics for a pickled checkpoint of guesses.

    ``ckp_dir`` is the path of a pickle file holding a mapping from question
    number to a dict with ``"guesses_before"`` and ``"guesses_after"``
    (guess -> score). Five numbers are printed, each divided by the number of
    questions in the ``guessdev`` fold: top-1 accuracy before/after, how often
    the top guess stayed the same, and top-5 accuracy before/after.
    """
    dataset = QuizBowlDataset(guesser_train=True, buzzer_train=True)
    question_by_id = {
        q.qnum: q for q in dataset.questions_in_folds(["guessdev"])
    }

    with open(ckp_dir, "rb") as ckp_file:
        checkpoint = pickle.load(ckp_file)

    def ranked(guess_scores):
        # Ascending sort followed by a reversal: best score first, and the
        # relative order of tied scores is reversed as well.
        ordered = sorted(guess_scores.items(), key=lambda item: item[1])
        ordered.reverse()
        return ordered

    acc_before = acc_after = top1_match = top5_before = top5_after = 0
    for qnum, entry in checkpoint.items():
        page = question_by_id[qnum].page
        before = ranked(entry["guesses_before"])
        after = ranked(entry["guesses_after"])
        acc_before += before[0][0] == page
        acc_after += after[0][0] == page
        top1_match += after[0][0] == before[0][0]
        top5_before += page in [guess[0] for guess in before[:5]]
        top5_after += page in [guess[0] for guess in after[:5]]

    n_questions = len(question_by_id)
    report = (
        ("accuracy before", acc_before),
        ("accuracy after", acc_after),
        ("before after match", top1_match),
        ("top 5 accuracy before", top5_before),
        ("top 5 accuracy after", top5_after),
    )
    for description, total in report:
        print(description, total / n_questions)
Esempio n. 3
0
def create_instance_of_map(wikidata_claims_instance_of_path,
                           output_path,
                           n_types=50):
    """Map each training answer to one "instance of" type and pickle the map.

    The claims file is read line by line as JSON objects with ``title`` and
    ``object`` fields. Claims whose normalized title is a training answer are
    collected per answer (blacklisted objects dropped, merged objects
    replaced, gods collapsed to ``'god'``). The ``n_types`` types with the
    highest total training-answer count are kept as candidates; an answer
    with no candidate type maps to ``NO_MATCH``, otherwise to the candidate
    shared by the most answers. The result is pickled to ``output_path``.
    """
    training_data = QuizBowlDataset(guesser_train=True).training_data()
    answers = set(training_data[1])
    answer_counts = Counter(training_data[1])

    with open(wikidata_claims_instance_of_path) as claims_file:
        claims = defaultdict(set)
        for raw_line in claims_file:
            claim = json.loads(raw_line)
            if claim['title'] is None:
                continue
            title = normalize_wikipedia_title(claim['title'])
            if title not in answers:
                continue
            claim_object = claim['object']
            if claim_object in object_blacklist:
                continue
            if claim_object in object_merge_map:
                claims[title].add(object_merge_map[claim_object])
            elif is_god(claim_object):
                claims[title].add('god')
            else:
                claims[title].add(claim_object)

    # 'literary work' is only kept when it is the sole type of an answer.
    for types in claims.values():
        if len(types) > 1 and 'literary work' in types:
            types.remove('literary work')

    total_counts = defaultdict(int)  # type -> summed question count
    class_counts = defaultdict(int)  # type -> number of distinct answers
    answer_types = {}
    for answer in answers:
        if answer not in claims:
            continue
        answer_types[answer] = claims[answer]
        for wikidata_type in claims[answer]:
            total_counts[wikidata_type] += answer_counts[answer]
            class_counts[wikidata_type] += 1

    ranked_types = sorted(total_counts.items(),
                          key=lambda item: item[1],
                          reverse=True)
    top_types = {type_name for type_name, _ in ranked_types[:n_types]}

    instance_of_map = {}
    for answer, types in answer_types.items():
        candidates = top_types.intersection(types)
        if not candidates:
            instance_of_map[answer] = NO_MATCH
        elif len(candidates) == 1:
            instance_of_map[answer] = next(iter(candidates))
        else:
            # Every candidate was counted at least once above, so this picks
            # the first candidate (in iteration order) with the highest count.
            instance_of_map[answer] = max(
                candidates, key=lambda type_name: class_counts[type_name])

    with open(output_path, 'wb') as out_file:
        pickle.dump(instance_of_map, out_file)
Esempio n. 4
0
def main():
    """Print the result of ``guesser.guess_single`` for the first guessdev question."""
    # Load the questions of the 'guessdev' fold.
    fold = 'guessdev'
    db = QuizBowlDataset(1, guesser_train=True, buzzer_train=True)
    questions = db.questions_in_folds([fold])
    # NOTE(review): first_n is not used in the visible part of this function.
    first_n = lambda x: len(x)

    # The values of the first question's `text` mapping are joined with spaces.
    # `guesser` is not defined here; presumably a module-level object -- confirm.
    print(guesser.guess_single(' '.join(questions[0].text.values())))
    '''
Esempio n. 5
0
def main():
    """Print the result of ``guesser.guess_single`` for the first guessdev question."""
    # Load the questions of the 'guessdev' fold.
    fold = 'guessdev'
    db = QuizBowlDataset(1, guesser_train=True, buzzer_train=True)
    questions = db.questions_in_folds([fold])
    # NOTE(review): first_n is not used in the visible part of this function.
    first_n = lambda x: len(x)

    # The values of the first question's `text` mapping are joined with spaces.
    # `guesser` is not defined here; presumably a module-level object -- confirm.
    print(guesser.guess_single(' '.join(questions[0].text.values())))
    
    '''
Esempio n. 6
0
def main():
    """Print the accuracy of ``recursive_guess`` on the guesser dev fold.

    Each question is guessed from ``question.text[0]`` with a second argument
    of 3; the printed value is the fraction of guesses equal to the page.
    """
    dev_questions = QuizBowlDataset(guesser_train=True).questions_by_fold(
        [GUESSER_DEV_FOLD])[GUESSER_DEV_FOLD]
    n_correct = sum(
        recursive_guess(q.text[0], 3) == q.page for q in tqdm(dev_questions))
    print(n_correct / len(dev_questions))
Esempio n. 7
0
def main():
    """Evaluate ``recursive_guess`` on the guesser dev fold and print accuracy."""
    by_fold = QuizBowlDataset(guesser_train=True).questions_by_fold(
        [GUESSER_DEV_FOLD])
    fold_questions = by_fold[GUESSER_DEV_FOLD]

    hits = 0
    for q in tqdm(fold_questions):
        predicted = recursive_guess(q.text[0], 3)
        hits = hits + (predicted == q.page)

    accuracy = hits / len(fold_questions)
    print(accuracy)
Esempio n. 8
0
def test():
    """Run ``recursive_guess`` on one dev question, then print its page and first text entry."""
    dev_questions = QuizBowlDataset(guesser_train=True).questions_by_fold(
        [GUESSER_DEV_FOLD])[GUESSER_DEV_FOLD]

    sample = dev_questions[10]
    # The returned guess is not used afterwards; the call itself is kept.
    recursive_guess(sample.text[0], k=1)
    print(sample.page)
    print(sample.text[0])
Esempio n. 9
0
def get_quizbowl():
    """Build train/dev arrays and vocabulary mappings from the guesser training data.

    Returns ``(train, dev, word_to_i, i_to_class)``. The vocabulary is the
    sorted word collection returned by ``preprocess_dataset`` with
    ``'<unk>'`` and ``'<eos>'`` placed at indices 0 and 1.
    """
    raw_training_data = QuizBowlDataset(
        guesser_train=True, buzzer_train=False).training_data()
    (train_x, train_y, dev_x, dev_y,
     words, _class_to_i, i_to_class) = preprocess_dataset(raw_training_data)

    vocabulary = ['<unk>', '<eos>']
    vocabulary.extend(sorted(words))
    word_to_i = {word: index for index, word in enumerate(vocabulary)}

    train = transform_to_array(zip(train_x, train_y), word_to_i)
    dev = transform_to_array(zip(dev_x, dev_y), word_to_i)
    return train, dev, word_to_i, i_to_class
Esempio n. 10
0
def test():
    """Smoke-test ``recursive_guess`` on the question at index 10 of the dev fold."""
    question_index = 10
    by_fold = QuizBowlDataset(guesser_train=True).questions_by_fold(
        [GUESSER_DEV_FOLD])
    picked = by_fold[GUESSER_DEV_FOLD][question_index]

    # Result intentionally unused: only the page and text are printed below.
    _ = recursive_guess(picked.text[0], k=1)
    for shown in (picked.page, picked.text[0]):
        print(shown)
Esempio n. 11
0
def create_instance_of_map(wikidata_claims_instance_of_path, output_path, n_types=50):
    """Assign one Wikidata "instance of" type to each training answer.

    Reads JSON-lines claims (``title`` / ``object``), keeps those whose
    normalized title is a training answer, normalizes the object
    (blacklist, merge map, ``'god'``), restricts to the ``n_types`` types with
    the largest summed answer frequency, and pickles a dict
    ``answer -> type`` (or ``NO_MATCH``) to ``output_path``.
    """
    labels = QuizBowlDataset(guesser_train=True).training_data()[1]
    answers = set(labels)
    answer_counts = Counter(labels)

    with open(wikidata_claims_instance_of_path) as f:
        claims = defaultdict(set)
        for line in f:
            record = json.loads(line)
            if record['title'] is not None:
                title = normalize_wikipedia_title(record['title'])
                if title in answers:
                    raw_type = record['object']
                    if raw_type in object_blacklist:
                        continue
                    if raw_type in object_merge_map:
                        claim_type = object_merge_map[raw_type]
                    elif is_god(raw_type):
                        claim_type = 'god'
                    else:
                        claim_type = raw_type
                    claims[title].add(claim_type)

    # Drop the generic 'literary work' type whenever a more specific one exists.
    for title in claims:
        title_types = claims[title]
        if 'literary work' in title_types and len(title_types) > 1:
            title_types.remove('literary work')

    total_counts = defaultdict(int)
    class_counts = defaultdict(int)
    answer_types = {}
    for answer in answers:
        if answer in claims:
            types_of_answer = claims[answer]
            answer_types[answer] = types_of_answer
            for type_name in types_of_answer:
                total_counts[type_name] += answer_counts[answer]
                class_counts[type_name] += 1

    by_total = sorted(total_counts.items(), reverse=True, key=lambda pair: pair[1])
    top_types = set()
    for type_name, _count in by_total[:n_types]:
        top_types.add(type_name)

    instance_of_map = {}
    for answer in answer_types:
        matches = top_types.intersection(answer_types[answer])
        n_matches = len(matches)
        if n_matches == 0:
            instance_of_map[answer] = NO_MATCH
        elif n_matches > 1:
            # Keep the match that is shared by the most answers; ties go to
            # whichever is encountered first.
            best_type, best_count = None, 0
            for candidate in matches:
                if class_counts[candidate] > best_count:
                    best_type, best_count = candidate, class_counts[candidate]
            instance_of_map[answer] = best_type
        else:
            (only_match,) = matches
            instance_of_map[answer] = only_match

    with open(output_path, 'wb') as f:
        pickle.dump(instance_of_map, f)
Esempio n. 12
0
File: util.py Progetto: Agnon1573/qb
def load_quizbowl(folds=c.BUZZER_INPUT_FOLDS) \
                    -> Tuple[Dict[str, int], Dict[str, list]]:
    """Load the option vocabulary and the processed guesses of each fold.

    The list of options (training answers) is read from ``bc.OPTIONS_DIR``
    when that file exists, otherwise built from the training data and pickled
    there. For every fold the processed guesses are read from
    ``<bc.GUESSES_DIR>/<fold>_processed.pickle`` when present; otherwise the
    raw guesses are grouped by ``qnum``, processed with ``_process_question``
    and the non-``None`` results are pickled to that path.

    Returns ``(option2id, guesses_by_fold)``.
    """
    log.info('Loading data')
    question_db = QuestionDatabase()
    quizbowl_db = QuizBowlDataset(bc.MIN_ANSWERS,
                                  guesser_train=True,
                                  buzzer_train=True)
    all_questions = question_db.all_questions()

    if os.path.isfile(bc.OPTIONS_DIR):
        with open(safe_path(bc.OPTIONS_DIR), 'rb') as options_file:
            id2option = pickle.load(options_file)
    else:
        log.info('Loading the set of options')
        id2option = list(set(quizbowl_db.training_data()[1]))
        with open(safe_path(bc.OPTIONS_DIR), 'wb') as options_file:
            pickle.dump(id2option, options_file)

    option2id = {option: index for index, option in enumerate(id2option)}
    log.info('Number of options {0}'.format(len(id2option)))

    guesses_by_fold = {}
    for fold in folds:
        cache_path = '%s_processed.pickle' % (os.path.join(bc.GUESSES_DIR, fold))
        if os.path.isfile(cache_path):
            # Cached result from an earlier run.
            with open(safe_path(cache_path), 'rb') as cache_file:
                guesses_by_fold[fold] = pickle.load(cache_file)
            log.info('Loading {0} guesses'.format(fold))
        else:
            log.info('Processing {0} guesses'.format(fold))
            raw_guesses = AbstractGuesser.load_guesses(bc.GUESSES_DIR,
                                                       folds=[fold])

            worker = partial(_process_question, option2id, all_questions)
            processed = _multiprocess(worker,
                                      raw_guesses.groupby('qnum'),
                                      info='df data',
                                      multi=True)
            processed = [item for item in processed if item is not None]
            guesses_by_fold[fold] = processed
            print(len(processed))

            with open(safe_path(cache_path), 'wb') as cache_file:
                pickle.dump(processed, cache_file)

            log.info('Processed {0} guesses saved to {1}'.format(
                fold, cache_path))

    return option2id, guesses_by_fold
Esempio n. 13
0
File: stats.py Progetto: xxlatgh/qb
def compute_question_stats(question_db_path: str):
    """Pickle the mean and standard deviation of question lengths.

    Length is the number of whitespace-separated tokens of each flattened
    question in the ``'train'`` and ``'dev'`` folds. The ``(mean, std)`` tuple
    is written to ``SENTENCE_STATS``.
    """
    dataset = QuizBowlDataset(5, qb_question_db=question_db_path)

    token_counts = []
    for question in dataset.questions_in_folds(('train', 'dev')):
        token_counts.append(len(question.flatten_text().split()))

    stats = (np.mean(token_counts), np.std(token_counts))

    with safe_open(SENTENCE_STATS, 'wb') as stats_file:
        pickle.dump(stats, stats_file)
Esempio n. 14
0
def create_memory_index():
    """Rebuild the ``'mem'`` index from the Wikipedia pages of all training answers.

    An existing ``'mem'`` index is deleted on a best-effort basis,
    ``Answer.init()`` is called, and every cached Wikipedia page with
    non-empty content is handed to ``index_page`` through a Spark context.
    """
    dataset = QuizBowlDataset(guesser_train=True)
    training_data = dataset.training_data()
    answers = set(training_data[1])
    cw = CachedWikipedia()

    try:
        Index('mem').delete()
    except Exception:
        # Best effort: deletion presumably fails when the index does not exist
        # yet (TODO: confirm and narrow to that specific error). ``Exception``
        # is used instead of a bare ``except`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        pass
    Answer.init()

    all_wiki_pages = [cw[page] for page in answers]
    # Pages without content are not indexed.
    wiki_pages = [p for p in all_wiki_pages if p.content != '']
    sc = create_spark_context()
    sc.parallelize(wiki_pages, 1000).foreach(index_page)
Esempio n. 15
0
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Simulate games for every question of ``fold`` and pickle the results.

    Guesses come from the pickled RnnGuesser output for ``fold`` (grouped by
    ``qanta_id``), buzzes from ``get_buzzes(model, fold)`` and player records
    from ``load_protobowl()`` (grouped by ``qid``). The possibilities and
    outcomes returned by ``simulate_game`` are concatenated into a DataFrame
    that is pickled to ``<model.model_dir>/<fold>_protobowl.pkl``.
    """
    buzzes = get_buzzes(model, fold)

    # eval
    guesser_output_dir = AbstractGuesser.output_path(
        "qanta.guesser.rnn", "RnnGuesser", 0, "")
    guesses_path = AbstractGuesser.guess_path(guesser_output_dir, fold, "char")
    with open(guesses_path, "rb") as guesses_file:
        guesses = pickle.load(guesses_file)
    guesses_by_question = guesses.groupby("qanta_id")

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    records_by_question = load_protobowl().groupby("qid")

    possibility = []
    outcome = []
    for question in tqdm(fold_questions):
        pos, out = simulate_game(guesses_by_question, buzzes,
                                 records_by_question, question)
        possibility.extend(pos)
        outcome.extend(out)

    result_df = pd.DataFrame({"Possibility": possibility, "Outcome": outcome})

    result_path = os.path.join(model.model_dir, "{}_protobowl.pkl".format(fold))
    with open(result_path, "wb") as result_file:
        pickle.dump(result_df, result_file)
Esempio n. 16
0
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Run ``simulate_game`` over a fold and store possibilities and outcomes.

    The resulting two-column DataFrame (``Possibility``, ``Outcome``) is
    pickled next to the model as ``<fold>_protobowl.pkl``.
    """
    buzzes = get_buzzes(model, fold)

    # eval
    output_dir = AbstractGuesser.output_path('qanta.guesser.rnn',
                                             'RnnGuesser', 0, '')
    with open(AbstractGuesser.guess_path(output_dir, fold, 'char'),
              'rb') as f:
        loaded_guesses = pickle.load(f)
    guesses = loaded_guesses.groupby('qanta_id')

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    records = load_protobowl().groupby('qid')

    play = partial(simulate_game, guesses, buzzes, records)

    possibility, outcome = [], []
    for pos, out in map(play, tqdm(questions)):
        possibility.extend(pos)
        outcome.extend(out)

    columns = {}
    columns['Possibility'] = possibility
    columns['Outcome'] = outcome
    result_df = pd.DataFrame(columns)

    result_dir = os.path.join(model.model_dir, '{}_protobowl.pkl'.format(fold))
    with open(result_dir, 'wb') as f:
        pickle.dump(result_df, f)
Esempio n. 17
0
def generate_domain_classifier_data(weight=150):
    """Write two vowpal wabbit files mixing quiz bowl and Wikipedia sentences.

    Sentences from the Wikipedia page of every known answer are distributed
    alternately over two sets. Each set is written to
    ``DOMAIN_TARGET_PREFIX + <index>`` with all real quiz bowl sentences
    (label ``1``, importance ``weight``) interleaved in random order with the
    Wikipedia sentences (label ``-1``, importance ``1``).
    """
    qb_data = QuizBowlDataset(guesser_train=True).training_data()
    real_questions = []
    for sentences, answer, _ in zip(*qb_data):
        for sentence in sentences:
            real_questions.append(
                ('1', str(weight), answer, clean_question(sentence)))
    pages = {answer for _, _, answer, _ in real_questions}

    cw = CachedWikipedia()

    # Distribute the wikipedia sentences alternately over two lists.
    wiki_questions = ([], [])
    target = 0
    for page in pages:
        for sentence in sentences_from_page(cw[page]):
            cleaned = clean_question(sentence)
            wiki_questions[target].append(('-1', '1', page, cleaned))
            target = 1 - target

    vw_line = '{} {} \'{}|text {}\n'
    for i, wiki_qs in enumerate(wiki_questions):
        # A shuffled list of flags decides, line by line, whether the next
        # example is taken from the wikipedia set (True) or the real set.
        order = [False] * len(real_questions) + [True] * len(wiki_qs)
        random.shuffle(order)
        real_iter = iter(real_questions)
        wiki_iter = iter(wiki_qs)
        with safe_open(DOMAIN_TARGET_PREFIX + str(i), 'w') as f:
            for take_wiki in order:
                example = next(wiki_iter if take_wiki else real_iter)
                f.write(vw_line.format(*example))
Esempio n. 18
0
def read_data(fold,
              output_type='char',
              guesser_module='qanta.guesser.dan',
              guesser_class='DanGuesser',
              guesser_config_num=0,
              vector_converter=vector_converter_0):
    """Return the processed buzzer dataset for ``fold``, caching it on disk.

    If ``dataset_dir.format(fold)`` exists, its pickled content is returned.
    Otherwise the guesser output for ``fold`` is loaded, grouped by
    ``qanta_id``, processed in parallel with ``process_question`` and the
    resulting list is pickled to the cache path and returned.

    Previously the cache-miss path returned the result of ``pickle.dump``
    (``None``); it now returns the dataset, as the cache-hit path does.
    """
    cache_path = dataset_dir.format(fold)
    if os.path.isfile(cache_path):
        # NOTE: pickle must only be used on trusted, locally produced files.
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    g_dir = AbstractGuesser.output_path(guesser_module, guesser_class,
                                        guesser_config_num, '')
    g_path = AbstractGuesser.guess_path(g_dir, fold, output_type)
    with open(g_path, 'rb') as f:
        df = pickle.load(f)
    df_groups = df.groupby('qanta_id')

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions[fold]}

    worker = partial(process_question, questions, vector_converter)
    # The context manager releases the worker processes once map() is done.
    with Pool(8) as pool:
        dataset = pool.map(worker, df_groups)

    with open(cache_path, 'wb') as f:
        pickle.dump(dataset, f)

    return dataset
Esempio n. 19
0
    def training_data(self):
        """Build training examples from Wikipedia sentences of each answer page.

        For every distinct answer of ``QuizBowlDataset(2)`` up to
        ``self.max_sentences`` sentences are sampled at random from the
        cached Wikipedia page content.

        Returns ``(train_x, train_y, None)`` where ``train_x[i]`` is the list
        of sampled sentences for page ``train_y[i]``.
        """
        cw = CachedWikipedia(QB_WIKI_LOCATION)
        answer_classes = set(QuizBowlDataset(2).training_data()[1])

        train_x, train_y = [], []
        for page in answer_classes:
            sentences = list(wiki_sentences(cw[page].content))
            n_sampled = min(len(sentences), self.max_sentences)
            train_x.append(list(random.sample(sentences, n_sampled)))
            train_y.append(page)
        return train_x, train_y, None
Esempio n. 20
0
def main(questions, n_keep, ckp_dir):
    """Run ``greedy_remove`` on every guessdev question and evaluate the result.

    For each question the flattened text, the guesses before and after
    removal, the reduced text and the removed items are stored under the
    question number; the mapping is pickled to ``ckp_dir`` and passed to
    ``evaluate``.

    NOTE(review): the ``questions`` argument is not used; the guessdev fold is
    always loaded instead.
    """
    dataset = QuizBowlDataset(guesser_train=True, buzzer_train=True)
    dev_questions = {
        q.qnum: q for q in dataset.questions_in_folds(['guessdev'])
    }

    checkpoint = {}
    for qnum, question in dev_questions.items():
        text_before = question.flatten_text()
        guesses_before = guesser.guess_single(text_before)
        text_after, guesses_after, removed = greedy_remove(
            text_before, guesses_before, n_keep)
        checkpoint[qnum] = {
            'text_before': text_before,
            'text_after': text_after,
            'guesses_before': guesses_before,
            'guesses_after': guesses_after,
            'removed': removed,
        }

    with open(safe_path(ckp_dir), 'wb') as ckp_file:
        pickle.dump(checkpoint, ckp_file)

    evaluate(ckp_dir)
Esempio n. 21
0
def evaluate(ckp_dir):
    """Load a pickled checkpoint and print before/after guess statistics.

    Each checkpoint entry is keyed by question number and provides
    ``'guesses_before'`` and ``'guesses_after'`` score mappings. All five
    totals are divided by the number of guessdev questions before printing.
    """
    db = QuizBowlDataset(guesser_train=True, buzzer_train=True)
    questions = {}
    for question in db.questions_in_folds(['guessdev']):
        questions[question.qnum] = question

    with open(ckp_dir, 'rb') as f:
        checkpoint = pickle.load(f)

    descriptions = ('accuracy before', 'accuracy after', 'before after match',
                    'top 5 accuracy before', 'top 5 accuracy after')
    totals = dict.fromkeys(descriptions, 0)
    for qnum, record in checkpoint.items():
        page = questions[qnum].page
        # Sorted ascending by score, then reversed so the best guess is first.
        before = sorted(record['guesses_before'].items(),
                        key=lambda kv: kv[1])[::-1]
        after = sorted(record['guesses_after'].items(),
                       key=lambda kv: kv[1])[::-1]
        best_before = before[0][0]
        best_after = after[0][0]
        totals['accuracy before'] += best_before == page
        totals['accuracy after'] += best_after == page
        totals['before after match'] += best_after == best_before
        totals['top 5 accuracy before'] += page in [g[0] for g in before[:5]]
        totals['top 5 accuracy after'] += page in [g[0] for g in after[:5]]

    n_questions = len(questions)
    for description in descriptions:
        print(description, totals[description] / n_questions)
Esempio n. 22
0
def main(questions, n_keep, ckp_dir):
    """Create a ``greedy_remove`` checkpoint for the guessdev fold, then evaluate it.

    NOTE(review): the incoming ``questions`` value is overwritten right away
    with the guessdev questions keyed by question number.
    """
    db = QuizBowlDataset(guesser_train=True, buzzer_train=True)
    questions = {}
    for item in db.questions_in_folds(["guessdev"]):
        questions[item.qnum] = item

    checkpoint = {}
    for qnum in questions:
        original_text = questions[qnum].flatten_text()
        original_guesses = guesser.guess_single(original_text)
        reduced_text, reduced_guesses, removed = greedy_remove(
            original_text, original_guesses, n_keep
        )
        record = {}
        record["text_before"] = original_text
        record["text_after"] = reduced_text
        record["guesses_before"] = original_guesses
        record["guesses_after"] = reduced_guesses
        record["removed"] = removed
        checkpoint[qnum] = record

    with open(safe_path(ckp_dir), "wb") as f:
        pickle.dump(checkpoint, f)

    evaluate(ckp_dir)
Esempio n. 23
0
def ew(model, fold=BUZZER_DEV_FOLD):
    """Compute expected-wins totals of a buzzer model on ``fold``.

    For every question in the buzzes, the first guess at each character index
    is paired with the buzz decision (``scores[0] < scores[1]``) and scored
    with ``CurveScore``. The totals are printed as JSON and returned as a
    dict with ``expected_wins``, ``n_examples`` and
    ``expected_wins_optimal``.
    """
    buzzes = get_buzzes(model, fold)

    guesser_dir = AbstractGuesser.output_path('qanta.guesser.rnn',
                                              'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(guesser_dir, fold, 'char')
    with open(guesses_path, 'rb') as guesses_file:
        guesses = pickle.load(guesses_file)
    guesses = guesses.groupby('qanta_id')

    answers = {}
    for qid, bs in buzzes.items():
        by_position = guesses.get_group(qid).groupby('char_index')
        answers[qid] = [
            {
                'char_index': char_index,
                'guess': by_position.get_group(char_index).head(1)
                ['guess'].values[0],
                'buzz': scores[0] < scores[1],
            }
            for char_index, scores in zip(*bs)
        ]

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    question_by_id = {q.qanta_id: q for q in fold_questions}

    curve_score = CurveScore()
    wins = []
    optimal_wins = []
    for qid, answer in answers.items():
        question = question_by_id[qid]
        q = {'text': question.text, 'page': question.page}
        wins.append(curve_score.score(answer, q))
        optimal_wins.append(curve_score.score_optimal(answer, q))

    eval_out = {
        'expected_wins': sum(wins),
        'n_examples': len(wins),
        'expected_wins_optimal': sum(optimal_wins),
    }
    print(json.dumps(eval_out))
    return eval_out
Esempio n. 24
0
def main():
    """Print mean protobowl and curve scores on the buzzer dev fold.

    Questions, guesser outputs, buzzer outputs and protobowl records are
    loaded, ``run_all_metrics`` is evaluated for every question in a process
    pool, and for each of the two metrics the per-component means (ignoring
    ``None`` entries) are printed.
    """
    fold = BUZZER_DEV_FOLD

    def mean_per_component(per_question_scores):
        # Transpose question-major scores, drop missing values, average.
        return [
            np.mean([x for x in component if x is not None])
            for component in zip(*per_question_scores)
        ]

    # load questions
    print('loading questions')
    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = questions[fold]

    # load guesser outputs
    print('loading guesser outputs')
    guesses = read_data(fold)
    guesses = {x[0]: x for x in guesses}

    # load buzzer outputs
    print('loading buzzer outputs')
    buzz_dir = buzzes_dir.format(fold)
    with open(buzz_dir, 'rb') as f:
        buzzes = pickle.load(f)

    # load protobowl records
    print('loading protobowl records')
    df, _ = load_protobowl()
    record_groups = df.groupby('qid')

    metrics = [_protobowl_scores, _curve_scores]
    worker = partial(run_all_metrics, guesses, buzzes, record_groups, metrics)
    # The context manager shuts the worker processes down after map() returns.
    with Pool(8) as pool:
        scores = pool.map(worker, questions)

    # One list of per-question results for each metric, in `metrics` order.
    all_scores = list(map(list, zip(*scores)))

    print(mean_per_component(all_scores[0]))  # protobowl scores
    print(mean_per_component(all_scores[1]))  # curve scores
Esempio n. 25
0
def ew(model, fold=BUZZER_DEV_FOLD):
    """Score a buzzer with ``CurveScore`` and report the expected wins.

    Returns (and prints as JSON) the summed ``score`` and ``score_optimal``
    values over all questions present in the buzzes, together with the number
    of questions scored.
    """
    buzzes = get_buzzes(model, fold)

    output_dir = AbstractGuesser.output_path("qanta.guesser.rnn", "RnnGuesser", 0, "")
    with open(AbstractGuesser.guess_path(output_dir, fold, "char"), "rb") as f:
        loaded = pickle.load(f)
    guesses = loaded.groupby("qanta_id")

    answers = dict()
    for qid, bs in buzzes.items():
        per_question = answers[qid] = []
        groups = guesses.get_group(qid).groupby("char_index")
        for char_index, scores in zip(*bs):
            # First row of the group for this position supplies the guess.
            first_row = groups.get_group(char_index).head(1)
            top_guess = first_row["guess"].values[0]
            should_buzz = scores[0] < scores[1]
            per_question.append(
                {"char_index": char_index, "guess": top_guess, "buzz": should_buzz}
            )

    by_fold = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {}
    for question in by_fold[fold]:
        questions[question.qanta_id] = question

    scorer = CurveScore()
    model_scores = []
    optimal_scores = []
    for qid, answer in answers.items():
        question = questions[qid]
        q = {"text": question.text, "page": question.page}
        model_scores.append(scorer.score(answer, q))
        optimal_scores.append(scorer.score_optimal(answer, q))

    eval_out = {
        "expected_wins": sum(model_scores),
        "n_examples": len(model_scores),
        "expected_wins_optimal": sum(optimal_scores),
    }
    print(json.dumps(eval_out))
    return eval_out
Esempio n. 26
0
 def qb_dataset(self) -> QuizBowlDataset:
     """Return a new ``QuizBowlDataset`` created with ``guesser_train=True``."""
     dataset = QuizBowlDataset(guesser_train=True)
     return dataset
Esempio n. 27
0
import json
from qanta.util.constants import GUESSER_DEV_FOLD
from qanta.datasets.quiz_bowl import QuizBowlDataset
''' Preprocess the questions from a database (like data/naqt.db) and extract
the text, qid and answer. Store the data in data/sample_questions.json. '''

# Load the guesser dev fold questions from the data/naqt.db question database.
dataset = QuizBowlDataset(guesser_train=True, qb_question_db='data/naqt.db')
questions_by_fold = dataset.questions_by_fold([GUESSER_DEV_FOLD])
questions = questions_by_fold[GUESSER_DEV_FOLD]


def convert(q):
    """Return a dict with the question's ``qid``, space-joined ``text`` and ``answer``."""
    full_text = ' '.join(q.text.values())
    return dict(qid=q.qnum, text=full_text, answer=q.page)


# Convert every question to a plain dict and write them all as one JSON array.
questions = list(map(convert, questions))
serialized_questions = json.dumps(questions)
with open('data/sample_questions.json', 'w') as f:
    f.write(serialized_questions)
Esempio n. 28
0
def export(output_file: str, fold: str = "buzztest"):
    """Export per-question buzz positions of several buzzers as JSON.

    For each of the RNNBuzzer, ThresholdBuzzer and MLPBuzzer outputs stored
    under ``output/buzzer/<name>/<fold>_buzzes.pkl`` and for each question,
    the earliest character index at which the top guess is correct
    ("oracle") and the earliest index at which the buzzer buzzes
    ("position") are recorded, together with their fractions of the question
    length, the corresponding guesses and the true answer. ``-1`` marks "never".

    Args:
        output_file: path handed to ``write_json``.
        fold: fold whose guesses, questions and buzzes are used. The argument
            is now honored; previously it was unconditionally overwritten
            with ``"buzztest"`` (which remains the default).
    """
    guesses_dir = AbstractGuesser.output_path("qanta.guesser.rnn",
                                              "RnnGuesser", 0, "")
    guesses_dir = AbstractGuesser.guess_path(guesses_dir, fold, "char")
    with open(guesses_dir, "rb") as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby("qanta_id")

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions[fold]}
    buzzers = {}
    for name in ["RNNBuzzer", "ThresholdBuzzer", "MLPBuzzer"]:
        model_dir = f"output/buzzer/{name}"
        buzzes_dir = os.path.join(model_dir, "{}_buzzes.pkl".format(fold))
        with open(buzzes_dir, "rb") as f:
            buzzers[name] = pickle.load(f)

    qid_to_buzzes = defaultdict(dict)
    for name, buzzes in track(buzzers.items()):
        for qid, (char_indices, scores) in buzzes.items():
            # char_index -> first guess listed for that position.
            gs = (guesses.get_group(qid).groupby("char_index").aggregate(
                lambda x: x.head(1)).to_dict()["guess"])
            question = questions[qid]
            q_len = len(question.text)
            buzz_oracle_position = -1
            buzz_model_position = -1
            oracle_guess = None
            buzz_guess = None
            for i, char_index in enumerate(char_indices):
                buzz_oracle = gs[char_index] == question.page
                if buzz_oracle:
                    if buzz_oracle_position == -1 or char_index <= buzz_oracle_position:
                        oracle_guess = question.page
                        buzz_oracle_position = char_index

                # The buzzer buzzes when the second score exceeds the first.
                if scores[i][1] > scores[i][0]:
                    if buzz_model_position == -1 or char_index < buzz_model_position:
                        buzz_guess = gs[char_index]
                        buzz_model_position = char_index

            oracle_fraction = (buzz_oracle_position / q_len
                               if buzz_oracle_position != -1 else -1)
            position_fraction = (buzz_model_position / q_len
                                 if buzz_model_position != -1 else -1)
            qid_to_buzzes[qid][name] = {
                "oracle": buzz_oracle_position,
                "oracle_fraction": oracle_fraction,
                "position": buzz_model_position,
                "position_fraction": position_fraction,
                "q_len": q_len,
                "oracle_guess": oracle_guess,
                "buzz_guess": buzz_guess,
                "answer": question.page,
                # No position had the correct top guess.
                "impossible": oracle_guess is None,
            }
    write_json(output_file, qid_to_buzzes)
Esempio n. 29
0
File: dan.py Progetto: nadesai/qb
 def qb_dataset(self):
     """Return a ``QuizBowlDataset``; ``buzzer_train`` follows ``self.use_buzz_as_train``."""
     include_buzzer_data = self.use_buzz_as_train
     return QuizBowlDataset(guesser_train=True, buzzer_train=include_buzzer_data)
Esempio n. 30
0
 def qb_dataset(self) -> QuizBowlDataset:
     """Return a ``QuizBowlDataset`` built with the positional argument 2."""
     # NOTE(review): the argument is presumably a minimum answer count -- confirm.
     dataset = QuizBowlDataset(2)
     return dataset
Esempio n. 31
0
 def qb_dataset(self):
     """Return a ``QuizBowlDataset`` built with the positional argument 1."""
     # NOTE(review): the argument is presumably a minimum answer count -- confirm.
     dataset = QuizBowlDataset(1)
     return dataset
Esempio n. 32
0
File: plot.py Progetto: NPSDC/qb
def stack(model_dir, model_name, fold=BUZZER_DEV_FOLD):
    """Compare buzzer decisions with the oracle at each relative position.

    For every buzz position of every question, the position is rounded to one
    decimal of the question length and labelled "Both", "Only optimal",
    "Only buzzer" or "Neither", depending on whether the top guess is correct
    (oracle) and whether the buzzer buzzes. The label counts per position are
    normalized by the number of samples at that position.

    The resulting DataFrame (``Position``, ``Buzzing``, ``Frequency``,
    ``Model``) is pickled to ``<model_dir>/<fold>_stack.pkl`` and returned.
    """
    guesser_dir = AbstractGuesser.output_path("qanta.guesser.rnn", "RnnGuesser", 0, "")
    guesses_path = AbstractGuesser.guess_path(guesser_dir, fold, "char")
    with open(guesses_path, "rb") as guesses_file:
        guesses = pickle.load(guesses_file)
    guesses = guesses.groupby("qanta_id")

    buzzes_path = os.path.join(model_dir, "{}_buzzes.pkl".format(fold))
    with open(buzzes_path, "rb") as buzzes_file:
        buzzes = pickle.load(buzzes_file)

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    questions = {q.qanta_id: q for q in fold_questions}

    positions = []
    labels = []
    count = defaultdict(int)
    for qid, (char_indices, scores) in buzzes.items():
        per_position = guesses.get_group(qid).groupby("char_index")
        top_guess = per_position.aggregate(lambda x: x.head(1)).to_dict()["guess"]
        question = questions[qid]
        q_len = len(question.text)
        for i, char_index in enumerate(char_indices):
            oracle_buzzes = top_guess[char_index] == question.page
            buzzer_buzzes = scores[i][1] > scores[i][0]

            # Exactly one of the four labels applies.
            if oracle_buzzes and buzzer_buzzes:
                label = "Both"
            elif oracle_buzzes:
                label = "Only optimal"
            elif buzzer_buzzes:
                label = "Only buzzer"
            else:
                label = "Neither"

            rel_position = np.round(char_index / q_len, decimals=1)
            count[rel_position] += 1
            positions.append(rel_position)
            labels.append(label)

    df = pd.DataFrame({"Position": positions, "Buzzing": labels})
    df = (
        df.groupby(["Position", "Buzzing"])
        .size()
        .reset_index()
        .rename(columns={0: "Frequency"})
    )
    df["Frequency"] = df.apply(
        lambda row: row["Frequency"] / count[row["Position"]], axis=1
    )
    df["Model"] = pd.Series([model_name] * len(df))

    stack_path = os.path.join(model_dir, "{}_stack.pkl".format(fold))
    with open(stack_path, "wb") as stack_file:
        pickle.dump(df, stack_file)

    return df
Esempio n. 33
0
 def qb_dataset(self):
     """Return a ``QuizBowlDataset`` built from ``self.min_answers``."""
     min_answers = self.min_answers
     return QuizBowlDataset(min_answers)
Esempio n. 34
0
def stack(model_dir, model_name, fold=BUZZER_DEV_FOLD):
    """Build and pickle the oracle-vs-buzzer frequency table for one model.

    Returns a DataFrame with one row per (relative position, label) pair,
    where the label is one of 'Only optimal', 'Only buzzer', 'Both' or
    'Neither' and ``Frequency`` is the share of samples with that label at
    that position. The frame is also written to
    ``<model_dir>/<fold>_stack.pkl``.
    """
    output_dir = AbstractGuesser.output_path('qanta.guesser.rnn',
                                             'RnnGuesser', 0, '')
    with open(AbstractGuesser.guess_path(output_dir, fold, 'char'),
              'rb') as f:
        loaded_guesses = pickle.load(f)
    guesses = loaded_guesses.groupby('qanta_id')

    with open(os.path.join(model_dir, '{}_buzzes.pkl'.format(fold)),
              'rb') as f:
        buzzes = pickle.load(f)

    questions = {}
    for q in QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]:
        questions[q.qanta_id] = q

    # (oracle correct, buzzer buzzes) -> label
    label_for = {
        (True, False): 'Only optimal',
        (False, True): 'Only buzzer',
        (True, True): 'Both',
        (False, False): 'Neither',
    }

    stack_freq = {'Position': [], 'Buzzing': []}
    count = defaultdict(int)
    for qid, (char_indices, scores) in buzzes.items():
        grouped = guesses.get_group(qid).groupby('char_index')
        gs = grouped.aggregate(lambda x: x.head(1)).to_dict()['guess']
        question = questions[qid]
        q_len = len(question.text)
        for i, char_index in enumerate(char_indices):
            oracle = bool(gs[char_index] == question.page)
            buzzer = bool(scores[i][1] > scores[i][0])

            rel_position = np.round(char_index / q_len, decimals=1)
            count[rel_position] += 1

            stack_freq['Position'].append(rel_position)
            stack_freq['Buzzing'].append(label_for[(oracle, buzzer)])

    df = pd.DataFrame(stack_freq)
    sizes = df.groupby(['Position', 'Buzzing']).size()
    df = sizes.reset_index().rename(columns={0: 'Frequency'})

    def normalized(row):
        # Share of this label among all samples at the same position.
        return row['Frequency'] / count[row['Position']]

    df['Frequency'] = df.apply(normalized, axis=1)
    df['Model'] = pd.Series([model_name for _ in range(len(df))])

    stack_dir = os.path.join(model_dir, '{}_stack.pkl'.format(fold))
    with open(stack_dir, 'wb') as f:
        pickle.dump(df, f)

    return df
Esempio n. 35
0
    parser.add_argument('--questions', type=str, default='questions.csv')
    parser.add_argument('--buzzes', type=str, default="ir_buzz.csv")
    parser.add_argument('--skip', type=int, default=0)
    parser.add_argument('--output', type=str, default="competition.csv")
    parser.add_argument('--finals', type=str, default="finals.csv")
    parser.add_argument('--power', type=str, default="power.csv")
    parser.add_argument('--max_questions', type=int, default=60)
    parser.add_argument('--readable', type=str, default="readable.txt")

    flags = parser.parse_args()

    questions = Questions(flags.questions)
    buzzes = Buzzes(flags.buzzes)
    finals = load_finals(flags.finals)
    power = PowerPositions(flags.power)
    qb_dataset = QuizBowlDataset(guesser_train=True)
    qb_answer_set = {g for g in qb_dataset.training_data()[1]}
    print("Done loading data")
    clear_screen()

    current_players = set()

    if True:
        print("Time for a buzzer check")
        players_needed = [1, 2, 3, 4]
        while len(current_players) < len(players_needed):
            print("Player %i, please buzz in" %
                  min(x for x in players_needed if x not in current_players))
            press = interpret_keypress()
            if press in players_needed:
                os.system("afplay /System/Library/Sounds/Glass.aiff")