Example #1
def sample_n_guesser_correct_questions(question_lookup,
                                       guess_lookup,
                                       n_correct_samples,
                                       n_samples=10):
    sampled_questions_by_correct = defaultdict(list)
    dataset = QuizBowlDataset(guesser_train=True)
    training_data = dataset.training_data()
    # Count how often each answer appears in the guesser training data
    answer_counts = defaultdict(int)
    for ans in training_data[1]:
        answer_counts[ans] += 1

    for n_correct, keys in n_correct_samples.items():
        samples = random.sample(keys, min(n_samples, len(keys)))
        for key in samples:
            qnum, sent, token = key
            page = question_lookup[qnum].page
            text = question_lookup[qnum].get_text(sent, token)
            guesses = guess_lookup[key]
            # Partition guessers by whether they predicted the correct page
            correct_guessers = tuple(guesses[guesses.guess == page].guesser)
            wrong_guessers = tuple(guesses[guesses.guess != page].guesser)
            sampled_questions_by_correct[n_correct].append(
                (text, key, page, answer_counts[page], correct_guessers,
                 wrong_guessers))

    return sampled_questions_by_correct
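
A minimal driver sketch, assuming the project's data structures: `question_lookup` maps question ids to objects exposing `page` and `get_text(sent, token)`, and `guess_lookup` maps `(qnum, sent, token)` keys to pandas DataFrames with `guess` and `guesser` columns. The bucketing below is hypothetical:

from collections import defaultdict

# Hypothetical: bucket keys by how many guessers answered correctly,
# then sample a few questions from each bucket.
n_correct_samples = defaultdict(list)
for key, guesses in guess_lookup.items():
    page = question_lookup[key[0]].page
    n_correct_samples[int((guesses.guess == page).sum())].append(key)

samples = sample_n_guesser_correct_questions(
    question_lookup, guess_lookup, n_correct_samples, n_samples=5)
for n_correct, rows in samples.items():
    print(n_correct, len(rows))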
Example #2
def create_instance_of_map(wikidata_claims_instance_of_path,
                           output_path,
                           n_types=50):
    ds = QuizBowlDataset(guesser_train=True)
    training_data = ds.training_data()
    answers = set(training_data[1])
    answer_counts = Counter(training_data[1])
    with open(wikidata_claims_instance_of_path) as f:
        claims = defaultdict(set)
        for line in f:
            c = json.loads(line)
            if c['title'] is not None:
                title = normalize_wikipedia_title(c['title'])
                if title in answers:
                    c_object = c['object']
                    if c_object in object_blacklist:
                        continue
                    # Merge near-duplicate types and collapse deities to 'god'
                    if c_object in object_merge_map:
                        claims[title].add(object_merge_map[c_object])
                    elif is_god(c_object):
                        claims[title].add('god')
                    else:
                        claims[title].add(c_object)
    # 'literary work' is too generic; drop it when a more specific type exists
    for k in claims:
        if 'literary work' in claims[k] and len(claims[k]) > 1:
            claims[k].remove('literary work')

    total_counts = defaultdict(int)
    class_counts = defaultdict(int)
    answer_types = {}
    for a in answers:
        if a in claims:
            answer_types[a] = claims[a]
            for obj in claims[a]:
                total_counts[obj] += answer_counts[a]
                class_counts[obj] += 1

    sorted_total_counts = sorted(total_counts.items(),
                                 reverse=True,
                                 key=lambda k: k[1])
    top_types = {key for key, count in sorted_total_counts[:n_types]}
    instance_of_map = {}
    for a in answer_types:
        top_class_types = top_types.intersection(answer_types[a])
        if len(top_class_types) == 0:
            instance_of_map[a] = NO_MATCH
        elif len(top_class_types) > 1:
            # Tie-break: keep the type shared by the largest number of answers
            frequency = 0
            final_a_type = None
            for a_type in top_class_types:
                if class_counts[a_type] > frequency:
                    frequency = class_counts[a_type]
                    final_a_type = a_type
            instance_of_map[a] = final_a_type
        else:
            instance_of_map[a] = next(iter(top_class_types))

    with open(output_path, 'wb') as f:
        pickle.dump(instance_of_map, f)
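
A minimal invocation sketch; the claims file is assumed to be JSON lines with `title` and `object` fields, and both paths below are placeholders:

import pickle

# Hypothetical paths; each input line looks like
# {"title": "Albert Einstein", "object": "human"}.
create_instance_of_map('data/instance_of_claims.jsonl',
                       'output/instance_of_map.pickle',
                       n_types=50)

with open('output/instance_of_map.pickle', 'rb') as f:
    instance_of_map = pickle.load(f)
print(len(instance_of_map), 'answers typed')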
Example #3
def get_quizbowl():
    qb_dataset = QuizBowlDataset(guesser_train=True, buzzer_train=False)
    training_data = qb_dataset.training_data()
    train_x, train_y, dev_x, dev_y, i_to_word, class_to_i, i_to_class = preprocess_dataset(
        training_data)
    # Reserve ids 0 and 1 for the unknown and end-of-sequence tokens
    i_to_word = ['<unk>', '<eos>'] + sorted(i_to_word)
    word_to_i = {x: i for i, x in enumerate(i_to_word)}
    train = transform_to_array(zip(train_x, train_y), word_to_i)
    dev = transform_to_array(zip(dev_x, dev_y), word_to_i)
    return train, dev, word_to_i, i_to_class
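
A sketch of how the return values might be consumed; it assumes, as the code above suggests, that `train` and `dev` are lists of (word-id array, label) pairs:

# Hypothetical consumer of get_quizbowl's output.
train, dev, word_to_i, i_to_class = get_quizbowl()
print('vocab size:', len(word_to_i))
print('num classes:', len(i_to_class))
print('train/dev examples:', len(train), len(dev))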
Example #4
def load_quizbowl(folds=c.BUZZER_INPUT_FOLDS) \
                    -> Tuple[Dict[str, int], Dict[str, list]]:
    # merge_dfs()
    log.info('Loading data')
    question_db = QuestionDatabase()
    quizbowl_db = QuizBowlDataset(bc.MIN_ANSWERS,
                                  guesser_train=True,
                                  buzzer_train=True)
    all_questions = question_db.all_questions()
    # Build the answer-option vocabulary once and cache it on disk
    if not os.path.isfile(bc.OPTIONS_DIR):
        log.info('Loading the set of options')
        all_options = set(quizbowl_db.training_data()[1])

        id2option = list(all_options)
        with open(safe_path(bc.OPTIONS_DIR), 'wb') as outfile:
            pickle.dump(id2option, outfile)
    else:
        with open(safe_path(bc.OPTIONS_DIR), 'rb') as infile:
            id2option = pickle.load(infile)
    option2id = {o: i for i, o in enumerate(id2option)}
    num_options = len(id2option)
    log.info('Number of options {0}'.format(len(id2option)))

    guesses_by_fold = dict()
    for fold in folds:
        save_dir = '%s_processed.pickle' % (os.path.join(bc.GUESSES_DIR, fold))
        if os.path.isfile(save_dir):
            with open(safe_path(save_dir), 'rb') as infile:
                guesses_by_fold[fold] = pickle.load(infile)
            log.info('Loading {0} guesses'.format(fold))
            continue

        log.info('Processing {0} guesses'.format(fold))
        guesses = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[fold])

        worker = partial(_process_question, option2id, all_questions)
        inputs = guesses.groupby('qnum')
        guesses_by_fold[fold] = _multiprocess(worker,
                                              inputs,
                                              info='df data',
                                              multi=True)
        # Drop question groups the worker could not process
        guesses_by_fold[fold] = [
            x for x in guesses_by_fold[fold] if x is not None
        ]
        log.info('Kept {0} {1} question groups'.format(
            len(guesses_by_fold[fold]), fold))

        with open(safe_path(save_dir), 'wb') as outfile:
            pickle.dump(guesses_by_fold[fold], outfile)

        log.info('Processed {0} guesses saved to {1}'.format(fold, save_dir))

    return option2id, guesses_by_fold
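
A minimal call sketch; `option2id` maps each answer option to an integer id and `guesses_by_fold` maps fold names to their processed guess data:

# Hypothetical driver using the default folds.
option2id, guesses_by_fold = load_quizbowl()
print('num options:', len(option2id))
for fold, data in guesses_by_fold.items():
    print(fold, len(data), 'question groups')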
Example #5
def create_memory_index():
    dataset = QuizBowlDataset(guesser_train=True)
    training_data = dataset.training_data()
    answers = set(training_data[1])
    cw = CachedWikipedia()

    try:
        # Delete any stale index before rebuilding it from scratch
        Index('mem').delete()
    except Exception:
        pass
    Answer.init()
    all_wiki_pages = [cw[page] for page in answers]
    # Skip answers whose Wikipedia page is missing or empty
    wiki_pages = [p for p in all_wiki_pages if p.content != '']
    sc = create_spark_context()
    sc.parallelize(wiki_pages, 1000).foreach(index_page)
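
The `index_page` worker is defined elsewhere in the project; a plausible sketch, assuming `Answer` is an elasticsearch_dsl document type and that its `page` and `content` field names (hypothetical here) mirror the cached page:

# Hypothetical worker sketch: index a single cached Wikipedia page.
# wiki_page.title is assumed; adjust to the real attribute names.
def index_page(wiki_page):
    Answer(page=wiki_page.title, content=wiki_page.content).save()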
Example #6
    def training_data(self):
        cw = CachedWikipedia(QB_WIKI_LOCATION)
        ds = QuizBowlDataset(2)
        train_data = ds.training_data()
        answer_classes = set(train_data[1])
        train_x = []
        train_y = []

        for page in answer_classes:
            sentences = list(wiki_sentences(cw[page].content))
            sampled_sentences = random.sample(
                sentences, min(len(sentences), self.max_sentences))
            training_examples = list(sampled_sentences)
            train_x.append(training_examples)
            train_y.append(page)
        return train_x, train_y, None
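
A sketch of consuming this method's output, assuming `guesser` is an instance of the class defining `training_data` (hypothetical variable name); the trailing `None` fills the project's (x, y, properties) convention:

# Hypothetical consumer: each train_x[i] is a list of sampled Wikipedia
# sentences labeled with the page title train_y[i].
train_x, train_y, _ = guesser.training_data()
for sentences, page in zip(train_x[:3], train_y[:3]):
    print(page, '->', len(sentences), 'sentences')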
Example #7
    parser.add_argument('--buzzes', type=str, default="ir_buzz.csv")
    parser.add_argument('--skip', type=int, default=0)
    parser.add_argument('--output', type=str, default="competition.csv")
    parser.add_argument('--finals', type=str, default="finals.csv")
    parser.add_argument('--power', type=str, default="power.csv")
    parser.add_argument('--max_questions', type=int, default=60)
    parser.add_argument('--readable', type=str, default="readable.txt")

    flags = parser.parse_args()

    questions = Questions(flags.questions)
    buzzes = Buzzes(flags.buzzes)
    finals = load_finals(flags.finals)
    power = PowerPositions(flags.power)
    qb_dataset = QuizBowlDataset(guesser_train=True)
    qb_answer_set = set(qb_dataset.training_data()[1])
    print("Done loading data")
    clear_screen()

    current_players = set()

    # Buzzer check: prompt until every expected player has buzzed in
    print("Time for a buzzer check")
    players_needed = [1, 2, 3, 4]
    while len(current_players) < len(players_needed):
        print("Player %i, please buzz in" %
              min(x for x in players_needed if x not in current_players))
        press = interpret_keypress()
        if press in players_needed:
            os.system("afplay /System/Library/Sounds/Glass.aiff")
            print("Thanks for buzzing in, player %i!" % press)