def sample_n_guesser_correct_questions(question_lookup, guess_lookup,
                                       n_correct_samples, n_samples=10):
    """For each number of correct guessers, sample up to n_samples questions and
    record which guessers answered them correctly and incorrectly."""
    sampled_questions_by_correct = defaultdict(list)
    dataset = QuizBowlDataset(guesser_train=True)
    training_data = dataset.training_data()

    # Count how often each answer appears in the training data.
    answer_counts = defaultdict(int)
    for ans in training_data[1]:
        answer_counts[ans] += 1

    for n_correct, keys in n_correct_samples.items():
        samples = random.sample(keys, min(n_samples, len(keys)))
        for key in samples:
            qnum, sent, token = key
            page = question_lookup[qnum].page
            text = question_lookup[qnum].get_text(sent, token)
            guesses = guess_lookup[key]
            # Partition the guessers by whether their guess matched the answer page.
            correct_guessers = tuple(guesses[guesses.guess == page].guesser)
            wrong_guessers = tuple(guesses[guesses.guess != page].guesser)
            sampled_questions_by_correct[n_correct].append(
                (text, key, page, answer_counts[page],
                 correct_guessers, wrong_guessers))

    return sampled_questions_by_correct
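# Hypothetical usage sketch (names like `question_lookup` and `hard_keys` are
# illustrative, not from the source): `n_correct_samples` maps a count of correct
# guessers to the (qnum, sent, token) keys with that count, and `guess_lookup`
# maps each key to a DataFrame of guesses.
#
# n_correct_samples = {0: hard_keys, 3: easy_keys}
# by_correct = sample_n_guesser_correct_questions(
#     question_lookup, guess_lookup, n_correct_samples, n_samples=5)
# for text, key, page, n_train, correct, wrong in by_correct[0]:
#     print(page, correct, wrong)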
def create_instance_of_map(wikidata_claims_instance_of_path, output_path, n_types=50):
    """Map each quiz bowl answer to its most representative Wikidata instance-of type."""
    ds = QuizBowlDataset(guesser_train=True)
    training_data = ds.training_data()
    answers = set(training_data[1])
    answer_counts = Counter(training_data[1])

    # Collect the instance-of claims for every answer, skipping blacklisted
    # objects, merging aliased objects, and collapsing deities into 'god'.
    with open(wikidata_claims_instance_of_path) as f:
        claims = defaultdict(set)
        for line in f:
            c = json.loads(line)
            if c['title'] is not None:
                title = normalize_wikipedia_title(c['title'])
                if title in answers:
                    c_object = c['object']
                    if c_object in object_blacklist:
                        continue
                    if c_object in object_merge_map:
                        claims[title].add(object_merge_map[c_object])
                    elif is_god(c_object):
                        claims[title].add('god')
                    else:
                        claims[title].add(c_object)

    # 'literary work' is too generic; drop it whenever a more specific type exists.
    for k in claims:
        if 'literary work' in claims[k] and len(claims[k]) > 1:
            claims[k].remove('literary work')

    total_counts = defaultdict(int)
    class_counts = defaultdict(int)
    answer_types = {}
    for a in answers:
        if a in claims:
            answer_types[a] = claims[a]
            for obj in claims[a]:
                total_counts[obj] += answer_counts[a]
                class_counts[obj] += 1

    # Keep only the n_types types that cover the most training examples.
    sorted_total_counts = sorted(total_counts.items(), reverse=True, key=lambda k: k[1])
    top_types = {key for key, count in sorted_total_counts[:n_types]}

    instance_of_map = {}
    for a in answer_types:
        top_class_types = top_types.intersection(answer_types[a])
        if len(top_class_types) == 0:
            instance_of_map[a] = NO_MATCH
        elif len(top_class_types) > 1:
            # Break ties in favor of the type with the most distinct answers.
            frequency = 0
            final_a_type = None
            for a_type in top_class_types:
                if class_counts[a_type] > frequency:
                    frequency = class_counts[a_type]
                    final_a_type = a_type
            instance_of_map[a] = final_a_type
        else:
            instance_of_map[a] = next(iter(top_class_types))

    with open(output_path, 'wb') as f:
        pickle.dump(instance_of_map, f)
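# Usage sketch, assuming each line of the claims file is JSON of the form
# {"title": <wikipedia title>, "object": <instance-of value>} (the file names
# and answer title below are illustrative):
#
# create_instance_of_map('claims_instance_of.jsonl', 'instance_of.pickle')
# with open('instance_of.pickle', 'rb') as f:
#     instance_of_map = pickle.load(f)
# instance_of_map.get('Johannes_Brahms')  # one of the top types, or NO_MATCH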
def get_quizbowl():
    """Load and preprocess the quiz bowl dataset into train/dev arrays."""
    qb_dataset = QuizBowlDataset(guesser_train=True, buzzer_train=False)
    training_data = qb_dataset.training_data()
    train_x, train_y, dev_x, dev_y, i_to_word, class_to_i, i_to_class = preprocess_dataset(
        training_data)

    # Reserve indices 0 and 1 for the unknown and end-of-sequence tokens.
    i_to_word = ['<unk>', '<eos>'] + sorted(i_to_word)
    word_to_i = {x: i for i, x in enumerate(i_to_word)}

    train = transform_to_array(zip(train_x, train_y), word_to_i)
    dev = transform_to_array(zip(dev_x, dev_y), word_to_i)
    return train, dev, word_to_i, i_to_class
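# Usage sketch: `transform_to_array` appears to match the helper from Chainer's
# text-classification example, so the returned datasets should feed a standard
# Chainer iterator. The batch size below is an arbitrary illustration.
#
# import chainer
#
# train, dev, word_to_i, i_to_class = get_quizbowl()
# train_iter = chainer.iterators.SerialIterator(train, batch_size=32)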
def load_quizbowl(folds=c.BUZZER_INPUT_FOLDS) \
        -> Tuple[Dict[str, int], Dict[str, list]]:
    # merge_dfs()
    log.info('Loading data')
    question_db = QuestionDatabase()
    quizbowl_db = QuizBowlDataset(bc.MIN_ANSWERS, guesser_train=True, buzzer_train=True)
    all_questions = question_db.all_questions()

    # Build (or load a cached copy of) the mapping between answer options and ids.
    if not os.path.isfile(bc.OPTIONS_DIR):
        log.info('Loading the set of options')
        all_options = set(quizbowl_db.training_data()[1])
        id2option = list(all_options)
        with open(safe_path(bc.OPTIONS_DIR), 'wb') as outfile:
            pickle.dump(id2option, outfile)
    else:
        with open(safe_path(bc.OPTIONS_DIR), 'rb') as infile:
            id2option = pickle.load(infile)
    option2id = {o: i for i, o in enumerate(id2option)}
    num_options = len(id2option)
    log.info('Number of options {0}'.format(num_options))

    guesses_by_fold = dict()
    for fold in folds:
        save_path = '%s_processed.pickle' % (os.path.join(bc.GUESSES_DIR, fold))
        if os.path.isfile(save_path):
            # Reuse the cached processed guesses for this fold.
            with open(safe_path(save_path), 'rb') as infile:
                guesses_by_fold[fold] = pickle.load(infile)
            log.info('Loading {0} guesses'.format(fold))
            continue

        log.info('Processing {0} guesses'.format(fold))
        guesses = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[fold])
        worker = partial(_process_question, option2id, all_questions)
        inputs = guesses.groupby('qnum')
        guesses_by_fold[fold] = _multiprocess(worker, inputs, info='df data', multi=True)
        guesses_by_fold[fold] = [x for x in guesses_by_fold[fold] if x is not None]
        log.info('{0} questions in fold {1}'.format(len(guesses_by_fold[fold]), fold))
        with open(safe_path(save_path), 'wb') as outfile:
            pickle.dump(guesses_by_fold[fold], outfile)
        log.info('Processed {0} guesses saved to {1}'.format(fold, save_path))

    return option2id, guesses_by_fold
def create_memory_index():
    """Index the Wikipedia page of every training answer into Elasticsearch."""
    dataset = QuizBowlDataset(guesser_train=True)
    training_data = dataset.training_data()
    answers = set(training_data[1])
    cw = CachedWikipedia()

    # Drop any existing index so reindexing starts from a clean slate; ignore
    # the failure if the index does not exist yet.
    try:
        Index('mem').delete()
    except Exception:
        pass
    Answer.init()

    # Skip answers whose Wikipedia page has no content.
    all_wiki_pages = [cw[page] for page in answers]
    wiki_pages = [p for p in all_wiki_pages if p.content != '']

    # Index pages in parallel across 1000 Spark partitions.
    sc = create_spark_context()
    sc.parallelize(wiki_pages, 1000).foreach(index_page)
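# `Answer` and `index_page` are defined elsewhere in the repo; a minimal sketch
# of what they could look like with elasticsearch_dsl (the field names are
# assumptions, not the repo's actual schema):

from elasticsearch_dsl import DocType, Text


class Answer(DocType):
    page = Text()
    content = Text()

    class Meta:
        index = 'mem'


def index_page(wiki_page):
    # Store one document per Wikipedia page; runs on each Spark worker.
    Answer(page=wiki_page.title, content=wiki_page.content).save()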
def training_data(self):
    """Build training examples by sampling sentences from each answer's Wikipedia page."""
    cw = CachedWikipedia(QB_WIKI_LOCATION)
    ds = QuizBowlDataset(2)
    train_data = ds.training_data()
    answer_classes = set(train_data[1])

    train_x = []
    train_y = []
    for page in answer_classes:
        # Sample at most max_sentences sentences from the page content.
        sentences = list(wiki_sentences(cw[page].content))
        sampled_sentences = random.sample(
            sentences, min(len(sentences), self.max_sentences))
        training_examples = []
        for sentence in sampled_sentences:
            training_examples.append(sentence)
        train_x.append(training_examples)
        train_y.append(page)

    # The third element (per-example properties) is unused here.
    return train_x, train_y, None
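# `wiki_sentences` is assumed to split raw page text into sentences; a minimal
# stand-in using NLTK's sentence tokenizer (the real implementation may differ):

import nltk


def wiki_sentences(content):
    # Yield each sentence of a Wikipedia page's plain-text content.
    for sentence in nltk.sent_tokenize(content):
        yield sentence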
parser.add_argument('--buzzes', type=str, default="ir_buzz.csv")
parser.add_argument('--skip', type=int, default=0)
parser.add_argument('--output', type=str, default="competition.csv")
parser.add_argument('--finals', type=str, default="finals.csv")
parser.add_argument('--power', type=str, default="power.csv")
parser.add_argument('--max_questions', type=int, default=60)
parser.add_argument('--readable', type=str, default="readable.txt")
flags = parser.parse_args()

questions = Questions(flags.questions)
buzzes = Buzzes(flags.buzzes)
finals = load_finals(flags.finals)
power = PowerPositions(flags.power)

qb_dataset = QuizBowlDataset(guesser_train=True)
qb_answer_set = {g for g in qb_dataset.training_data()[1]}
print("Done loading data")
clear_screen()

# Buzzer check: loop until all four players have buzzed in at least once.
current_players = set()
print("Time for a buzzer check")
players_needed = [1, 2, 3, 4]
while len(current_players) < len(players_needed):
    print("Player %i, please buzz in" %
          min(x for x in players_needed if x not in current_players))
    press = interpret_keypress()
    if press in players_needed:
        os.system("afplay /System/Library/Sounds/Glass.aiff")
        print("Thanks for buzzing in, player %i!" % press)
        current_players.add(press)  # register the player so the check can finish