def run(self):
    """Build the torchtext train/val/dev splits from the qanta train/dev sets."""
    def _load_fold(dataset_path, fold):
        # Read a dataset file and keep only questions from the given fold.
        with open(dataset_path) as handle:
            return [q for q in json.load(handle)['questions'] if q['fold'] == fold]

    def _write(dest_path, split_questions):
        # Serialize questions back into the standard qanta JSON envelope.
        with open(dest_path, 'w') as handle:
            json.dump(format_qanta_json(split_questions, DS_VERSION), handle)

    all_guess_train = _load_fold(QANTA_TRAIN_DATASET_PATH, GUESSER_TRAIN_FOLD)
    # Fixed seed so the 90/10 train/val split is reproducible.
    guess_train, guess_val = train_test_split(all_guess_train, random_state=42, train_size=.9)
    guess_dev = _load_fold(QANTA_DEV_DATASET_PATH, GUESSER_DEV_FOLD)
    _write(QANTA_TORCH_TRAIN_LOCAL_PATH, guess_train)
    _write(QANTA_TORCH_VAL_LOCAL_PATH, guess_val)
    _write(QANTA_TORCH_DEV_LOCAL_PATH, guess_dev)
def run(self):
    """Split the mapped dataset (questions with pages) into train/dev/test files."""
    with open(QANTA_MAPPED_DATASET_PATH) as f:
        questions = [q for q in json.load(f)['questions'] if q['page'] is not None]
    # Fold names embed the split name as a substring (e.g. 'guesstrain').
    splits = {
        QANTA_TRAIN_DATASET_PATH: [q for q in questions if 'train' in q['fold']],
        QANTA_DEV_DATASET_PATH: [q for q in questions if 'dev' in q['fold']],
        QANTA_TEST_DATASET_PATH: [q for q in questions if 'test' in q['fold']],
    }
    for out_path, split_questions in splits.items():
        with open(out_path, 'w') as f:
            json.dump(format_qanta_json(split_questions, DS_VERSION), f)
def ingestion_cli(start_idx):
    """
    Input format is for jason's HS project, but can be changed. The original
    code for answer mapping was designed to map everything over multiple
    passes, not yield a callable function to map an arbitrary answer line to a
    QB answer. Rather than implement this, a hacky way to achieve similar
    functionality to map a new dataset is to combine already mapped questions
    with new questions, have the code map answer for both at the same time,
    then only use the mappings from the new questions. There are some edge
    cases, but this should in general work (hopefully).
    """
    with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
        unmapped_questions = json.load(f)['questions']
    with open('data/external/high_school_project/quizdb-20190313164802.json') as f:
        raw_questions = json.load(f)['data']['tossups']
    new_questions = []
    # enumerate(start=start_idx) replaces the manual idx counter.
    for idx, q in enumerate(raw_questions, start=start_idx):
        new_questions.append({
            'qanta_id': idx,
            'text': q['text'],
            'answer': q['answer'],
            'page': None,
            'category': None,
            'subcategory': None,
            'tournament': q['tournament']['name'],
            'difficulty': q['tournament']['difficulty'],
            'year': int(q['tournament']['year']),
            'proto_id': None,
            'qdb_id': q['id'],
            'dataset': 'quizdb.org',
            'fold': 'guesstest'
        })
    # Map answers for old + new questions together; only the mappings of the
    # new questions are ultimately used (see docstring).
    questions = unmapped_questions + new_questions
    answer_map, amb_answer_map, unbound_answers, report = create_answer_map(questions)
    with safe_open('data/external/high_school_project/automatic_report.json', 'w') as f:
        json.dump(report, f)
    write_answer_map(
        answer_map, amb_answer_map, unbound_answers,
        'data/external/high_school_project/answer_map.json',
        'data/external/high_school_project/unbound_answers.json'
    )
    with open('data/internal/page_assignment/unmappable.yaml') as f:
        # Fix: yaml.load without an explicit Loader is deprecated and can
        # construct arbitrary objects; this file only needs plain YAML.
        unmappable = yaml.safe_load(f)
    page_assigner = PageAssigner()
    mapping_report = unmapped_to_mapped_questions(
        new_questions, answer_map, amb_answer_map, unmappable, page_assigner
    )
    add_sentences_(new_questions)
    with open('data/external/high_school_project/qanta.acf-regionals-2018.json', 'w') as f:
        json.dump(format_qanta_json(new_questions, DS_VERSION), f)
    with open('data/external/high_school_project/mapping_report.json', 'w') as f:
        json.dump(mapping_report, f)
def adversarial_to_json(adversarial_json, json_dir):
    """Convert adversarial (trickme) questions into a qanta 'expo' JSON file.

    Answers are normalized (stripped, spaces -> underscores) and matched
    against the mapped-question pages of the qanta database to recover the
    canonical page capitalization; unmatched answers are logged and kept as-is.
    """
    from qanta.datasets.quiz_bowl import QantaDatabase
    db = QantaDatabase()
    # Lowercased page title -> canonical page title.
    lookup = {q.page.lower(): q.page for q in db.mapped_questions}
    with open(adversarial_json) as f:
        questions = json.load(f)
    rows = []
    for i, q in enumerate(questions):
        answer = q['answer'].strip().replace(' ', '_')
        # Fix: the lookup keys are lowercased, so probe with the lowercased
        # answer — previously any mixed-case answer could never match.
        key = answer.lower()
        if key in lookup:
            answer = lookup[key]
        else:
            log.warning(f'Could not find: {answer}')
        rows.append({
            'text': q['question'].strip(),
            'page': answer,
            'answer': '',
            'qanta_id': 1000000 + i,
            'proto_id': None,
            'qdb_id': None,
            'category': '',
            'subcategory': '',
            'tournament': '',
            'difficulty': '',
            'dataset': 'adversarial',
            'year': -1,
            'fold': 'expo',
            'gameplay': False
        })
    from qanta.ingestion.preprocess import add_sentences_, format_qanta_json
    from qanta.util.constants import DS_VERSION
    add_sentences_(rows, parallel=False)
    with open(path.join(json_dir, f'qanta.expo.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(rows, DS_VERSION), f)
def run(self):
    """Tokenize sentences and add answer prompts to the unmapped dataset."""
    with open(QANTA_UNMAPPED_DATASET_PATH) as in_file:
        dataset = json.load(in_file)
    questions = dataset["questions"]
    # Both helpers mutate the question dicts in place.
    add_sentences_(questions)
    add_answer_prompts_(questions)
    serialized = format_qanta_json(questions, DS_VERSION)
    with open(QANTA_PREPROCESSED_DATASET_PATH, "w") as out_file:
        json.dump(serialized, out_file)
def run(self):
    """Write train/dev/test dataset files from the mapped questions."""
    with open(QANTA_MAPPED_DATASET_PATH) as f:
        mapped = [q for q in json.load(f)['questions'] if q['page'] is not None]
    # Each fold name contains its split name as a substring.
    for out_path, fold_key in [
        (QANTA_TRAIN_DATASET_PATH, 'train'),
        (QANTA_DEV_DATASET_PATH, 'dev'),
        (QANTA_TEST_DATASET_PATH, 'test'),
    ]:
        subset = [q for q in mapped if fold_key in q['fold']]
        with open(out_path, 'w') as f:
            json.dump(format_qanta_json(subset, DS_VERSION), f)
def run(self):
    """Create guesser train/val/dev splits for torchtext consumption."""
    with open(QANTA_TRAIN_DATASET_PATH) as f:
        train_questions = json.load(f)['questions']
    all_guess_train = [q for q in train_questions if q['fold'] == GUESSER_TRAIN_FOLD]
    # Deterministic 90/10 train/val split.
    guess_train, guess_val = train_test_split(all_guess_train, random_state=42, train_size=.9)
    with open(QANTA_DEV_DATASET_PATH) as f:
        dev_questions = json.load(f)['questions']
    guess_dev = [q for q in dev_questions if q['fold'] == GUESSER_DEV_FOLD]
    outputs = (
        (QANTA_TORCH_TRAIN_LOCAL_PATH, guess_train),
        (QANTA_TORCH_VAL_LOCAL_PATH, guess_val),
        (QANTA_TORCH_DEV_LOCAL_PATH, guess_dev),
    )
    for dest, split in outputs:
        with open(dest, 'w') as f:
            json.dump(format_qanta_json(split, DS_VERSION), f)
def run(self):
    """Filter mapped questions to those with pages and write the fold splits."""
    with open(QANTA_MAPPED_DATASET_PATH) as f:
        questions = [q for q in json.load(f)["questions"] if q["page"] is not None]

    def _subset(fold_key):
        # Fold names embed the split name ('train'/'dev'/'test') as a substring.
        return [q for q in questions if fold_key in q["fold"]]

    for dest, fold_key in (
        (QANTA_TRAIN_DATASET_PATH, "train"),
        (QANTA_DEV_DATASET_PATH, "dev"),
        (QANTA_TEST_DATASET_PATH, "test"),
    ):
        with open(dest, "w") as f:
            json.dump(format_qanta_json(_subset(fold_key), DS_VERSION), f)
def run(self):
    """Assign gameplay-based folds and write the folded dataset."""
    with open(QANTA_PREPROCESSED_DATASET_PATH) as handle:
        questions = json.load(handle)['questions']
    with open(PROTOBOWL_QUESTION_PLAYER_COUNTS) as handle:
        player_counts = json.load(handle)
    # assign_folds_ rewrites each question's fold field in place.
    assign_folds_(questions, player_counts)
    with open(QANTA_FOLDED_DATASET_PATH, 'w') as handle:
        json.dump(format_qanta_json(questions, DS_VERSION), handle)
def run(self):
    """Recompute question folds from protobowl gameplay data, then persist."""
    # The two reads are independent, so order does not matter.
    with open(PROTOBOWL_QUESTION_PLAYER_COUNTS) as f:
        question_player_counts = json.load(f)
    with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
        qanta_questions = json.load(f)['questions']
    assign_folds_(qanta_questions, question_player_counts)
    serialized = format_qanta_json(qanta_questions, DS_VERSION)
    with open(QANTA_FOLDED_DATASET_PATH, 'w') as f:
        json.dump(serialized, f)
def run(self):
    """Parse protobowl and quizdb sources, merge them, write the unmapped dataset."""
    protobowl_questions = Protobowl.parse_tossups(PROTOBOWL_TOSSUPS_PATH)
    # quizdb tossups need the tournament/category/subcategory tables first.
    tournaments = QuizdbOrg.parse_tournaments(QDB_TOURNAMENTS_PATH)
    categories = QuizdbOrg.parse_categories(QDB_CATEGORIES_PATH)
    subcategories = QuizdbOrg.parse_subcategories(QDB_SUBCATEGORIES_PATH)
    quizdb_questions = QuizdbOrg.parse_tossups(
        tournaments, categories, subcategories, QDB_TOSSUPS_PATH
    )
    merged = merge_datasets(protobowl_questions, quizdb_questions)
    with open(safe_path(QANTA_UNMAPPED_DATASET_PATH), 'w') as out_file:
        json.dump(format_qanta_json(merged, DS_VERSION), out_file)
def format_additional():
    """
    Additional questions were added to dataset, this processes the csv version
    to match the dataset format while verifying that page info is valid.
    """
    # Verify inputs against known checksums before trusting their contents.
    titles_path = "data/external/wikipedia/wikipedia-titles.2018.04.18.json"
    titles_checksum = "6fa134836b3a7e3b562cdaa8ad353f2d"
    verify_checksum(titles_checksum, titles_path)
    with open(titles_path) as f:
        titles = set(json.load(f))

    csv_path = "data/external/datasets/trick-additional.csv"
    trick_checksum = "905594aab776ddb10b0d7f36d30633a2"
    verify_checksum(trick_checksum, csv_path)
    with open(csv_path) as f:
        rows = list(csv.reader(f))[1:]  # drop the header row

    questions = []
    for _, text, page in rows:
        page = page.replace(" ", "_")
        if page not in titles:
            log.info(f"Page not in titles: {page}")
        questions.append(
            {
                "text": text,
                "answer": page,
                "page": page,
                "fold": "advtest",
                "year": 2018,
                "dataset": "trickme",
                "proto_id": None,
                "qdb_id": None,
                "difficulty": None,
                "category": None,
                "subcategory": None,
                "qanta_id": None,
                "tournament": TOURNAMENT_DEC_15,
                "gameplay": False,
                "interface": "ir-r2",
            }
        )
    add_sentences_(questions, parallel=False)
    dataset = format_qanta_json(questions, "2018.04.18")
    # Record input checksums so downstream consumers can trace provenance.
    dataset["dependent_checksums"] = {
        "trick-additional.csv": trick_checksum,
        "wikipedia-titles.2018.04.18.json": titles_checksum,
    }
    path_formatted = "data/external/datasets/qanta.trick-additional-ir-round2.json"
    with open(path_formatted, "w") as f:
        json.dump(dataset, f)
    log.info(f"File: {path_formatted} Checksum: {md5sum(path_formatted)}")
def format_additional():
    """
    Additional questions were added to dataset, this processes the csv version
    to match the dataset format while verifying that page info is valid.
    """
    titles_checksum = '6fa134836b3a7e3b562cdaa8ad353f2d'
    verify_checksum(
        titles_checksum, 'data/external/wikipedia/wikipedia-titles.2018.04.18.json')
    with open('data/external/wikipedia/wikipedia-titles.2018.04.18.json') as f:
        titles = set(json.load(f))
    trick_checksum = '905594aab776ddb10b0d7f36d30633a2'
    verify_checksum(trick_checksum, 'data/external/datasets/trick-additional.csv')
    with open('data/external/datasets/trick-additional.csv') as f:
        data_rows = list(csv.reader(f))[1:]  # first row is the header

    # Fields shared by every additional question.
    template = {
        'fold': 'advtest',
        'year': 2018,
        'dataset': 'trickme',
        'proto_id': None,
        'qdb_id': None,
        'difficulty': None,
        'category': None,
        'subcategory': None,
        'qanta_id': None,
        'tournament': TOURNAMENT_DEC_15,
        'gameplay': False,
        'interface': 'ir-r2'
    }
    questions = []
    for _, text, page in data_rows:
        page = page.replace(' ', '_')
        if page not in titles:
            log.info(f'Page not in titles: {page}')
        questions.append(dict(template, text=text, answer=page, page=page))
    add_sentences_(questions, parallel=False)
    dataset = format_qanta_json(questions, '2018.04.18')
    # Track input checksums alongside the generated data.
    dataset['dependent_checksums'] = {
        'trick-additional.csv': trick_checksum,
        'wikipedia-titles.2018.04.18.json': titles_checksum
    }
    path_formatted = 'data/external/datasets/qanta.trick-additional-ir-round2.json'
    with open(path_formatted, 'w') as f:
        json.dump(dataset, f)
    log.info(f'File: {path_formatted} Checksum: {md5sum(path_formatted)}')
def run(self):
    """Merge protobowl and quizdb tossups into the unmapped qanta dataset."""
    # Build the quizdb lookup tables before parsing its tossups.
    quizdb_tournaments = QuizdbOrg.parse_tournaments(QDB_TOURNAMENTS_PATH)
    quizdb_categories = QuizdbOrg.parse_categories(QDB_CATEGORIES_PATH)
    quizdb_subcategories = QuizdbOrg.parse_subcategories(QDB_SUBCATEGORIES_PATH)
    quizdb_questions = QuizdbOrg.parse_tossups(
        quizdb_tournaments,
        quizdb_categories,
        quizdb_subcategories,
        QDB_TOSSUPS_PATH,
    )
    protobowl_questions = Protobowl.parse_tossups(PROTOBOWL_TOSSUPS_PATH)
    qanta_questions = merge_datasets(protobowl_questions, quizdb_questions)
    with open(safe_path(QANTA_UNMAPPED_DATASET_PATH), 'w') as f:
        json.dump(format_qanta_json(qanta_questions, DS_VERSION), f)
def split_ds(id_model_path, expo_path, version, rnn_out, es_out):
    """Partition expo questions by which model ('es' or 'rnn') owns each id."""
    with open(id_model_path) as f:
        lookup = json.load(f)
    with open(expo_path) as f:
        questions = json.load(f)['questions']
    buckets = {'es': [], 'rnn': []}
    for q in questions:
        # The id->model map is keyed by stringified qanta_id.
        source = lookup[str(q['qanta_id'])]
        if source not in buckets:
            raise ValueError('Unhandled question source')
        buckets[source].append(q)
    with open(rnn_out, 'w') as f:
        json.dump(format_qanta_json(buckets['rnn'], version), f)
    with open(es_out, 'w') as f:
        json.dump(format_qanta_json(buckets['es'], version), f)
def split_ds(id_model_path, expo_path, version, rnn_out, es_out):
    """Split expo questions between the ES and RNN models using an id map."""
    with open(id_model_path) as id_file:
        id_to_model = json.load(id_file)
    with open(expo_path) as expo_file:
        all_questions = json.load(expo_file)['questions']

    def _write(dest, split_questions):
        with open(dest, 'w') as handle:
            json.dump(format_qanta_json(split_questions, version), handle)

    es_split = []
    rnn_split = []
    for question in all_questions:
        model = id_to_model[str(question['qanta_id'])]
        if model == 'es':
            es_split.append(question)
        elif model == 'rnn':
            rnn_split.append(question)
        else:
            raise ValueError('Unhandled question source')
    _write(rnn_out, rnn_split)
    _write(es_out, es_split)
def split_ds(id_model_path, expo_path, version, rnn_out, es_out):
    """Route each expo question to the ES or RNN output file by qanta_id."""
    with open(id_model_path) as f:
        lookup = json.load(f)
    with open(expo_path) as f:
        questions = json.load(f)["questions"]
    # Validate all sources before writing anything.
    for q in questions:
        if lookup[str(q["qanta_id"])] not in ("es", "rnn"):
            raise ValueError("Unhandled question source")
    es_questions = [q for q in questions if lookup[str(q["qanta_id"])] == "es"]
    rnn_questions = [q for q in questions if lookup[str(q["qanta_id"])] == "rnn"]
    with open(rnn_out, "w") as f:
        json.dump(format_qanta_json(rnn_questions, version), f)
    with open(es_out, "w") as f:
        json.dump(format_qanta_json(es_questions, version), f)
def run(self):
    """Apply the answer map to folded questions; write mapped dataset + report."""
    with open(ANSWER_MAP_PATH) as f:
        content = json.load(f)
    answer_map = content['answer_map']
    ambig_answer_map = content['ambig_answer_map']
    with open(QANTA_FOLDED_DATASET_PATH) as f:
        qanta_questions = json.load(f)['questions']
    with open('data/internal/page_assignment/unmappable.yaml') as f:
        # Fix: yaml.load without an explicit Loader is deprecated and can
        # execute arbitrary constructors; this file only needs plain YAML.
        unmappable = yaml.safe_load(f)
    page_assigner = PageAssigner()
    # Mutates qanta_questions in place, returning a report of the mapping.
    mapping_report = unmapped_to_mapped_questions(
        qanta_questions, answer_map, ambig_answer_map, unmappable, page_assigner
    )
    with open(QANTA_MAPPED_DATASET_PATH, 'w') as f:
        json.dump(format_qanta_json(qanta_questions, DS_VERSION), f)
    with open(QANTA_MAP_REPORT_PATH, 'w') as f:
        json.dump(mapping_report, f)
def run(self):
    """Map folded questions to wiki pages and persist dataset plus report."""
    with open(ANSWER_MAP_PATH) as f:
        content = json.load(f)
    answer_map = content['answer_map']
    ambig_answer_map = content['ambig_answer_map']
    with open(QANTA_FOLDED_DATASET_PATH) as f:
        qanta_questions = json.load(f)['questions']
    with open('data/internal/page_assignment/unmappable.yaml') as f:
        # Fix: use safe_load — yaml.load without a Loader is deprecated/unsafe.
        unmappable = yaml.safe_load(f)
    page_assigner = PageAssigner()
    mapping_report = unmapped_to_mapped_questions(
        qanta_questions, answer_map, ambig_answer_map, unmappable, page_assigner
    )
    with open(QANTA_MAPPED_DATASET_PATH, 'w') as f:
        json.dump(format_qanta_json(qanta_questions, DS_VERSION), f)
    with open(QANTA_MAP_REPORT_PATH, 'w') as f:
        json.dump(mapping_report, f)
def parse_tossups(
        qanta_ds_path='data/external/datasets/qanta.mapped.2018.04.18.json',
        trick_path='data/external/datasets/trickme_questions_12-15-2018.json',
        start_idx=2000000,
        version='2018.04.18'):
    """Convert trickme (adversarial) tossups into a qanta-format dataset.

    Answers are resolved to wiki pages via the mapped qanta dataset, with
    DIRECT_MAP providing manual per-index overrides; any answer that cannot
    be resolved raises ValueError. Returns the formatted dataset dict.
    """
    with open(qanta_ds_path) as f:
        qanta_ds = json.load(f)['questions']
    # Canonical page titles, plus a lowercase/underscore lookup for matching.
    answer_set = {q['page'] for q in qanta_ds if q['page'] is not None}
    lookup = {a.lower().replace(' ', '_'): a for a in answer_set}
    with open(trick_path) as f:
        questions = []
        for i, q in enumerate(json.load(f)):
            # Field names vary between exports ('Question' vs 'question').
            if 'Question' in q:
                text = q['Question']
            elif 'question' in q:
                text = q['question']
            else:
                raise ValueError(
                    'Could not find question field in question')
            if 'Answer' in q:
                answer = q['Answer'].replace(' ', '_')
            elif 'answer' in q:
                answer = q['answer'].replace(' ', '_')
            else:
                raise ValueError('Could not find answer field in question')
            # Silently drop empty questions/answers.
            if len(answer) == 0 or len(text) == 0:
                continue
            if i in DIRECT_MAP:
                # Manual override: (expected_answer, page) keyed by index.
                # A None page means the question is deliberately excluded.
                m_ans, m_page = DIRECT_MAP[i]
                if m_ans == answer:
                    if m_page is None:
                        continue  # Skip this explicitly
                    elif m_page in answer_set:
                        page = m_page
                    else:
                        raise ValueError(
                            f'{m_page} not in answer set\n Q: {text}')
                else:
                    raise ValueError(f'Mapping error: {answer} != {m_ans}')
            elif answer in lookup:
                page = lookup[answer]
            else:
                raise ValueError(
                    f'Could not find: idx: {i} Q:"{text}" \nA: "{answer}"')
            q_out = {
                'text': text,
                'answer': answer,
                'page': page,
                'fold': 'advtest',
                'year': 2018,
                'dataset': 'trickme',
                'proto_id': None,
                'qdb_id': None,
                'trickme_id': i,
                'difficulty': None,
                'category': None,
                'subcategory': None,
                'qanta_id': start_idx + i,
                'tournament': 'Adversarial Question Writing UMD December 15',
                'gameplay': False
            }
            # Optional metadata carried through when present.
            if 'email' in q:
                q_out['author_email'] = q['email']
            if 'category' in q and q['category'] != "None":
                q_out['category'] = q['category']
            questions.append(q_out)
    # Mutates questions in place to add sentence tokenization data.
    add_sentences_(questions, parallel=True)
    dataset = format_qanta_json(questions, version)
    return dataset
def trick_to_ds(
    answer_map_path,
    qanta_ds_path,
    wiki_titles_path,
    trick_path,
    id_model_path,
    out_path,
    start_idx,
    version,
    fold,
    year,
    tournament,
    separate_rounds,
):
    """Convert trickme (adversarial) questions into qanta dataset format.

    Answers resolve to wiki pages in order: exact title / known page, the
    lowercased-underscore lookup, then the manual answer map (a None entry
    means skip explicitly). Unresolvable answers are logged and skipped.
    Writes the dataset to out_path (or one file per round when
    separate_rounds is set) and a qanta_id -> model map to id_model_path.
    """
    with open(answer_map_path) as f:
        # Fix: yaml.load without an explicit Loader is deprecated and can
        # construct arbitrary objects; this map only needs plain YAML.
        answer_map = yaml.safe_load(f)
    with open(qanta_ds_path) as f:
        qanta_ds = json.load(f)["questions"]
    answer_set = {q["page"] for q in qanta_ds if q["page"] is not None}
    with open(wiki_titles_path) as f:
        titles = set(json.load(f))
    lookup = {a.lower().replace(" ", "_"): a for a in answer_set}
    id_model_map = {}
    skipped = 0
    with open(trick_path) as f:
        questions = []
        for i, q in enumerate(json.load(f)):
            # Field names vary across exports ("Question" vs "question").
            if "Question" in q:
                text = q["Question"]
            elif "question" in q:
                text = q["question"]
            else:
                raise ValueError("Could not find question field in question")
            if "Answer" in q:
                answer = q["Answer"].replace(" ", "_")
            elif "answer" in q:
                answer = q["answer"].replace(" ", "_")
            else:
                raise ValueError("Could not find answer field in question")
            if "trick_id" in q:
                trick_id = q["trick_id"]
            else:
                trick_id = None
            if len(answer) == 0:
                raise ValueError(f"Empty answer for trick_id={trick_id}")
            elif len(text) == 0:
                raise ValueError(f"Empty text for trick_id={trick_id}")
            if answer in titles or answer in answer_set:
                page = answer
            elif answer in lookup:
                page = lookup[answer]
            elif answer in answer_map:
                m_page = answer_map[answer]
                if m_page is None:
                    if "model" in q:
                        log.info(
                            f'Explicitly Skipping {answer}, int-model: {q["model"]}'
                        )
                    else:
                        log.info(f"Explicitly Skipping {answer}")
                    continue  # Skip this explicitly
                elif m_page in answer_set:
                    page = m_page
                else:
                    raise ValueError(f"{m_page} not in answer set\n Q: {text}")
            else:
                log.error(
                    f'Unhandled Skipping: idx: {i} trick_id: {trick_id} A: "{answer}"\nQ:"{text}"'
                )
                skipped += 1
                continue
            q_out = {
                "text": text,
                "answer": answer,
                "page": page,
                "fold": fold,
                "year": year,
                "dataset": "trickme",
                "proto_id": None,
                "qdb_id": None,
                "difficulty": None,
                "category": None,
                "subcategory": None,
                "qanta_id": start_idx + i,
                "tournament": tournament,
                "gameplay": False,
                "trick_id": trick_id,
            }
            # Optional metadata carried through when present.
            if "email" in q:
                q_out["author_email"] = q["email"]
            if "category" in q and q["category"] != "None":
                q_out["category"] = q["category"]
            if "round" in q:
                q_out["round"] = q["round"]
            if "model" in q:
                id_model_map[q_out["qanta_id"]] = q["model"]
            questions.append(q_out)
    log.info(f"Total: {len(questions)} Skipped: {skipped}")
    add_sentences_(questions, parallel=False)
    if separate_rounds:
        rounds = defaultdict(list)
        for q in questions:
            rounds[q["round"]].append(q)
        for name, round_questions in rounds.items():
            dataset = format_qanta_json(round_questions, version)
            # Insert the round name before the .json suffix (or append both
            # when out_path has no .json suffix). The original duplicated the
            # extend() call in both branches; collapsed here.
            file_name = out_path.split(".")
            if file_name[-1] == "json":
                file_name.pop()
            file_name.extend([name, "json"])
            round_out_path = ".".join(file_name)
            log.info(f"Writing round {name} to {round_out_path}")
            with open(round_out_path, "w") as f:
                json.dump(dataset, f)
    else:
        dataset = format_qanta_json(questions, version)
        with open(out_path, "w") as f:
            json.dump(dataset, f)
    with open(id_model_path, "w") as f:
        json.dump(id_model_map, f)
def trick_to_ds(answer_map_path, qanta_ds_path, wiki_titles_path, trick_path,
                id_model_path, out_path, start_idx, version, fold, year,
                tournament, separate_rounds):
    """Convert trickme questions to qanta format, resolving answers to pages.

    Resolution order: exact wiki title / known page, lowercase lookup, then
    the manual answer map (None means skip explicitly). Unresolvable answers
    are logged and skipped. Writes out_path (or per-round files when
    separate_rounds is set) plus a qanta_id -> model map to id_model_path.
    """
    with open(answer_map_path) as f:
        # Fix: yaml.load without a Loader is deprecated/unsafe; use safe_load.
        answer_map = yaml.safe_load(f)
    with open(qanta_ds_path) as f:
        qanta_ds = json.load(f)['questions']
    answer_set = {q['page'] for q in qanta_ds if q['page'] is not None}
    with open(wiki_titles_path) as f:
        titles = set(json.load(f))
    lookup = {a.lower().replace(' ', '_'): a for a in answer_set}
    id_model_map = {}
    skipped = 0
    with open(trick_path) as f:
        questions = []
        for i, q in enumerate(json.load(f)):
            # Field names vary between exports ('Question' vs 'question').
            if 'Question' in q:
                text = q['Question']
            elif 'question' in q:
                text = q['question']
            else:
                raise ValueError('Could not find question field in question')
            if 'Answer' in q:
                answer = q['Answer'].replace(' ', '_')
            elif 'answer' in q:
                answer = q['answer'].replace(' ', '_')
            else:
                raise ValueError('Could not find answer field in question')
            if 'trick_id' in q:
                trick_id = q['trick_id']
            else:
                trick_id = None
            if len(answer) == 0:
                raise ValueError(f'Empty answer for trick_id={trick_id}')
            elif len(text) == 0:
                raise ValueError(f'Empty text for trick_id={trick_id}')
            if answer in titles or answer in answer_set:
                page = answer
            elif answer in lookup:
                page = lookup[answer]
            elif answer in answer_map:
                m_page = answer_map[answer]
                if m_page is None:
                    if 'model' in q:
                        log.info(f'Explicitly Skipping {answer}, int-model: {q["model"]}')
                    else:
                        log.info(f'Explicitly Skipping {answer}')
                    continue  # Skip this explicitly
                elif m_page in answer_set:
                    page = m_page
                else:
                    raise ValueError(f'{m_page} not in answer set\n Q: {text}')
            else:
                log.error(f'Unhandled Skipping: idx: {i} trick_id: {trick_id} A: "{answer}"\nQ:"{text}"')
                skipped += 1
                continue
            q_out = {
                'text': text,
                'answer': answer,
                'page': page,
                'fold': fold,
                'year': year,
                'dataset': 'trickme',
                'proto_id': None,
                'qdb_id': None,
                'difficulty': None,
                'category': None,
                'subcategory': None,
                'qanta_id': start_idx + i,
                'tournament': tournament,
                'gameplay': False,
                'trick_id': trick_id
            }
            # Optional metadata carried through when present.
            if 'email' in q:
                q_out['author_email'] = q['email']
            if 'category' in q and q['category'] != "None":
                q_out['category'] = q['category']
            if 'round' in q:
                q_out['round'] = q['round']
            if 'model' in q:
                id_model_map[q_out['qanta_id']] = q['model']
            questions.append(q_out)
    log.info(f'Total: {len(questions)} Skipped: {skipped}')
    add_sentences_(questions, parallel=False)
    if separate_rounds:
        rounds = defaultdict(list)
        for q in questions:
            rounds[q['round']].append(q)
        for name, round_questions in rounds.items():
            dataset = format_qanta_json(round_questions, version)
            # Insert the round name before the .json suffix (or append both
            # when there is none). The original duplicated extend() in both
            # branches; collapsed here.
            file_name = out_path.split('.')
            if file_name[-1] == 'json':
                file_name.pop()
            file_name.extend([name, 'json'])
            round_out_path = '.'.join(file_name)
            log.info(f'Writing round {name} to {round_out_path}')
            with open(round_out_path, 'w') as f:
                json.dump(dataset, f)
    else:
        dataset = format_qanta_json(questions, version)
        with open(out_path, 'w') as f:
            json.dump(dataset, f)
    with open(id_model_path, 'w') as f:
        json.dump(id_model_map, f)
def ingestion_cli(start_idx):
    """
    Input format is for jason's HS project, but can be changed. The original
    code for answer mapping was designed to map everything over multiple
    passes, not yield a callable function to map an arbitrary answer line to a
    QB answer. Rather than implement this, a hacky way to achieve similar
    functionality to map a new dataset is to combine already mapped questions
    with new questions, have the code map answer for both at the same time,
    then only use the mappings from the new questions. There are some edge
    cases, but this should in general work (hopefully).
    """
    with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
        unmapped_questions = json.load(f)["questions"]
    with open("data/external/high_school_project/quizdb-20190313164802.json") as f:
        raw_questions = json.load(f)["data"]["tossups"]
    new_questions = []
    # enumerate(start=start_idx) replaces the manual idx counter.
    for idx, q in enumerate(raw_questions, start=start_idx):
        new_questions.append({
            "qanta_id": idx,
            "text": q["text"],
            "answer": q["answer"],
            "page": None,
            "category": None,
            "subcategory": None,
            "tournament": q["tournament"]["name"],
            "difficulty": q["tournament"]["difficulty"],
            "year": int(q["tournament"]["year"]),
            "proto_id": None,
            "qdb_id": q["id"],
            "dataset": "quizdb.org",
            "fold": "guesstest",
        })
    # Map answers for old + new questions together; only the mappings of the
    # new questions are ultimately used (see docstring).
    questions = unmapped_questions + new_questions
    answer_map, amb_answer_map, unbound_answers, report = create_answer_map(questions)
    with safe_open("data/external/high_school_project/automatic_report.json", "w") as f:
        json.dump(report, f)
    write_answer_map(
        answer_map,
        amb_answer_map,
        unbound_answers,
        "data/external/high_school_project/answer_map.json",
        "data/external/high_school_project/unbound_answers.json",
    )
    with open("data/internal/page_assignment/unmappable.yaml") as f:
        # Fix: yaml.load without an explicit Loader is deprecated and unsafe;
        # this file only needs plain YAML.
        unmappable = yaml.safe_load(f)
    page_assigner = PageAssigner()
    mapping_report = unmapped_to_mapped_questions(
        new_questions, answer_map, amb_answer_map, unmappable, page_assigner
    )
    add_sentences_(new_questions)
    with open("data/external/high_school_project/qanta.acf-regionals-2018.json", "w") as f:
        json.dump(format_qanta_json(new_questions, DS_VERSION), f)
    with open("data/external/high_school_project/mapping_report.json", "w") as f:
        json.dump(mapping_report, f)
def trick_to_ds(answer_map_path, qanta_ds_path, wiki_titles_path, trick_path,
                id_model_path, out_path, start_idx, version, fold, year,
                tournament, separate_rounds):
    """Build a qanta-format dataset from trickme adversarial questions.

    Pages are found by exact title / known page, then a lowercase lookup,
    then a manual answer map (None entries mean explicit skip). Unresolved
    answers are logged and counted as skipped. Output goes to out_path (or
    one file per round) plus an id -> model map at id_model_path.
    """
    with open(answer_map_path) as f:
        # Fix: replace deprecated/unsafe no-Loader yaml.load with safe_load.
        answer_map = yaml.safe_load(f)
    with open(qanta_ds_path) as f:
        qanta_ds = json.load(f)['questions']
    answer_set = {q['page'] for q in qanta_ds if q['page'] is not None}
    with open(wiki_titles_path) as f:
        titles = set(json.load(f))
    lookup = {a.lower().replace(' ', '_'): a for a in answer_set}
    id_model_map = {}
    skipped = 0
    with open(trick_path) as f:
        questions = []
        for i, q in enumerate(json.load(f)):
            # Field names vary between exports ('Question' vs 'question').
            if 'Question' in q:
                text = q['Question']
            elif 'question' in q:
                text = q['question']
            else:
                raise ValueError('Could not find question field in question')
            if 'Answer' in q:
                answer = q['Answer'].replace(' ', '_')
            elif 'answer' in q:
                answer = q['answer'].replace(' ', '_')
            else:
                raise ValueError('Could not find answer field in question')
            if 'trick_id' in q:
                trick_id = q['trick_id']
            else:
                trick_id = None
            if len(answer) == 0:
                raise ValueError(f'Empty answer for trick_id={trick_id}')
            elif len(text) == 0:
                raise ValueError(f'Empty text for trick_id={trick_id}')
            if answer in titles or answer in answer_set:
                page = answer
            elif answer in lookup:
                page = lookup[answer]
            elif answer in answer_map:
                m_page = answer_map[answer]
                if m_page is None:
                    if 'model' in q:
                        log.info(
                            f'Explicitly Skipping {answer}, int-model: {q["model"]}'
                        )
                    else:
                        log.info(f'Explicitly Skipping {answer}')
                    continue  # Skip this explicitly
                elif m_page in answer_set:
                    page = m_page
                else:
                    raise ValueError(f'{m_page} not in answer set\n Q: {text}')
            else:
                log.error(
                    f'Unhandled Skipping: idx: {i} trick_id: {trick_id} A: "{answer}"\nQ:"{text}"'
                )
                skipped += 1
                continue
            q_out = {
                'text': text,
                'answer': answer,
                'page': page,
                'fold': fold,
                'year': year,
                'dataset': 'trickme',
                'proto_id': None,
                'qdb_id': None,
                'difficulty': None,
                'category': None,
                'subcategory': None,
                'qanta_id': start_idx + i,
                'tournament': tournament,
                'gameplay': False,
                'trick_id': trick_id
            }
            # Optional metadata carried through when present.
            if 'email' in q:
                q_out['author_email'] = q['email']
            if 'category' in q and q['category'] != "None":
                q_out['category'] = q['category']
            if 'round' in q:
                q_out['round'] = q['round']
            if 'model' in q:
                id_model_map[q_out['qanta_id']] = q['model']
            questions.append(q_out)
    log.info(f'Total: {len(questions)} Skipped: {skipped}')
    add_sentences_(questions, parallel=False)
    if separate_rounds:
        rounds = defaultdict(list)
        for q in questions:
            rounds[q['round']].append(q)
        for name, round_questions in rounds.items():
            dataset = format_qanta_json(round_questions, version)
            # Insert the round name before the .json suffix (or append both
            # when there is none); the original duplicated extend() in both
            # branches.
            file_name = out_path.split('.')
            if file_name[-1] == 'json':
                file_name.pop()
            file_name.extend([name, 'json'])
            round_out_path = '.'.join(file_name)
            log.info(f'Writing round {name} to {round_out_path}')
            with open(round_out_path, 'w') as f:
                json.dump(dataset, f)
    else:
        dataset = format_qanta_json(questions, version)
        with open(out_path, 'w') as f:
            json.dump(dataset, f)
    with open(id_model_path, 'w') as f:
        json.dump(id_model_map, f)
def nonnaqt_to_json(csv_input, json_dir):
    """Convert the non-NAQT sentence-level CSV into qanta-format JSON files.

    The CSV has one row per sentence (qnum, sent, text, page, fold); rows are
    regrouped by question, joined into full texts with per-sentence character
    offsets, then written as mapped/train/dev/test plus torchtext splits.
    """
    question_sentences = defaultdict(list)
    with open(csv_input) as f:
        csv_rows = list(csv.reader(f))
    # Skip the header row; each remaining row must have exactly 5 columns.
    for r in csv_rows[1:]:
        if len(r) != 5:
            raise ValueError('Invalid csv row, must have 5 columns')
        qnum, sent, text, page, fold = r
        qnum = int(qnum)
        sent = int(sent)
        question_sentences[qnum].append({
            'qnum': qnum,
            'sent': sent,
            'text': text,
            'page': page,
            'fold': fold
        })
    questions = []
    for sentences in tqdm.tqdm(question_sentences.values()):
        # Reassemble the full question text in sentence order and record
        # (start, end) character offsets for each sentence.
        ordered_sentences = sorted(sentences, key=lambda s: s['sent'])
        text = ' '.join(s['text'] for s in ordered_sentences)
        tokenizations = []
        position = 0
        for i in range(len(ordered_sentences)):
            sent = ordered_sentences[i]['text']
            length = len(sent)
            tokenizations.append((position, position + length))
            position += length + 1  # +1 accounts for the joining space
        # Question-level fields (page, fold) are taken from the first sentence.
        q = ordered_sentences[0]
        questions.append({
            'answer': '',
            'category': '',
            'subcategory': '',
            'tournament': '',
            'year': -1,
            'dataset': 'non_naqt',
            'difficulty': '',
            'first_sentence': ordered_sentences[0]['text'],
            'qanta_id': q['qnum'],
            'fold': q['fold'],
            'gameplay': False,
            'page': q['page'],
            'proto_id': None,
            'qdb_id': None,
            'text': text,
            'tokenizations': tokenizations
        })
    train_questions = [q for q in questions if q['fold'] == 'guesstrain']
    dev_questions = [q for q in questions if q['fold'] == 'guessdev']
    test_questions = [q for q in questions if q['fold'] == 'test']
    # Normalize the legacy 'test' fold name to 'guesstest'.
    for q in test_questions:
        q['fold'] = 'guesstest'
    from qanta.ingestion.preprocess import format_qanta_json
    from qanta.util.constants import DS_VERSION
    with open(path.join(json_dir, f'qanta.mapped.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(questions, DS_VERSION), f)
    with open(path.join(json_dir, f'qanta.train.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(train_questions, DS_VERSION), f)
    with open(path.join(json_dir, f'qanta.dev.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(dev_questions, DS_VERSION), f)
    with open(path.join(json_dir, f'qanta.test.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(test_questions, DS_VERSION), f)
    from sklearn.model_selection import train_test_split
    # Deterministic 90/10 split of guesstrain for torchtext train/val.
    guess_train, guess_val = train_test_split(train_questions, random_state=42, train_size=.9)
    with open(path.join(json_dir, f'qanta.torchtext.train.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(guess_train, DS_VERSION), f)
    with open(path.join(json_dir, f'qanta.torchtext.val.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(guess_val, DS_VERSION), f)
    with open(path.join(json_dir, f'qanta.torchtext.dev.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(dev_questions, DS_VERSION), f)
def run(self):
    """Add sentence tokenizations to the unmapped dataset and write it out."""
    with open(QANTA_UNMAPPED_DATASET_PATH) as in_f:
        questions = json.load(in_f)['questions']
    add_sentences_(questions)  # mutates the question dicts in place
    with open(QANTA_PREPROCESSED_DATASET_PATH, 'w') as out_f:
        json.dump(format_qanta_json(questions, DS_VERSION), out_f)
def run(self):
    """Preprocess: attach sentence spans, then serialize in qanta format."""
    with open(QANTA_UNMAPPED_DATASET_PATH) as f:
        dataset = json.load(f)
    qanta_questions = dataset['questions']
    # add_sentences_ mutates each question dict in place.
    add_sentences_(qanta_questions)
    payload = format_qanta_json(qanta_questions, DS_VERSION)
    with open(QANTA_PREPROCESSED_DATASET_PATH, 'w') as f:
        json.dump(payload, f)