def run(self):
    """Build the torchtext train/val/dev splits from the qanta train/dev sets."""
    def _load_fold(dataset_path, fold):
        # Read a dataset file and keep only questions from the given fold.
        with open(dataset_path) as handle:
            return [q for q in json.load(handle)['questions'] if q['fold'] == fold]

    def _write(dest_path, split_questions):
        # Serialize questions back into the standard qanta JSON envelope.
        with open(dest_path, 'w') as handle:
            json.dump(format_qanta_json(split_questions, DS_VERSION), handle)

    all_guess_train = _load_fold(QANTA_TRAIN_DATASET_PATH, GUESSER_TRAIN_FOLD)
    # Fixed seed so the 90/10 train/val split is reproducible.
    guess_train, guess_val = train_test_split(all_guess_train, random_state=42, train_size=.9)
    guess_dev = _load_fold(QANTA_DEV_DATASET_PATH, GUESSER_DEV_FOLD)
    _write(QANTA_TORCH_TRAIN_LOCAL_PATH, guess_train)
    _write(QANTA_TORCH_VAL_LOCAL_PATH, guess_val)
    _write(QANTA_TORCH_DEV_LOCAL_PATH, guess_dev)
def run(self):
    """Split the mapped dataset (questions with pages) into train/dev/test files."""
    with open(QANTA_MAPPED_DATASET_PATH) as f:
        questions = [q for q in json.load(f)['questions'] if q['page'] is not None]
    # Fold names embed the split name as a substring (e.g. 'guesstrain').
    splits = {
        QANTA_TRAIN_DATASET_PATH: [q for q in questions if 'train' in q['fold']],
        QANTA_DEV_DATASET_PATH: [q for q in questions if 'dev' in q['fold']],
        QANTA_TEST_DATASET_PATH: [q for q in questions if 'test' in q['fold']],
    }
    for out_path, split_questions in splits.items():
        with open(out_path, 'w') as f:
            json.dump(format_qanta_json(split_questions, DS_VERSION), f)
def ingestion_cli(start_idx):
    """
    Input format is for jason's HS project, but can be changed. The original
    code for answer mapping was designed to map everything over multiple
    passes, not yield a callable function to map an arbitrary answer line to a
    QB answer. Rather than implement this, a hacky way to achieve similar
    functionality to map a new dataset is to combine already mapped questions
    with new questions, have the code map answer for both at the same time,
    then only use the mappings from the new questions. There are some edge
    cases, but this should in general work (hopefully).
    """
    with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
        unmapped_questions = json.load(f)['questions']
    with open('data/external/high_school_project/quizdb-20190313164802.json') as f:
        raw_questions = json.load(f)['data']['tossups']
    new_questions = []
    # enumerate(start=start_idx) replaces the manual idx counter.
    for idx, q in enumerate(raw_questions, start=start_idx):
        new_questions.append({
            'qanta_id': idx,
            'text': q['text'],
            'answer': q['answer'],
            'page': None,
            'category': None,
            'subcategory': None,
            'tournament': q['tournament']['name'],
            'difficulty': q['tournament']['difficulty'],
            'year': int(q['tournament']['year']),
            'proto_id': None,
            'qdb_id': q['id'],
            'dataset': 'quizdb.org',
            'fold': 'guesstest'
        })
    # Map answers for old + new questions together; only the mappings of the
    # new questions are ultimately used (see docstring).
    questions = unmapped_questions + new_questions
    answer_map, amb_answer_map, unbound_answers, report = create_answer_map(questions)
    with safe_open('data/external/high_school_project/automatic_report.json', 'w') as f:
        json.dump(report, f)
    write_answer_map(
        answer_map, amb_answer_map, unbound_answers,
        'data/external/high_school_project/answer_map.json',
        'data/external/high_school_project/unbound_answers.json'
    )
    with open('data/internal/page_assignment/unmappable.yaml') as f:
        # Fix: yaml.load without an explicit Loader is deprecated and can
        # construct arbitrary objects; this file only needs plain YAML.
        unmappable = yaml.safe_load(f)
    page_assigner = PageAssigner()
    mapping_report = unmapped_to_mapped_questions(
        new_questions, answer_map, amb_answer_map, unmappable, page_assigner
    )
    add_sentences_(new_questions)
    with open('data/external/high_school_project/qanta.acf-regionals-2018.json', 'w') as f:
        json.dump(format_qanta_json(new_questions, DS_VERSION), f)
    with open('data/external/high_school_project/mapping_report.json', 'w') as f:
        json.dump(mapping_report, f)
def adversarial_to_json(adversarial_json, json_dir):
    """Convert adversarial (trickme) questions into a qanta 'expo' JSON file.

    Answers are normalized (stripped, spaces -> underscores) and matched
    against the mapped-question pages of the qanta database to recover the
    canonical page capitalization; unmatched answers are logged and kept as-is.
    """
    from qanta.datasets.quiz_bowl import QantaDatabase
    db = QantaDatabase()
    # Lowercased page title -> canonical page title.
    lookup = {q.page.lower(): q.page for q in db.mapped_questions}
    with open(adversarial_json) as f:
        questions = json.load(f)
    rows = []
    for i, q in enumerate(questions):
        answer = q['answer'].strip().replace(' ', '_')
        # Fix: the lookup keys are lowercased, so probe with the lowercased
        # answer — previously any mixed-case answer could never match.
        key = answer.lower()
        if key in lookup:
            answer = lookup[key]
        else:
            log.warning(f'Could not find: {answer}')
        rows.append({
            'text': q['question'].strip(),
            'page': answer,
            'answer': '',
            'qanta_id': 1000000 + i,
            'proto_id': None,
            'qdb_id': None,
            'category': '',
            'subcategory': '',
            'tournament': '',
            'difficulty': '',
            'dataset': 'adversarial',
            'year': -1,
            'fold': 'expo',
            'gameplay': False
        })
    from qanta.ingestion.preprocess import add_sentences_, format_qanta_json
    from qanta.util.constants import DS_VERSION
    add_sentences_(rows, parallel=False)
    with open(path.join(json_dir, f'qanta.expo.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(rows, DS_VERSION), f)
def run(self):
    """Tokenize sentences and add answer prompts to the unmapped dataset."""
    with open(QANTA_UNMAPPED_DATASET_PATH) as in_file:
        dataset = json.load(in_file)
    questions = dataset["questions"]
    # Both helpers mutate the question dicts in place.
    add_sentences_(questions)
    add_answer_prompts_(questions)
    serialized = format_qanta_json(questions, DS_VERSION)
    with open(QANTA_PREPROCESSED_DATASET_PATH, "w") as out_file:
        json.dump(serialized, out_file)
def run(self):
    """Write train/dev/test dataset files from the mapped questions."""
    with open(QANTA_MAPPED_DATASET_PATH) as f:
        mapped = [q for q in json.load(f)['questions'] if q['page'] is not None]
    # Each fold name contains its split name as a substring.
    for out_path, fold_key in [
        (QANTA_TRAIN_DATASET_PATH, 'train'),
        (QANTA_DEV_DATASET_PATH, 'dev'),
        (QANTA_TEST_DATASET_PATH, 'test'),
    ]:
        subset = [q for q in mapped if fold_key in q['fold']]
        with open(out_path, 'w') as f:
            json.dump(format_qanta_json(subset, DS_VERSION), f)
def run(self):
    """Create guesser train/val/dev splits for torchtext consumption."""
    with open(QANTA_TRAIN_DATASET_PATH) as f:
        train_questions = json.load(f)['questions']
    all_guess_train = [q for q in train_questions if q['fold'] == GUESSER_TRAIN_FOLD]
    # Deterministic 90/10 train/val split.
    guess_train, guess_val = train_test_split(all_guess_train, random_state=42, train_size=.9)
    with open(QANTA_DEV_DATASET_PATH) as f:
        dev_questions = json.load(f)['questions']
    guess_dev = [q for q in dev_questions if q['fold'] == GUESSER_DEV_FOLD]
    outputs = (
        (QANTA_TORCH_TRAIN_LOCAL_PATH, guess_train),
        (QANTA_TORCH_VAL_LOCAL_PATH, guess_val),
        (QANTA_TORCH_DEV_LOCAL_PATH, guess_dev),
    )
    for dest, split in outputs:
        with open(dest, 'w') as f:
            json.dump(format_qanta_json(split, DS_VERSION), f)
def run(self):
    """Filter mapped questions to those with pages and write the fold splits."""
    with open(QANTA_MAPPED_DATASET_PATH) as f:
        questions = [q for q in json.load(f)["questions"] if q["page"] is not None]

    def _subset(fold_key):
        # Fold names embed the split name ('train'/'dev'/'test') as a substring.
        return [q for q in questions if fold_key in q["fold"]]

    for dest, fold_key in (
        (QANTA_TRAIN_DATASET_PATH, "train"),
        (QANTA_DEV_DATASET_PATH, "dev"),
        (QANTA_TEST_DATASET_PATH, "test"),
    ):
        with open(dest, "w") as f:
            json.dump(format_qanta_json(_subset(fold_key), DS_VERSION), f)
def run(self):
    """Assign gameplay-based folds and write the folded dataset."""
    with open(QANTA_PREPROCESSED_DATASET_PATH) as handle:
        questions = json.load(handle)['questions']
    with open(PROTOBOWL_QUESTION_PLAYER_COUNTS) as handle:
        player_counts = json.load(handle)
    # assign_folds_ rewrites each question's fold field in place.
    assign_folds_(questions, player_counts)
    with open(QANTA_FOLDED_DATASET_PATH, 'w') as handle:
        json.dump(format_qanta_json(questions, DS_VERSION), handle)
def run(self):
    """Recompute question folds from protobowl gameplay data, then persist."""
    # The two reads are independent, so order does not matter.
    with open(PROTOBOWL_QUESTION_PLAYER_COUNTS) as f:
        question_player_counts = json.load(f)
    with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
        qanta_questions = json.load(f)['questions']
    assign_folds_(qanta_questions, question_player_counts)
    serialized = format_qanta_json(qanta_questions, DS_VERSION)
    with open(QANTA_FOLDED_DATASET_PATH, 'w') as f:
        json.dump(serialized, f)
def run(self):
    """Parse protobowl and quizdb sources, merge them, write the unmapped dataset."""
    protobowl_questions = Protobowl.parse_tossups(PROTOBOWL_TOSSUPS_PATH)
    # quizdb tossups need the tournament/category/subcategory tables first.
    tournaments = QuizdbOrg.parse_tournaments(QDB_TOURNAMENTS_PATH)
    categories = QuizdbOrg.parse_categories(QDB_CATEGORIES_PATH)
    subcategories = QuizdbOrg.parse_subcategories(QDB_SUBCATEGORIES_PATH)
    quizdb_questions = QuizdbOrg.parse_tossups(
        tournaments, categories, subcategories, QDB_TOSSUPS_PATH
    )
    merged = merge_datasets(protobowl_questions, quizdb_questions)
    with open(safe_path(QANTA_UNMAPPED_DATASET_PATH), 'w') as out_file:
        json.dump(format_qanta_json(merged, DS_VERSION), out_file)
def format_additional():
    """
    Additional questions were added to dataset, this processes the csv version
    to match the dataset format while verifying that page info is valid.
    """
    # Verify inputs against known checksums before trusting their contents.
    titles_path = "data/external/wikipedia/wikipedia-titles.2018.04.18.json"
    titles_checksum = "6fa134836b3a7e3b562cdaa8ad353f2d"
    verify_checksum(titles_checksum, titles_path)
    with open(titles_path) as f:
        titles = set(json.load(f))

    csv_path = "data/external/datasets/trick-additional.csv"
    trick_checksum = "905594aab776ddb10b0d7f36d30633a2"
    verify_checksum(trick_checksum, csv_path)
    with open(csv_path) as f:
        rows = list(csv.reader(f))[1:]  # drop the header row

    questions = []
    for _, text, page in rows:
        page = page.replace(" ", "_")
        if page not in titles:
            log.info(f"Page not in titles: {page}")
        questions.append(
            {
                "text": text,
                "answer": page,
                "page": page,
                "fold": "advtest",
                "year": 2018,
                "dataset": "trickme",
                "proto_id": None,
                "qdb_id": None,
                "difficulty": None,
                "category": None,
                "subcategory": None,
                "qanta_id": None,
                "tournament": TOURNAMENT_DEC_15,
                "gameplay": False,
                "interface": "ir-r2",
            }
        )
    add_sentences_(questions, parallel=False)
    dataset = format_qanta_json(questions, "2018.04.18")
    # Record input checksums so downstream consumers can trace provenance.
    dataset["dependent_checksums"] = {
        "trick-additional.csv": trick_checksum,
        "wikipedia-titles.2018.04.18.json": titles_checksum,
    }
    path_formatted = "data/external/datasets/qanta.trick-additional-ir-round2.json"
    with open(path_formatted, "w") as f:
        json.dump(dataset, f)
    log.info(f"File: {path_formatted} Checksum: {md5sum(path_formatted)}")
def format_additional():
    """
    Additional questions were added to dataset, this processes the csv version
    to match the dataset format while verifying that page info is valid.
    """
    titles_checksum = '6fa134836b3a7e3b562cdaa8ad353f2d'
    verify_checksum(
        titles_checksum, 'data/external/wikipedia/wikipedia-titles.2018.04.18.json')
    with open('data/external/wikipedia/wikipedia-titles.2018.04.18.json') as f:
        titles = set(json.load(f))
    trick_checksum = '905594aab776ddb10b0d7f36d30633a2'
    verify_checksum(trick_checksum, 'data/external/datasets/trick-additional.csv')
    with open('data/external/datasets/trick-additional.csv') as f:
        data_rows = list(csv.reader(f))[1:]  # first row is the header

    # Fields shared by every additional question.
    template = {
        'fold': 'advtest',
        'year': 2018,
        'dataset': 'trickme',
        'proto_id': None,
        'qdb_id': None,
        'difficulty': None,
        'category': None,
        'subcategory': None,
        'qanta_id': None,
        'tournament': TOURNAMENT_DEC_15,
        'gameplay': False,
        'interface': 'ir-r2'
    }
    questions = []
    for _, text, page in data_rows:
        page = page.replace(' ', '_')
        if page not in titles:
            log.info(f'Page not in titles: {page}')
        questions.append(dict(template, text=text, answer=page, page=page))
    add_sentences_(questions, parallel=False)
    dataset = format_qanta_json(questions, '2018.04.18')
    # Track input checksums alongside the generated data.
    dataset['dependent_checksums'] = {
        'trick-additional.csv': trick_checksum,
        'wikipedia-titles.2018.04.18.json': titles_checksum
    }
    path_formatted = 'data/external/datasets/qanta.trick-additional-ir-round2.json'
    with open(path_formatted, 'w') as f:
        json.dump(dataset, f)
    log.info(f'File: {path_formatted} Checksum: {md5sum(path_formatted)}')
def run(self):
    """Merge protobowl and quizdb tossups into the unmapped qanta dataset."""
    # Build the quizdb lookup tables before parsing its tossups.
    quizdb_tournaments = QuizdbOrg.parse_tournaments(QDB_TOURNAMENTS_PATH)
    quizdb_categories = QuizdbOrg.parse_categories(QDB_CATEGORIES_PATH)
    quizdb_subcategories = QuizdbOrg.parse_subcategories(QDB_SUBCATEGORIES_PATH)
    quizdb_questions = QuizdbOrg.parse_tossups(
        quizdb_tournaments,
        quizdb_categories,
        quizdb_subcategories,
        QDB_TOSSUPS_PATH,
    )
    protobowl_questions = Protobowl.parse_tossups(PROTOBOWL_TOSSUPS_PATH)
    qanta_questions = merge_datasets(protobowl_questions, quizdb_questions)
    with open(safe_path(QANTA_UNMAPPED_DATASET_PATH), 'w') as f:
        json.dump(format_qanta_json(qanta_questions, DS_VERSION), f)
def split_ds(id_model_path, expo_path, version, rnn_out, es_out):
    """Partition expo questions by which model ('es' or 'rnn') owns each id."""
    with open(id_model_path) as f:
        lookup = json.load(f)
    with open(expo_path) as f:
        questions = json.load(f)['questions']
    buckets = {'es': [], 'rnn': []}
    for q in questions:
        # The id->model map is keyed by stringified qanta_id.
        source = lookup[str(q['qanta_id'])]
        if source not in buckets:
            raise ValueError('Unhandled question source')
        buckets[source].append(q)
    with open(rnn_out, 'w') as f:
        json.dump(format_qanta_json(buckets['rnn'], version), f)
    with open(es_out, 'w') as f:
        json.dump(format_qanta_json(buckets['es'], version), f)
def split_ds(id_model_path, expo_path, version, rnn_out, es_out):
    """Split expo questions between the ES and RNN models using an id map."""
    with open(id_model_path) as id_file:
        id_to_model = json.load(id_file)
    with open(expo_path) as expo_file:
        all_questions = json.load(expo_file)['questions']

    def _write(dest, split_questions):
        with open(dest, 'w') as handle:
            json.dump(format_qanta_json(split_questions, version), handle)

    es_split = []
    rnn_split = []
    for question in all_questions:
        model = id_to_model[str(question['qanta_id'])]
        if model == 'es':
            es_split.append(question)
        elif model == 'rnn':
            rnn_split.append(question)
        else:
            raise ValueError('Unhandled question source')
    _write(rnn_out, rnn_split)
    _write(es_out, es_split)
def split_ds(id_model_path, expo_path, version, rnn_out, es_out):
    """Route each expo question to the ES or RNN output file by qanta_id."""
    with open(id_model_path) as f:
        lookup = json.load(f)
    with open(expo_path) as f:
        questions = json.load(f)["questions"]
    # Validate all sources before writing anything.
    for q in questions:
        if lookup[str(q["qanta_id"])] not in ("es", "rnn"):
            raise ValueError("Unhandled question source")
    es_questions = [q for q in questions if lookup[str(q["qanta_id"])] == "es"]
    rnn_questions = [q for q in questions if lookup[str(q["qanta_id"])] == "rnn"]
    with open(rnn_out, "w") as f:
        json.dump(format_qanta_json(rnn_questions, version), f)
    with open(es_out, "w") as f:
        json.dump(format_qanta_json(es_questions, version), f)
def run(self):
    """Apply the answer map to folded questions; write mapped dataset + report."""
    with open(ANSWER_MAP_PATH) as f:
        content = json.load(f)
    answer_map = content['answer_map']
    ambig_answer_map = content['ambig_answer_map']
    with open(QANTA_FOLDED_DATASET_PATH) as f:
        qanta_questions = json.load(f)['questions']
    with open('data/internal/page_assignment/unmappable.yaml') as f:
        # Fix: yaml.load without an explicit Loader is deprecated and can
        # execute arbitrary constructors; this file only needs plain YAML.
        unmappable = yaml.safe_load(f)
    page_assigner = PageAssigner()
    # Mutates qanta_questions in place, returning a report of the mapping.
    mapping_report = unmapped_to_mapped_questions(
        qanta_questions, answer_map, ambig_answer_map, unmappable, page_assigner
    )
    with open(QANTA_MAPPED_DATASET_PATH, 'w') as f:
        json.dump(format_qanta_json(qanta_questions, DS_VERSION), f)
    with open(QANTA_MAP_REPORT_PATH, 'w') as f:
        json.dump(mapping_report, f)
def run(self):
    """Map folded questions to wiki pages and persist dataset plus report."""
    with open(ANSWER_MAP_PATH) as f:
        content = json.load(f)
    answer_map = content['answer_map']
    ambig_answer_map = content['ambig_answer_map']
    with open(QANTA_FOLDED_DATASET_PATH) as f:
        qanta_questions = json.load(f)['questions']
    with open('data/internal/page_assignment/unmappable.yaml') as f:
        # Fix: use safe_load — yaml.load without a Loader is deprecated/unsafe.
        unmappable = yaml.safe_load(f)
    page_assigner = PageAssigner()
    mapping_report = unmapped_to_mapped_questions(
        qanta_questions, answer_map, ambig_answer_map, unmappable, page_assigner
    )
    with open(QANTA_MAPPED_DATASET_PATH, 'w') as f:
        json.dump(format_qanta_json(qanta_questions, DS_VERSION), f)
    with open(QANTA_MAP_REPORT_PATH, 'w') as f:
        json.dump(mapping_report, f)
def parse_tossups(
        qanta_ds_path='data/external/datasets/qanta.mapped.2018.04.18.json',
        trick_path='data/external/datasets/trickme_questions_12-15-2018.json',
        start_idx=2000000,
        version='2018.04.18'):
    """Convert trickme (adversarial) tossups into a qanta-format dataset.

    Answers are resolved to wiki pages via the mapped qanta dataset, with
    DIRECT_MAP providing manual per-index overrides; any answer that cannot
    be resolved raises ValueError. Returns the formatted dataset dict.
    """
    with open(qanta_ds_path) as f:
        qanta_ds = json.load(f)['questions']
    # Canonical page titles, plus a lowercase/underscore lookup for matching.
    answer_set = {q['page'] for q in qanta_ds if q['page'] is not None}
    lookup = {a.lower().replace(' ', '_'): a for a in answer_set}
    with open(trick_path) as f:
        questions = []
        for i, q in enumerate(json.load(f)):
            # Field names vary between exports ('Question' vs 'question').
            if 'Question' in q:
                text = q['Question']
            elif 'question' in q:
                text = q['question']
            else:
                raise ValueError(
                    'Could not find question field in question')
            if 'Answer' in q:
                answer = q['Answer'].replace(' ', '_')
            elif 'answer' in q:
                answer = q['answer'].replace(' ', '_')
            else:
                raise ValueError('Could not find answer field in question')
            # Silently drop empty questions/answers.
            if len(answer) == 0 or len(text) == 0:
                continue
            if i in DIRECT_MAP:
                # Manual override: (expected_answer, page) keyed by index.
                # A None page means the question is deliberately excluded.
                m_ans, m_page = DIRECT_MAP[i]
                if m_ans == answer:
                    if m_page is None:
                        continue  # Skip this explicitly
                    elif m_page in answer_set:
                        page = m_page
                    else:
                        raise ValueError(
                            f'{m_page} not in answer set\n Q: {text}')
                else:
                    raise ValueError(f'Mapping error: {answer} != {m_ans}')
            elif answer in lookup:
                page = lookup[answer]
            else:
                raise ValueError(
                    f'Could not find: idx: {i} Q:"{text}" \nA: "{answer}"')
            q_out = {
                'text': text,
                'answer': answer,
                'page': page,
                'fold': 'advtest',
                'year': 2018,
                'dataset': 'trickme',
                'proto_id': None,
                'qdb_id': None,
                'trickme_id': i,
                'difficulty': None,
                'category': None,
                'subcategory': None,
                'qanta_id': start_idx + i,
                'tournament': 'Adversarial Question Writing UMD December 15',
                'gameplay': False
            }
            # Optional metadata carried through when present.
            if 'email' in q:
                q_out['author_email'] = q['email']
            if 'category' in q and q['category'] != "None":
                q_out['category'] = q['category']
            questions.append(q_out)
    # Mutates questions in place to add sentence tokenization data.
    add_sentences_(questions, parallel=True)
    dataset = format_qanta_json(questions, version)
    return dataset
def trick_to_ds(
    answer_map_path,
    qanta_ds_path,
    wiki_titles_path,
    trick_path,
    id_model_path,
    out_path,
    start_idx,
    version,
    fold,
    year,
    tournament,
    separate_rounds,
):
    """Convert trickme (adversarial) questions into qanta dataset format.

    Answers resolve to wiki pages in order: exact title / known page, the
    lowercased-underscore lookup, then the manual answer map (a None entry
    means skip explicitly). Unresolvable answers are logged and skipped.
    Writes the dataset to out_path (or one file per round when
    separate_rounds is set) and a qanta_id -> model map to id_model_path.
    """
    with open(answer_map_path) as f:
        # Fix: yaml.load without an explicit Loader is deprecated and can
        # construct arbitrary objects; this map only needs plain YAML.
        answer_map = yaml.safe_load(f)
    with open(qanta_ds_path) as f:
        qanta_ds = json.load(f)["questions"]
    answer_set = {q["page"] for q in qanta_ds if q["page"] is not None}
    with open(wiki_titles_path) as f:
        titles = set(json.load(f))
    lookup = {a.lower().replace(" ", "_"): a for a in answer_set}
    id_model_map = {}
    skipped = 0
    with open(trick_path) as f:
        questions = []
        for i, q in enumerate(json.load(f)):
            # Field names vary across exports ("Question" vs "question").
            if "Question" in q:
                text = q["Question"]
            elif "question" in q:
                text = q["question"]
            else:
                raise ValueError("Could not find question field in question")
            if "Answer" in q:
                answer = q["Answer"].replace(" ", "_")
            elif "answer" in q:
                answer = q["answer"].replace(" ", "_")
            else:
                raise ValueError("Could not find answer field in question")
            if "trick_id" in q:
                trick_id = q["trick_id"]
            else:
                trick_id = None
            if len(answer) == 0:
                raise ValueError(f"Empty answer for trick_id={trick_id}")
            elif len(text) == 0:
                raise ValueError(f"Empty text for trick_id={trick_id}")
            if answer in titles or answer in answer_set:
                page = answer
            elif answer in lookup:
                page = lookup[answer]
            elif answer in answer_map:
                m_page = answer_map[answer]
                if m_page is None:
                    if "model" in q:
                        log.info(
                            f'Explicitly Skipping {answer}, int-model: {q["model"]}'
                        )
                    else:
                        log.info(f"Explicitly Skipping {answer}")
                    continue  # Skip this explicitly
                elif m_page in answer_set:
                    page = m_page
                else:
                    raise ValueError(f"{m_page} not in answer set\n Q: {text}")
            else:
                log.error(
                    f'Unhandled Skipping: idx: {i} trick_id: {trick_id} A: "{answer}"\nQ:"{text}"'
                )
                skipped += 1
                continue
            q_out = {
                "text": text,
                "answer": answer,
                "page": page,
                "fold": fold,
                "year": year,
                "dataset": "trickme",
                "proto_id": None,
                "qdb_id": None,
                "difficulty": None,
                "category": None,
                "subcategory": None,
                "qanta_id": start_idx + i,
                "tournament": tournament,
                "gameplay": False,
                "trick_id": trick_id,
            }
            # Optional metadata carried through when present.
            if "email" in q:
                q_out["author_email"] = q["email"]
            if "category" in q and q["category"] != "None":
                q_out["category"] = q["category"]
            if "round" in q:
                q_out["round"] = q["round"]
            if "model" in q:
                id_model_map[q_out["qanta_id"]] = q["model"]
            questions.append(q_out)
    log.info(f"Total: {len(questions)} Skipped: {skipped}")
    add_sentences_(questions, parallel=False)
    if separate_rounds:
        rounds = defaultdict(list)
        for q in questions:
            rounds[q["round"]].append(q)
        for name, round_questions in rounds.items():
            dataset = format_qanta_json(round_questions, version)
            # Insert the round name before the .json suffix (or append both
            # when out_path has no .json suffix). The original duplicated the
            # extend() call in both branches; collapsed here.
            file_name = out_path.split(".")
            if file_name[-1] == "json":
                file_name.pop()
            file_name.extend([name, "json"])
            round_out_path = ".".join(file_name)
            log.info(f"Writing round {name} to {round_out_path}")
            with open(round_out_path, "w") as f:
                json.dump(dataset, f)
    else:
        dataset = format_qanta_json(questions, version)
        with open(out_path, "w") as f:
            json.dump(dataset, f)
    with open(id_model_path, "w") as f:
        json.dump(id_model_map, f)
def trick_to_ds(answer_map_path, qanta_ds_path, wiki_titles_path, trick_path,
                id_model_path, out_path, start_idx, version, fold, year,
                tournament, separate_rounds):
    """Convert trickme questions to qanta format, resolving answers to pages.

    Resolution order: exact wiki title / known page, lowercase lookup, then
    the manual answer map (None means skip explicitly). Unresolvable answers
    are logged and skipped. Writes out_path (or per-round files when
    separate_rounds is set) plus a qanta_id -> model map to id_model_path.
    """
    with open(answer_map_path) as f:
        # Fix: yaml.load without a Loader is deprecated/unsafe; use safe_load.
        answer_map = yaml.safe_load(f)
    with open(qanta_ds_path) as f:
        qanta_ds = json.load(f)['questions']
    answer_set = {q['page'] for q in qanta_ds if q['page'] is not None}
    with open(wiki_titles_path) as f:
        titles = set(json.load(f))
    lookup = {a.lower().replace(' ', '_'): a for a in answer_set}
    id_model_map = {}
    skipped = 0
    with open(trick_path) as f:
        questions = []
        for i, q in enumerate(json.load(f)):
            # Field names vary between exports ('Question' vs 'question').
            if 'Question' in q:
                text = q['Question']
            elif 'question' in q:
                text = q['question']
            else:
                raise ValueError('Could not find question field in question')
            if 'Answer' in q:
                answer = q['Answer'].replace(' ', '_')
            elif 'answer' in q:
                answer = q['answer'].replace(' ', '_')
            else:
                raise ValueError('Could not find answer field in question')
            if 'trick_id' in q:
                trick_id = q['trick_id']
            else:
                trick_id = None
            if len(answer) == 0:
                raise ValueError(f'Empty answer for trick_id={trick_id}')
            elif len(text) == 0:
                raise ValueError(f'Empty text for trick_id={trick_id}')
            if answer in titles or answer in answer_set:
                page = answer
            elif answer in lookup:
                page = lookup[answer]
            elif answer in answer_map:
                m_page = answer_map[answer]
                if m_page is None:
                    if 'model' in q:
                        log.info(f'Explicitly Skipping {answer}, int-model: {q["model"]}')
                    else:
                        log.info(f'Explicitly Skipping {answer}')
                    continue  # Skip this explicitly
                elif m_page in answer_set:
                    page = m_page
                else:
                    raise ValueError(f'{m_page} not in answer set\n Q: {text}')
            else:
                log.error(f'Unhandled Skipping: idx: {i} trick_id: {trick_id} A: "{answer}"\nQ:"{text}"')
                skipped += 1
                continue
            q_out = {
                'text': text,
                'answer': answer,
                'page': page,
                'fold': fold,
                'year': year,
                'dataset': 'trickme',
                'proto_id': None,
                'qdb_id': None,
                'difficulty': None,
                'category': None,
                'subcategory': None,
                'qanta_id': start_idx + i,
                'tournament': tournament,
                'gameplay': False,
                'trick_id': trick_id
            }
            # Optional metadata carried through when present.
            if 'email' in q:
                q_out['author_email'] = q['email']
            if 'category' in q and q['category'] != "None":
                q_out['category'] = q['category']
            if 'round' in q:
                q_out['round'] = q['round']
            if 'model' in q:
                id_model_map[q_out['qanta_id']] = q['model']
            questions.append(q_out)
    log.info(f'Total: {len(questions)} Skipped: {skipped}')
    add_sentences_(questions, parallel=False)
    if separate_rounds:
        rounds = defaultdict(list)
        for q in questions:
            rounds[q['round']].append(q)
        for name, round_questions in rounds.items():
            dataset = format_qanta_json(round_questions, version)
            # Insert the round name before the .json suffix (or append both
            # when there is none). The original duplicated extend() in both
            # branches; collapsed here.
            file_name = out_path.split('.')
            if file_name[-1] == 'json':
                file_name.pop()
            file_name.extend([name, 'json'])
            round_out_path = '.'.join(file_name)
            log.info(f'Writing round {name} to {round_out_path}')
            with open(round_out_path, 'w') as f:
                json.dump(dataset, f)
    else:
        dataset = format_qanta_json(questions, version)
        with open(out_path, 'w') as f:
            json.dump(dataset, f)
    with open(id_model_path, 'w') as f:
        json.dump(id_model_map, f)
def ingestion_cli(start_idx):
    """
    Input format is for jason's HS project, but can be changed. The original
    code for answer mapping was designed to map everything over multiple
    passes, not yield a callable function to map an arbitrary answer line to a
    QB answer. Rather than implement this, a hacky way to achieve similar
    functionality to map a new dataset is to combine already mapped questions
    with new questions, have the code map answer for both at the same time,
    then only use the mappings from the new questions. There are some edge
    cases, but this should in general work (hopefully).
    """
    with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
        unmapped_questions = json.load(f)["questions"]
    with open("data/external/high_school_project/quizdb-20190313164802.json") as f:
        raw_questions = json.load(f)["data"]["tossups"]
    new_questions = []
    # enumerate(start=start_idx) replaces the manual idx counter.
    for idx, q in enumerate(raw_questions, start=start_idx):
        new_questions.append({
            "qanta_id": idx,
            "text": q["text"],
            "answer": q["answer"],
            "page": None,
            "category": None,
            "subcategory": None,
            "tournament": q["tournament"]["name"],
            "difficulty": q["tournament"]["difficulty"],
            "year": int(q["tournament"]["year"]),
            "proto_id": None,
            "qdb_id": q["id"],
            "dataset": "quizdb.org",
            "fold": "guesstest",
        })
    # Map answers for old + new questions together; only the mappings of the
    # new questions are ultimately used (see docstring).
    questions = unmapped_questions + new_questions
    answer_map, amb_answer_map, unbound_answers, report = create_answer_map(questions)
    with safe_open("data/external/high_school_project/automatic_report.json", "w") as f:
        json.dump(report, f)
    write_answer_map(
        answer_map,
        amb_answer_map,
        unbound_answers,
        "data/external/high_school_project/answer_map.json",
        "data/external/high_school_project/unbound_answers.json",
    )
    with open("data/internal/page_assignment/unmappable.yaml") as f:
        # Fix: yaml.load without an explicit Loader is deprecated and unsafe;
        # this file only needs plain YAML.
        unmappable = yaml.safe_load(f)
    page_assigner = PageAssigner()
    mapping_report = unmapped_to_mapped_questions(
        new_questions, answer_map, amb_answer_map, unmappable, page_assigner
    )
    add_sentences_(new_questions)
    with open("data/external/high_school_project/qanta.acf-regionals-2018.json", "w") as f:
        json.dump(format_qanta_json(new_questions, DS_VERSION), f)
    with open("data/external/high_school_project/mapping_report.json", "w") as f:
        json.dump(mapping_report, f)
def trick_to_ds(answer_map_path, qanta_ds_path, wiki_titles_path, trick_path,
                id_model_path, out_path, start_idx, version, fold, year,
                tournament, separate_rounds):
    """Build a qanta-format dataset from trickme adversarial questions.

    Pages are found by exact title / known page, then a lowercase lookup,
    then a manual answer map (None entries mean explicit skip). Unresolved
    answers are logged and counted as skipped. Output goes to out_path (or
    one file per round) plus an id -> model map at id_model_path.
    """
    with open(answer_map_path) as f:
        # Fix: replace deprecated/unsafe no-Loader yaml.load with safe_load.
        answer_map = yaml.safe_load(f)
    with open(qanta_ds_path) as f:
        qanta_ds = json.load(f)['questions']
    answer_set = {q['page'] for q in qanta_ds if q['page'] is not None}
    with open(wiki_titles_path) as f:
        titles = set(json.load(f))
    lookup = {a.lower().replace(' ', '_'): a for a in answer_set}
    id_model_map = {}
    skipped = 0
    with open(trick_path) as f:
        questions = []
        for i, q in enumerate(json.load(f)):
            # Field names vary between exports ('Question' vs 'question').
            if 'Question' in q:
                text = q['Question']
            elif 'question' in q:
                text = q['question']
            else:
                raise ValueError('Could not find question field in question')
            if 'Answer' in q:
                answer = q['Answer'].replace(' ', '_')
            elif 'answer' in q:
                answer = q['answer'].replace(' ', '_')
            else:
                raise ValueError('Could not find answer field in question')
            if 'trick_id' in q:
                trick_id = q['trick_id']
            else:
                trick_id = None
            if len(answer) == 0:
                raise ValueError(f'Empty answer for trick_id={trick_id}')
            elif len(text) == 0:
                raise ValueError(f'Empty text for trick_id={trick_id}')
            if answer in titles or answer in answer_set:
                page = answer
            elif answer in lookup:
                page = lookup[answer]
            elif answer in answer_map:
                m_page = answer_map[answer]
                if m_page is None:
                    if 'model' in q:
                        log.info(
                            f'Explicitly Skipping {answer}, int-model: {q["model"]}'
                        )
                    else:
                        log.info(f'Explicitly Skipping {answer}')
                    continue  # Skip this explicitly
                elif m_page in answer_set:
                    page = m_page
                else:
                    raise ValueError(f'{m_page} not in answer set\n Q: {text}')
            else:
                log.error(
                    f'Unhandled Skipping: idx: {i} trick_id: {trick_id} A: "{answer}"\nQ:"{text}"'
                )
                skipped += 1
                continue
            q_out = {
                'text': text,
                'answer': answer,
                'page': page,
                'fold': fold,
                'year': year,
                'dataset': 'trickme',
                'proto_id': None,
                'qdb_id': None,
                'difficulty': None,
                'category': None,
                'subcategory': None,
                'qanta_id': start_idx + i,
                'tournament': tournament,
                'gameplay': False,
                'trick_id': trick_id
            }
            # Optional metadata carried through when present.
            if 'email' in q:
                q_out['author_email'] = q['email']
            if 'category' in q and q['category'] != "None":
                q_out['category'] = q['category']
            if 'round' in q:
                q_out['round'] = q['round']
            if 'model' in q:
                id_model_map[q_out['qanta_id']] = q['model']
            questions.append(q_out)
    log.info(f'Total: {len(questions)} Skipped: {skipped}')
    add_sentences_(questions, parallel=False)
    if separate_rounds:
        rounds = defaultdict(list)
        for q in questions:
            rounds[q['round']].append(q)
        for name, round_questions in rounds.items():
            dataset = format_qanta_json(round_questions, version)
            # Insert the round name before the .json suffix (or append both
            # when there is none); the original duplicated extend() in both
            # branches.
            file_name = out_path.split('.')
            if file_name[-1] == 'json':
                file_name.pop()
            file_name.extend([name, 'json'])
            round_out_path = '.'.join(file_name)
            log.info(f'Writing round {name} to {round_out_path}')
            with open(round_out_path, 'w') as f:
                json.dump(dataset, f)
    else:
        dataset = format_qanta_json(questions, version)
        with open(out_path, 'w') as f:
            json.dump(dataset, f)
    with open(id_model_path, 'w') as f:
        json.dump(id_model_map, f)
def nonnaqt_to_json(csv_input, json_dir):
    """Convert the non-NAQT sentence-level CSV into qanta-format JSON files.

    The CSV has one row per sentence (qnum, sent, text, page, fold); rows are
    regrouped by question, joined into full texts with per-sentence character
    offsets, then written as mapped/train/dev/test plus torchtext splits.
    """
    question_sentences = defaultdict(list)
    with open(csv_input) as f:
        csv_rows = list(csv.reader(f))
    # Skip the header row; each remaining row must have exactly 5 columns.
    for r in csv_rows[1:]:
        if len(r) != 5:
            raise ValueError('Invalid csv row, must have 5 columns')
        qnum, sent, text, page, fold = r
        qnum = int(qnum)
        sent = int(sent)
        question_sentences[qnum].append({
            'qnum': qnum,
            'sent': sent,
            'text': text,
            'page': page,
            'fold': fold
        })
    questions = []
    for sentences in tqdm.tqdm(question_sentences.values()):
        # Reassemble the full question text in sentence order and record
        # (start, end) character offsets for each sentence.
        ordered_sentences = sorted(sentences, key=lambda s: s['sent'])
        text = ' '.join(s['text'] for s in ordered_sentences)
        tokenizations = []
        position = 0
        for i in range(len(ordered_sentences)):
            sent = ordered_sentences[i]['text']
            length = len(sent)
            tokenizations.append((position, position + length))
            position += length + 1  # +1 accounts for the joining space
        # Question-level fields (page, fold) are taken from the first sentence.
        q = ordered_sentences[0]
        questions.append({
            'answer': '',
            'category': '',
            'subcategory': '',
            'tournament': '',
            'year': -1,
            'dataset': 'non_naqt',
            'difficulty': '',
            'first_sentence': ordered_sentences[0]['text'],
            'qanta_id': q['qnum'],
            'fold': q['fold'],
            'gameplay': False,
            'page': q['page'],
            'proto_id': None,
            'qdb_id': None,
            'text': text,
            'tokenizations': tokenizations
        })
    train_questions = [q for q in questions if q['fold'] == 'guesstrain']
    dev_questions = [q for q in questions if q['fold'] == 'guessdev']
    test_questions = [q for q in questions if q['fold'] == 'test']
    # Normalize the legacy 'test' fold name to 'guesstest'.
    for q in test_questions:
        q['fold'] = 'guesstest'
    from qanta.ingestion.preprocess import format_qanta_json
    from qanta.util.constants import DS_VERSION
    with open(path.join(json_dir, f'qanta.mapped.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(questions, DS_VERSION), f)
    with open(path.join(json_dir, f'qanta.train.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(train_questions, DS_VERSION), f)
    with open(path.join(json_dir, f'qanta.dev.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(dev_questions, DS_VERSION), f)
    with open(path.join(json_dir, f'qanta.test.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(test_questions, DS_VERSION), f)
    from sklearn.model_selection import train_test_split
    # Deterministic 90/10 split of guesstrain for torchtext train/val.
    guess_train, guess_val = train_test_split(train_questions, random_state=42, train_size=.9)
    with open(path.join(json_dir, f'qanta.torchtext.train.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(guess_train, DS_VERSION), f)
    with open(path.join(json_dir, f'qanta.torchtext.val.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(guess_val, DS_VERSION), f)
    with open(path.join(json_dir, f'qanta.torchtext.dev.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(dev_questions, DS_VERSION), f)
def run(self):
    """Add sentence tokenizations to the unmapped dataset and write it out."""
    with open(QANTA_UNMAPPED_DATASET_PATH) as in_f:
        questions = json.load(in_f)['questions']
    add_sentences_(questions)  # mutates the question dicts in place
    with open(QANTA_PREPROCESSED_DATASET_PATH, 'w') as out_f:
        json.dump(format_qanta_json(questions, DS_VERSION), out_f)
def run(self):
    """Preprocess: attach sentence spans, then serialize in qanta format."""
    with open(QANTA_UNMAPPED_DATASET_PATH) as f:
        dataset = json.load(f)
    qanta_questions = dataset['questions']
    # add_sentences_ mutates each question dict in place.
    add_sentences_(qanta_questions)
    payload = format_qanta_json(qanta_questions, DS_VERSION)
    with open(QANTA_PREPROCESSED_DATASET_PATH, 'w') as f:
        json.dump(payload, f)