Example no. 1
0
def adversarial_to_json(adversarial_json, json_dir):
    from qanta.datasets.quiz_bowl import QantaDatabase
    db = QantaDatabase()
    lookup = {q.page.lower(): q.page for q in db.mapped_questions}
    with open(adversarial_json) as f:
        questions = json.load(f)
        rows = []
        for i, q in enumerate(questions):
            answer = q['answer'].strip().replace(' ', '_')
            # The lookup is keyed by lowercased page titles, so query with the
            # lowercased answer to make the match case-insensitive.
            if answer.lower() in lookup:
                answer = lookup[answer.lower()]
            else:
                log.warning(f'Could not find: {answer}')
            rows.append({
                'text': q['question'].strip(),
                'page': answer,
                'answer': '',
                'qanta_id': 1000000 + i,
                'proto_id': None,
                'qdb_id': None,
                'category': '',
                'subcategory': '',
                'tournament': '',
                'difficulty': '',
                'dataset': 'adversarial',
                'year': -1,
                'fold': 'expo',
                'gameplay': False
            })

    from qanta.ingestion.preprocess import add_sentences_, format_qanta_json
    from qanta.util.constants import DS_VERSION
    add_sentences_(rows, parallel=False)
    with open(path.join(json_dir, f'qanta.expo.{DS_VERSION}.json'), 'w') as f:
        json.dump(format_qanta_json(rows, DS_VERSION), f)
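The conversion above hinges on mapping free-text answers to canonical Wikipedia page titles through a lowercase-keyed lookup. Here is a minimal, self-contained sketch of just that normalization step; the canonical_pages list, the normalize_answer helper, and the sample answers are hypothetical placeholders rather than values from the qanta data.

# Hypothetical canonical page titles; in the function above these come from
# db.mapped_questions.
canonical_pages = ['Albert_Einstein', 'Photoelectric_effect']
lookup = {p.lower(): p for p in canonical_pages}

def normalize_answer(raw_answer):
    """Map a raw answer string onto its canonical page title when possible."""
    answer = raw_answer.strip().replace(' ', '_')
    # Query with the lowercased form so the match is case-insensitive.
    return lookup.get(answer.lower(), answer)

print(normalize_answer('albert einstein'))  # -> 'Albert_Einstein'
print(normalize_answer('Unknown Person'))   # -> 'Unknown_Person'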
Example no. 2
0
def create_wikipedia_redirect_pickle(redirect_csv, output_pickle):
    countries = {}
    with open(COUNTRY_LIST_PATH) as f:
        for line in f:
            k, v = line.split('\t')
            countries[k] = v.strip()

    db = QantaDatabase()
    pages = {q.page for q in db.train_questions}

    with open(redirect_csv) as redirect_f:
        redirects = {}
        n_total = 0
        n_selected = 0
        for row in csv.reader(redirect_f, quotechar='"', escapechar='\\'):
            n_total += 1
            source = row[0]
            target = row[1]
            if (target not in pages or source in countries
                    or target.startswith('WikiProject')
                    or target.endswith('_topics')
                    or target.endswith('_(overview)')):
                continue
            redirects[source] = target
            n_selected += 1

        log.info(
            f'Filtered {n_total} raw Wikipedia redirects '
            f'to {n_selected} matching redirects')

    with open(output_pickle, 'wb') as output_f:
        pickle.dump(redirects, output_f)
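The redirect filter keeps only rows whose target is a known answer page and whose source is not a country alias or a meta page. The following small, self-contained sketch shows the same filtering pattern over an in-memory CSV; the sample rows and the pages and countries sets are invented for illustration.

import csv
import io

pages = {'Paris', 'Photosynthesis'}
countries = {'France'}
raw_csv = ('France,Paris\n'
           'Paris_(city),Paris\n'
           'Light_reaction,Photosynthesis\n'
           'Foo,WikiProject_Biology\n')

redirects = {}
for source, target in csv.reader(io.StringIO(raw_csv)):
    # Drop redirects to non-answer pages, country aliases, and meta pages.
    if (target not in pages or source in countries
            or target.startswith('WikiProject')
            or target.endswith('_topics')
            or target.endswith('_(overview)')):
        continue
    redirects[source] = target

print(redirects)  # {'Paris_(city)': 'Paris', 'Light_reaction': 'Photosynthesis'}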
Example no. 3
0
def create_wikipedia_cache(
        parsed_wiki_path='data/external/wikipedia/parsed-wiki',
        output_path=WIKI_LOOKUP_PATH):
    from qanta.spark import create_spark_context

    sc = create_spark_context()
    db = QantaDatabase()
    train_questions = db.train_questions
    answers = {q.page for q in train_questions}
    b_answers = sc.broadcast(answers)
    # Paths used in Spark must be absolute, and the directory must already exist
    page_path = os.path.abspath(parsed_wiki_path)
    page_pattern = os.path.join(page_path, '*', '*')

    def parse_page(json_text):
        page = json.loads(json_text)
        return {
            'id': int(page['id']),
            'title': page['title'].replace(' ', '_'),
            'text': page['text'],
            'url': page['url']
        }

    wiki_pages = sc.textFile(page_pattern).map(parse_page).filter(
        lambda p: p['title'] in b_answers.value).collect()
    wiki_lookup = {p['title']: p for p in wiki_pages}
    with open(output_path, 'w') as f:
        json.dump(wiki_lookup, f)

    return wiki_lookup
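Broadcasting the answer set means every executor filters pages against a local read-only copy instead of shipping the set with each task. Below is a minimal local-mode sketch of this broadcast-and-filter pattern, assuming pyspark is installed; the page records and answer set are made up for illustration.

import json
from pyspark import SparkContext

sc = SparkContext('local[*]', 'wiki-filter-sketch')
answers = {'Albert_Einstein'}
b_answers = sc.broadcast(answers)

pages_json = [
    json.dumps({'id': '1', 'title': 'Albert Einstein', 'text': '...', 'url': 'https://example.org/1'}),
    json.dumps({'id': '2', 'title': 'Some Other Page', 'text': '...', 'url': 'https://example.org/2'}),
]

def parse_page(json_text):
    page = json.loads(json_text)
    return {'id': int(page['id']), 'title': page['title'].replace(' ', '_'),
            'text': page['text'], 'url': page['url']}

# Keep only pages whose normalized title is a known answer.
kept = (sc.parallelize(pages_json)
          .map(parse_page)
          .filter(lambda p: p['title'] in b_answers.value)
          .collect())
print({p['title'] for p in kept})  # {'Albert_Einstein'}
sc.stop()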
Example no. 4
0
def create_answer_mapping_csvs(output_dir='data/external/answer_mapping'):
    with open(QANTA_MAP_REPORT_PATH) as f:
        report = json.load(f)
        match_report = report['match_report']
    db = QantaDatabase()
    qb_lookup: Dict[int, Question] = {q.qanta_id: q for q in db.all_questions}
    train_rows = unmapped_rows(match_report, report['train_unmatched'])
    test_rows = unmapped_rows(match_report, report['test_unmatched'])
    train_df = pd.DataFrame.from_records(train_rows, columns=UNMAPPED_COLUMNS)
    test_df = pd.DataFrame.from_records(test_rows, columns=UNMAPPED_COLUMNS)
    train_df.to_csv(os.path.join(output_dir, 'unmapped_train.csv'))
    test_df.to_csv(os.path.join(output_dir, 'unmapped_test.csv'))

    disagree_rows = []
    for qanta_id, row in match_report.items():
        if row['result'] == 'disagree':
            q = qb_lookup[int(qanta_id)]
            start, end = q.tokenizations[-1]
            is_train = q.fold == GUESSER_TRAIN_FOLD or q.fold == BUZZER_TRAIN_FOLD
            disagree_rows.append(
                ('disagree', None, q.proto_id, q.qdb_id, q.qanta_id, is_train,
                 q.text[start:end], q.answer, row['automatic_page'],
                 row['annotated_page']))
    disagree_df = pd.DataFrame.from_records(disagree_rows,
                                            columns=DISAGREE_COLUMNS)
    disagree_df[disagree_df.is_train].to_csv(
        os.path.join(output_dir, 'disagree_train.csv'))
    disagree_df[~disagree_df.is_train].to_csv(
        os.path.join(output_dir, 'disagree_test.csv'))
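The final step writes the disagreement records out split by train/test membership. A minimal pandas sketch of that split-by-boolean-column pattern follows; the column names and rows are illustrative only, not the real DISAGREE_COLUMNS schema.

import pandas as pd

df = pd.DataFrame.from_records(
    [('disagree', 1, True, 'Page_A', 'Page_B'),
     ('disagree', 2, False, 'Page_C', 'Page_D')],
    columns=['result', 'qanta_id', 'is_train', 'automatic_page', 'annotated_page'])

# Boolean masks keep the train and test slices separate before writing each CSV.
df[df.is_train].to_csv('disagree_train.csv')
df[~df.is_train].to_csv('disagree_test.csv')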
Example no. 5
0
def create_wikipedia_cache(
        parsed_wiki_path="data/external/wikipedia/parsed-wiki",
        output_path=WIKI_LOOKUP_PATH):
    from qanta.spark import create_spark_context

    sc = create_spark_context()
    db = QantaDatabase()
    train_questions = db.train_questions
    answers = {q.page for q in train_questions}
    b_answers = sc.broadcast(answers)
    # Paths used in Spark must be absolute, and the directory must already exist
    page_path = os.path.abspath(parsed_wiki_path)
    page_pattern = os.path.join(page_path, "*", "*")

    def parse_page(json_text):
        page = json.loads(json_text)
        return {
            "id": int(page["id"]),
            "title": page["title"].replace(" ", "_"),
            "text": page["text"],
            "url": page["url"],
        }

    wiki_pages = (sc.textFile(page_pattern).map(parse_page).filter(
        lambda p: p["title"] in b_answers.value).collect())
    wiki_lookup = {p["title"]: p for p in wiki_pages}
    with open(output_path, "w") as f:
        json.dump(wiki_lookup, f)

    return wiki_lookup
Example no. 6
0
    def create_report(self, directory: str, fold):
        with open(os.path.join(directory, f"guesser_params.pickle"),
                  "rb") as f:
            params = pickle.load(f)

        qdb = QantaDatabase()
        guesser_train = qdb.guess_train_questions
        questions_by_fold = qdb.by_fold()
        guesser_report_questions = questions_by_fold[fold]

        train_pages = {q.page for q in guesser_train}
        dev_pages = {q.page for q in guesser_report_questions}

        unanswerable_answer_percent = len(dev_pages -
                                          train_pages) / len(dev_pages)
        answerable = 0
        for q in guesser_report_questions:
            if q.page in train_pages:
                answerable += 1
        unanswerable_question_percent = 1 - answerable / len(
            guesser_report_questions)

        train_example_counts = Counter()
        for q in guesser_train:
            train_example_counts[q.page] += 1

        dev_df = pd.DataFrame({
            "page": [q.page for q in guesser_report_questions],
            "qanta_id": [q.qanta_id for q in guesser_report_questions],
            "text_length": [len(q.text) for q in guesser_report_questions],
            "n_train":
            [train_example_counts[q.page] for q in guesser_report_questions],
            "category": [q.category for q in guesser_report_questions],
        })

        char_guess_df = AbstractGuesser.load_guesses(directory,
                                                     folds=[fold],
                                                     output_type="char")
        char_df = char_guess_df.merge(dev_df, on="qanta_id")
        char_df["correct"] = (char_df.guess == char_df.page).astype("int")
        char_df["char_percent"] = (char_df["char_index"] /
                                   char_df["text_length"]).clip_upper(1.0)

        first_guess_df = AbstractGuesser.load_guesses(directory,
                                                      folds=[fold],
                                                      output_type="first")
        first_df = first_guess_df.merge(dev_df, on="qanta_id").sort_values(
            "score", ascending=False)
        first_df["correct"] = (first_df.guess == first_df.page).astype("int")
        grouped_first_df = first_df.groupby("qanta_id")
        first_accuracy = grouped_first_df.nth(0).correct.mean()
        first_recall = grouped_first_df.agg({"correct": "max"}).correct.mean()

        full_guess_df = AbstractGuesser.load_guesses(directory,
                                                     folds=[fold],
                                                     output_type="full")
        full_df = full_guess_df.merge(dev_df, on="qanta_id").sort_values(
            "score", ascending=False)
        full_df["correct"] = (full_df.guess == full_df.page).astype("int")
        grouped_full_df = full_df.groupby("qanta_id")
        full_accuracy = grouped_full_df.nth(0).correct.mean()
        full_recall = grouped_full_df.agg({"correct": "max"}).correct.mean()

        with open(os.path.join(directory, f"guesser_report_{fold}.pickle"),
                  "wb") as f:
            pickle.dump(
                {
                    "first_accuracy": first_accuracy,
                    "first_recall": first_recall,
                    "full_accuracy": full_accuracy,
                    "full_recall": full_recall,
                    "char_df": char_df,
                    "first_df": first_df,
                    "full_df": full_df,
                    "n_guesses": conf["n_guesses"],
                    "unanswerable_answer_percent": unanswerable_answer_percent,
                    "unanswerable_question_percent":
                    unanswerable_question_percent,
                    "guesser_name": self.display_name(),
                    "guesser_params": params,
                },
                f,
            )
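The accuracy and recall numbers above come from sorting guesses by score, grouping by question, and then checking either the top-ranked guess (nth(0)) or the best guess anywhere in the group (max of correct). A minimal pandas sketch of that pattern on an invented guess frame:

import pandas as pd

guesses = pd.DataFrame({
    "qanta_id": [1, 1, 2, 2],
    "guess": ["A", "B", "X", "Y"],
    "page": ["A", "A", "Y", "Y"],
    "score": [0.9, 0.1, 0.6, 0.4],
})
guesses = guesses.sort_values("score", ascending=False)
guesses["correct"] = (guesses.guess == guesses.page).astype("int")

grouped = guesses.groupby("qanta_id")
# Accuracy: is the single highest-scoring guess for each question correct?
accuracy = grouped.nth(0).correct.mean()
# Recall: is the correct page anywhere in the guess list for each question?
recall = grouped.agg({"correct": "max"}).correct.mean()
print(accuracy, recall)  # 0.5 1.0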