Beispiel #1
1
 def output(self):
     """Return a ``LocalTarget`` for the guess file of every buzzer input fold.

     Each path is built by ``AbstractGuesser.guess_path`` from
     ``bc.GUESSES_DIR`` and one fold of ``c.BUZZER_INPUT_FOLDS``; the
     targets are returned in fold order.
     """
     targets = []
     for fold in c.BUZZER_INPUT_FOLDS:
         guess_file = AbstractGuesser.guess_path(bc.GUESSES_DIR, fold)
         targets.append(LocalTarget(guess_file))
     return targets
Beispiel #2
1
def stack(model_dir, model_name, fold=BUZZER_DEV_FOLD):
    """Tabulate how buzzer and oracle decisions distribute over question position.

    Reads the char-level RnnGuesser guesses for ``fold`` and the pickled
    buzzes stored at ``<model_dir>/<fold>_buzzes.pkl``.  Each buzz position
    of each question is labelled:

    * 'Only optimal' - the first guess at that position equals the question
      page, but the second score does not exceed the first;
    * 'Only buzzer'  - the second score exceeds the first, but the guess
      differs from the page;
    * 'Both' / 'Neither' - both or none of the above.

    Positions are expressed relative to the question text length and rounded
    to one decimal.  Label counts are divided by the number of positions
    that landed in the same bucket.

    Returns:
        DataFrame with columns 'Position', 'Buzzing', 'Frequency' and
        'Model' (filled with ``model_name``).  The same frame is pickled to
        ``<model_dir>/<fold>_stack.pkl``.
    """
    rnn_dir = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guess_file = AbstractGuesser.guess_path(rnn_dir, fold, 'char')
    with open(guess_file, 'rb') as f:
        guesses_by_qid = pickle.load(f)
    guesses_by_qid = guesses_by_qid.groupby('qanta_id')

    buzz_file = os.path.join(model_dir, '{}_buzzes.pkl'.format(fold))
    with open(buzz_file, 'rb') as f:
        buzzes = pickle.load(f)

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    question_by_id = {q.qanta_id: q for q in fold_questions}

    # (oracle is right, buzzer fires) -> label; the four cases are exhaustive.
    labels = {
        (True, False): 'Only optimal',
        (False, True): 'Only buzzer',
        (True, True): 'Both',
        (False, False): 'Neither',
    }

    positions = []
    buzzing = []
    bucket_sizes = defaultdict(int)
    for qid, (char_indices, scores) in buzzes.items():
        per_index = guesses_by_qid.get_group(qid).groupby('char_index')
        # Maps char_index -> first guess recorded at that index.
        first_guess = per_index.aggregate(lambda x: x.head(1)).to_dict()['guess']
        question = question_by_id[qid]
        text_len = len(question.text)
        for i, char_index in enumerate(char_indices):
            oracle_right = bool(first_guess[char_index] == question.page)
            buzzer_fires = bool(scores[i][1] > scores[i][0])

            bucket = np.round(char_index / text_len, decimals=1)
            bucket_sizes[bucket] += 1

            positions.append(bucket)
            buzzing.append(labels[oracle_right, buzzer_fires])

    freq = pd.DataFrame({'Position': positions, 'Buzzing': buzzing})
    freq = freq.groupby(['Position', 'Buzzing']).size().reset_index()
    freq = freq.rename(columns={0: 'Frequency'})
    # Normalise each count by the total number of positions in its bucket.
    freq['Frequency'] = freq.apply(
        lambda row: row['Frequency'] / bucket_sizes[row['Position']],
        axis=1)
    freq['Model'] = pd.Series([model_name] * len(freq))

    stack_file = os.path.join(model_dir, '{}_stack.pkl'.format(fold))
    with open(stack_file, 'wb') as f:
        pickle.dump(freq, f)

    return freq
Beispiel #3
1
import pickle
from qanta.guesser.abstract import AbstractGuesser
from qanta.guesser.dan import DanGuesser
from qanta.util.constants import BUZZER_TRAIN_FOLD, BUZZER_DEV_FOLD

# Locate and load the saved DanGuesser (configuration 0).
guesser_directory = AbstractGuesser.output_path(
    'qanta.guesser.dan', 'DanGuesser', 0, '')
guesser = DanGuesser.load(guesser_directory)  # type: AbstractGuesser
# Shrink the batch size by a factor of 8.  Floor division keeps it an
# integer (``/=`` would silently turn it into a float under Python 3), and
# the clamp prevents a batch size of zero.
guesser.batch_size = max(1, guesser.batch_size // 8)

# Generate guesses for both buzzer folds and pickle each result next to the
# guesser, at the path given by AbstractGuesser.guess_path.
word_skip = 2
folds = [BUZZER_TRAIN_FOLD, BUZZER_DEV_FOLD]
for fold in folds:
    df = guesser.generate_guesses(1, [fold], word_skip=word_skip)
    output_path = AbstractGuesser.guess_path(guesser_directory, fold)
    with open(output_path, 'wb') as f:
        pickle.dump(df, f)
Beispiel #4
0
def merge_dfs():
    """Merge the per-guesser guess DataFrames into one DataFrame per fold.

    For every fold in ``c.BUZZER_INPUT_FOLDS`` the guesses of each enabled
    guesser are loaded from ``<c.GUESSER_TARGET_PREFIX>/<module>.<class>``,
    concatenated, and saved under ``<c.GUESSER_TARGET_PREFIX>/merged``.
    Folds whose merged guess file already exists are skipped.  The columns
    'qnum', 'sentence', 'token' and 'score' are converted with
    ``pd.to_numeric(..., downcast='integer')`` before saving.
    """
    guesser_names = [
        "{0}.{1}".format(x.guesser_module, x.guesser_class)
        for x in AbstractGuesser.list_enabled_guessers()
    ]
    log.info("Merging guesser DataFrames.")
    merged_dir = os.path.join(c.GUESSER_TARGET_PREFIX, 'merged')
    # exist_ok avoids the race between an existence check and the creation.
    os.makedirs(merged_dir, exist_ok=True)
    for fold in c.BUZZER_INPUT_FOLDS:
        if os.path.exists(AbstractGuesser.guess_path(merged_dir, fold)):
            log.info("Merged {0} exists, skipping.".format(fold))
            continue
        # The leading empty frame fixes the expected columns even when no
        # guesser is enabled.
        frames = [
            pd.DataFrame(
                columns=[
                    'fold', 'guess', 'guesser', 'qnum', 'score', 'sentence',
                    'token'
                ],
                dtype='object')
        ]
        for guesser in guesser_names:
            guesser_dir = os.path.join(c.GUESSER_TARGET_PREFIX, guesser)
            frames.append(
                AbstractGuesser.load_guesses(guesser_dir, folds=[fold]))
        # A single concat replaces repeated DataFrame.append, which was
        # removed in pandas 2.0 and copied the accumulated frame every call.
        new_guesses = pd.concat(frames)
        for col in ['qnum', 'sentence', 'token', 'score']:
            new_guesses[col] = pd.to_numeric(new_guesses[col],
                                             downcast='integer')
        AbstractGuesser.save_guesses(new_guesses, merged_dir, folds=[fold])
        log.info("Merging: {0} finished.".format(fold))
Beispiel #5
0
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Run ``simulate_game`` on every question of ``fold`` and pickle the results.

    Inputs gathered here: the buzzes from ``get_buzzes(model, fold)``, the
    char-level RnnGuesser guesses grouped by 'qanta_id', and the frame from
    ``load_protobowl()`` grouped by 'qid'.  ``simulate_game`` is expected to
    return a pair of sequences per question; these are concatenated into the
    'Possibility' and 'Outcome' columns of a DataFrame written to
    ``<model.model_dir>/<fold>_protobowl.pkl``.  Nothing is returned.
    """
    buzzes = get_buzzes(model, fold)

    # eval
    rnn_dir = AbstractGuesser.output_path('qanta.guesser.rnn',
                                          'RnnGuesser', 0, '')
    guess_file = AbstractGuesser.guess_path(rnn_dir, fold, 'char')
    with open(guess_file, 'rb') as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby('qanta_id')

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]

    records = load_protobowl().groupby('qid')

    possibility, outcome = [], []
    for question in tqdm(fold_questions):
        pos, out = simulate_game(guesses, buzzes, records, question)
        possibility.extend(pos)
        outcome.extend(out)

    result_df = pd.DataFrame({'Possibility': possibility, 'Outcome': outcome})

    result_file = os.path.join(model.model_dir,
                               '{}_protobowl.pkl'.format(fold))
    with open(result_file, 'wb') as f:
        pickle.dump(result_df, f)
Beispiel #6
0
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Collect ``simulate_game`` results for ``fold`` and store them as a pickle.

    The buzzes come from ``get_buzzes(model, fold)``; the guesses are the
    char-level RnnGuesser guesses grouped by "qanta_id"; the third input is
    ``load_protobowl()`` grouped by "qid".  For each question the two
    sequences returned by ``simulate_game`` are appended to the
    "Possibility" and "Outcome" columns.  The resulting DataFrame is written
    to ``<model.model_dir>/<fold>_protobowl.pkl``; the function returns None.
    """
    buzzes = get_buzzes(model, fold)

    # eval
    guesser_root = AbstractGuesser.output_path(
        "qanta.guesser.rnn", "RnnGuesser", 0, ""
    )
    guesses_path = AbstractGuesser.guess_path(guesser_root, fold, "char")
    with open(guesses_path, "rb") as f:
        grouped_guesses = pickle.load(f)
    grouped_guesses = grouped_guesses.groupby("qanta_id")

    by_fold = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    fold_questions = by_fold[fold]

    grouped_records = load_protobowl()
    grouped_records = grouped_records.groupby("qid")

    play = partial(simulate_game, grouped_guesses, buzzes, grouped_records)

    columns = {"Possibility": [], "Outcome": []}
    for question in tqdm(fold_questions):
        pos, out = play(question)
        columns["Possibility"].extend(pos)
        columns["Outcome"].extend(out)

    result_df = pd.DataFrame(columns)

    out_name = "{}_protobowl.pkl".format(fold)
    with open(os.path.join(model.model_dir, out_name), "wb") as f:
        pickle.dump(result_df, f)
Beispiel #7
0
def read_data(fold,
              output_type='char',
              guesser_module='qanta.guesser.dan',
              guesser_class='DanGuesser',
              guesser_config_num=0,
              vector_converter=vector_converter_0):
    """Build (or load from cache) the buzzer dataset for ``fold``.

    If ``dataset_dir.format(fold)`` already exists, its pickled content is
    returned directly.  Otherwise the guesses of the selected guesser are
    loaded, grouped by 'qanta_id', and each group is passed to
    ``process_question(questions, vector_converter, group)`` in a pool of 8
    worker processes.  The result list is pickled to the cache path and
    returned.

    Args:
        fold: name of the fold to read.
        output_type: third argument to ``AbstractGuesser.guess_path``.
        guesser_module: module of the guesser whose output is read.
        guesser_class: class name of that guesser.
        guesser_config_num: configuration number of that guesser.
        vector_converter: callable forwarded to ``process_question``.

    Returns:
        The dataset, whether it came from the cache or was just computed.
    """
    cache_path = dataset_dir.format(fold)
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    g_dir = AbstractGuesser.output_path(guesser_module, guesser_class,
                                        guesser_config_num, '')
    g_path = AbstractGuesser.guess_path(g_dir, fold, output_type)
    with open(g_path, 'rb') as f:
        df = pickle.load(f)
    df_groups = df.groupby('qanta_id')

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions[fold]}

    worker = partial(process_question, questions, vector_converter)
    pool = Pool(8)
    try:
        dataset = pool.map(worker, df_groups)
    finally:
        # Release the worker processes instead of leaking them.
        pool.close()
        pool.join()

    with open(cache_path, 'wb') as f:
        # pickle.dump returns None, so its result must not be returned;
        # otherwise the first (uncached) call would yield None.
        pickle.dump(dataset, f)

    return dataset
Beispiel #8
0
def ew(model, fold=BUZZER_DEV_FOLD):
    """Score the buzzes of ``model`` on ``fold`` with ``CurveScore``.

    For every question in the buzzes, one record per buzz position is built
    with the char index, the first guess stored for that index, and whether
    the second score exceeds the first ('buzz').  Each question's records
    are scored with ``CurveScore.score`` and ``CurveScore.score_optimal``.

    Returns:
        dict with the summed scores ('expected_wins',
        'expected_wins_optimal') and the number of questions
        ('n_examples').  The dict is also printed as JSON.
    """
    buzzes = get_buzzes(model, fold)

    rnn_dir = AbstractGuesser.output_path('qanta.guesser.rnn',
                                          'RnnGuesser', 0, '')
    guess_file = AbstractGuesser.guess_path(rnn_dir, fold, 'char')
    with open(guess_file, 'rb') as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby('qanta_id')

    answers = {}
    for qid, bs in buzzes.items():
        per_index = guesses.get_group(qid).groupby('char_index')
        answers[qid] = [
            {
                'char_index': char_index,
                'guess': per_index.get_group(char_index).head(1)['guess'].values[0],
                'buzz': scores[0] < scores[1],
            }
            for char_index, scores in zip(*bs)
        ]

    fold_questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    question_by_id = {q.qanta_id: q for q in fold_questions}

    curve_score = CurveScore()
    actual_scores = []
    optimal_scores = []
    for qid, answer in answers.items():
        question = question_by_id[qid]
        q = {'text': question.text, 'page': question.page}
        actual_scores.append(curve_score.score(answer, q))
        optimal_scores.append(curve_score.score_optimal(answer, q))

    eval_out = {
        'expected_wins': sum(actual_scores),
        'n_examples': len(actual_scores),
        'expected_wins_optimal': sum(optimal_scores),
    }
    print(json.dumps(eval_out))
    return eval_out
Beispiel #9
0
def ew(model, fold=BUZZER_DEV_FOLD):
    """Compute summed ``CurveScore`` results for the buzzes of ``model``.

    The buzzes from ``get_buzzes(model, fold)`` are paired with the
    char-level RnnGuesser guesses: at each buzz position the first stored
    guess is taken, and 'buzz' is true when the second score is larger than
    the first.  Every question is then scored with ``CurveScore.score`` and
    ``CurveScore.score_optimal`` against its text and page.

    Returns:
        dict with keys 'expected_wins', 'n_examples' and
        'expected_wins_optimal'; it is printed as JSON before returning.
    """
    buzzes = get_buzzes(model, fold)

    guesser_root = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(guesser_root, fold, 'char')
    with open(guesses_path, 'rb') as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby('qanta_id')

    def first_guess_at(groups, char_index):
        # First guess row recorded for this character index.
        return groups.get_group(char_index).head(1)['guess'].values[0]

    answers = {}
    for qid, bs in buzzes.items():
        records = []
        answers[qid] = records
        groups = guesses.get_group(qid).groupby('char_index')
        for char_index, scores in zip(*bs):
            records.append(dict(
                char_index=char_index,
                guess=first_guess_at(groups, char_index),
                buzz=scores[0] < scores[1],
            ))

    by_fold = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    lookup = {q.qanta_id: q for q in by_fold[fold]}

    curve_score = CurveScore()
    # One (score, optimal score) pair per question.
    scored = []
    for qid, answer in answers.items():
        question = lookup[qid]
        q = dict(text=question.text, page=question.page)
        scored.append((curve_score.score(answer, q),
                       curve_score.score_optimal(answer, q)))

    eval_out = {
        'expected_wins': sum(actual for actual, _ in scored),
        'n_examples': len(scored),
        'expected_wins_optimal': sum(optimal for _, optimal in scored),
    }
    print(json.dumps(eval_out))
    return eval_out
Beispiel #10
0
def ew(model, fold=BUZZER_DEV_FOLD):
    """Evaluate the buzzes of ``model`` on ``fold`` using ``CurveScore``.

    Builds, per question, a list of {"char_index", "guess", "buzz"} records
    from the buzzes and the char-level RnnGuesser guesses ("guess" is the
    first guess stored for the index, "buzz" is ``scores[0] < scores[1]``),
    then sums ``CurveScore.score`` and ``CurveScore.score_optimal`` over all
    questions.

    Returns:
        dict with "expected_wins", "n_examples" and
        "expected_wins_optimal"; the same dict is printed as JSON.
    """
    buzzes = get_buzzes(model, fold)

    rnn_output = AbstractGuesser.output_path(
        "qanta.guesser.rnn", "RnnGuesser", 0, ""
    )
    rnn_guess_file = AbstractGuesser.guess_path(rnn_output, fold, "char")
    with open(rnn_guess_file, "rb") as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby("qanta_id")

    answers = {}
    for qid, bs in buzzes.items():
        question_answers = answers[qid] = []
        by_char_index = guesses.get_group(qid).groupby("char_index")
        for char_index, scores in zip(*bs):
            top_rows = by_char_index.get_group(char_index).head(1)
            question_answers.append(
                {
                    "char_index": char_index,
                    "guess": top_rows["guess"].values[0],
                    "buzz": scores[0] < scores[1],
                }
            )

    questions = {
        q.qanta_id: q
        for q in QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    }

    curve_score = CurveScore()
    wins = []
    wins_optimal = []
    for qid, answer in answers.items():
        question = questions[qid]
        q = {"text": question.text, "page": question.page}
        wins.append(curve_score.score(answer, q))
        wins_optimal.append(curve_score.score_optimal(answer, q))

    eval_out = {}
    eval_out["expected_wins"] = sum(wins)
    eval_out["n_examples"] = len(wins)
    eval_out["expected_wins_optimal"] = sum(wins_optimal)
    print(json.dumps(eval_out))
    return eval_out
Beispiel #11
0
def export(output_file: str, fold: str = "buzztest"):
    """Export, per question and buzzer, where the oracle and the model buzz.

    Loads the char-level RnnGuesser guesses for ``fold`` and the pickled
    buzzes of the "RNNBuzzer", "ThresholdBuzzer" and "MLPBuzzer" models from
    ``output/buzzer/<name>/<fold>_buzzes.pkl``.  For every question and
    buzzer it records:

    * "oracle" / "oracle_fraction": smallest char index at which the first
      guess equals the question page, absolute and relative to the text
      length (-1 if there is none);
    * "position" / "position_fraction": smallest char index at which the
      second score exceeds the first (-1 if there is none);
    * "q_len", "oracle_guess", "buzz_guess", "answer";
    * "impossible": true when no position has a correct guess.

    The mapping ``qid -> buzzer name -> record`` is written with
    ``write_json(output_file, ...)``.

    Args:
        output_file: destination passed to ``write_json``.
        fold: fold to export.  Previously this argument was overwritten
            with "buzztest" inside the function and therefore ignored.
    """
    guesses_dir = AbstractGuesser.output_path("qanta.guesser.rnn",
                                              "RnnGuesser", 0, "")
    guesses_dir = AbstractGuesser.guess_path(guesses_dir, fold, "char")
    with open(guesses_dir, "rb") as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby("qanta_id")

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions[fold]}

    buzzers = {}
    for name in ["RNNBuzzer", "ThresholdBuzzer", "MLPBuzzer"]:
        model_dir = f"output/buzzer/{name}"
        buzzes_dir = os.path.join(model_dir, "{}_buzzes.pkl".format(fold))
        with open(buzzes_dir, "rb") as f:
            buzzers[name] = pickle.load(f)

    # qid -> {char_index: first guess}.  The table depends only on the
    # question, so it is built once and shared across all buzzers.
    first_guesses = {}
    qid_to_buzzes = defaultdict(dict)
    for name, buzzes in track(buzzers.items()):
        for qid, (char_indices, scores) in buzzes.items():
            if qid not in first_guesses:
                first_guesses[qid] = (
                    guesses.get_group(qid).groupby("char_index").aggregate(
                        lambda x: x.head(1)).to_dict()["guess"])
            gs = first_guesses[qid]
            question = questions[qid]
            q_len = len(question.text)

            # -1 marks "never"; otherwise the smallest qualifying char index.
            buzz_oracle_position = -1
            buzz_model_position = -1
            oracle_guess = None
            buzz_guess = None
            for i, char_index in enumerate(char_indices):
                if gs[char_index] == question.page:
                    if (buzz_oracle_position == -1
                            or char_index < buzz_oracle_position):
                        oracle_guess = question.page
                        buzz_oracle_position = char_index

                if scores[i][1] > scores[i][0]:
                    if (buzz_model_position == -1
                            or char_index < buzz_model_position):
                        buzz_guess = gs[char_index]
                        buzz_model_position = char_index

            if buzz_oracle_position != -1:
                oracle_fraction = buzz_oracle_position / q_len
            else:
                oracle_fraction = -1
            if buzz_model_position != -1:
                position_fraction = buzz_model_position / q_len
            else:
                position_fraction = -1

            qid_to_buzzes[qid][name] = {
                "oracle": buzz_oracle_position,
                "oracle_fraction": oracle_fraction,
                "position": buzz_model_position,
                "position_fraction": position_fraction,
                "q_len": q_len,
                "oracle_guess": oracle_guess,
                "buzz_guess": buzz_guess,
                "answer": question.page,
                "impossible": oracle_guess is None,
            }
    write_json(output_file, qid_to_buzzes)
Beispiel #12
0
 def output(self):
     """Build the list of ``LocalTarget`` objects for the buzzer guess files.

     One target is produced per fold in ``c.BUZZER_INPUT_FOLDS``, wrapping
     the path ``AbstractGuesser.guess_path(bc.GUESSES_DIR, fold)``.
     """
     guess_files = (
         AbstractGuesser.guess_path(bc.GUESSES_DIR, fold)
         for fold in c.BUZZER_INPUT_FOLDS
     )
     return [LocalTarget(guess_file) for guess_file in guess_files]
Beispiel #13
0
def stack(model_dir, model_name, fold=BUZZER_DEV_FOLD):
    """Build the per-position frequency table of buzzer vs. oracle decisions.

    The char-level RnnGuesser guesses for ``fold`` and the buzzes pickled at
    ``<model_dir>/<fold>_buzzes.pkl`` are combined position by position.  A
    position counts for the oracle when the first guess at that char index
    equals the question page, and for the buzzer when the second score is
    larger than the first.  Each position receives one of the labels
    'Only optimal', 'Only buzzer', 'Both' or 'Neither'.

    Positions are divided by the question text length and rounded to one
    decimal; label counts are divided by the number of positions sharing
    that rounded value.

    Returns:
        DataFrame with columns 'Position', 'Buzzing', 'Frequency', 'Model';
        it is also pickled to ``<model_dir>/<fold>_stack.pkl``.
    """

    def label_for(oracle_right, buzzer_fires):
        # Exactly one of the four labels applies to any pair of flags.
        if oracle_right:
            return 'Both' if buzzer_fires else 'Only optimal'
        return 'Only buzzer' if buzzer_fires else 'Neither'

    guesser_root = AbstractGuesser.output_path('qanta.guesser.rnn',
                                               'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(guesser_root, fold, 'char')
    with open(guesses_path, 'rb') as f:
        grouped_guesses = pickle.load(f)
    grouped_guesses = grouped_guesses.groupby('qanta_id')

    buzzes_path = os.path.join(model_dir, '{}_buzzes.pkl'.format(fold))
    with open(buzzes_path, 'rb') as f:
        buzzes = pickle.load(f)

    by_fold = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    lookup = {q.qanta_id: q for q in by_fold[fold]}

    rows = {'Position': [], 'Buzzing': []}
    totals = defaultdict(int)
    for qid, (char_indices, scores) in buzzes.items():
        by_char_index = grouped_guesses.get_group(qid).groupby('char_index')
        top_guess = by_char_index.aggregate(
            lambda x: x.head(1)).to_dict()['guess']
        question = lookup[qid]
        n_chars = len(question.text)
        for i, char_index in enumerate(char_indices):
            oracle_right = top_guess[char_index] == question.page
            buzzer_fires = scores[i][1] > scores[i][0]

            rel_position = np.round(char_index / n_chars, decimals=1)
            totals[rel_position] += 1

            rows['Position'].append(rel_position)
            rows['Buzzing'].append(label_for(oracle_right, buzzer_fires))

    table = pd.DataFrame(rows).groupby(['Position', 'Buzzing']).size()
    table = table.reset_index().rename(columns={0: 'Frequency'})

    def normalise(row):
        # Share of this label among all positions in the same bucket.
        return row['Frequency'] / totals[row['Position']]

    table['Frequency'] = table.apply(normalise, axis=1)
    table['Model'] = pd.Series([model_name] * len(table))

    with open(os.path.join(model_dir, '{}_stack.pkl'.format(fold)), 'wb') as f:
        pickle.dump(table, f)

    return table
Beispiel #14
0
Datei: plot.py Projekt: NPSDC/qb
def stack(model_dir, model_name, fold=BUZZER_DEV_FOLD):
    """Summarise, per relative question position, who buzzes: oracle, buzzer, both, none.

    Inputs are the char-level RnnGuesser guesses for ``fold`` and the buzzes
    pickled at ``<model_dir>/<fold>_buzzes.pkl``.  At each buzz position the
    oracle "buzzes" if the first guess for that char index equals the
    question page, and the buzzer "buzzes" if ``scores[i][1] > scores[i][0]``.
    The position is labelled "Only optimal", "Only buzzer", "Both" or
    "Neither" accordingly.

    The relative position is ``char_index / len(question.text)`` rounded to
    one decimal.  Frequencies are label counts divided by the number of
    positions with the same rounded value.

    Returns:
        DataFrame with columns "Position", "Buzzing", "Frequency" and
        "Model"; the frame is also pickled to
        ``<model_dir>/<fold>_stack.pkl``.
    """
    # Indexed by 2 * oracle + buzzer, with both flags as 0/1.
    labels = ("Neither", "Only buzzer", "Only optimal", "Both")

    rnn_output = AbstractGuesser.output_path(
        "qanta.guesser.rnn", "RnnGuesser", 0, ""
    )
    rnn_guess_file = AbstractGuesser.guess_path(rnn_output, fold, "char")
    with open(rnn_guess_file, "rb") as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby("qanta_id")

    with open(os.path.join(model_dir, "{}_buzzes.pkl".format(fold)), "rb") as f:
        buzzes = pickle.load(f)

    questions = {
        q.qanta_id: q
        for q in QuizBowlDataset(buzzer_train=True).questions_by_fold()[fold]
    }

    observed_positions = []
    observed_labels = []
    per_position = defaultdict(int)
    for qid, (char_indices, scores) in buzzes.items():
        question_guesses = guesses.get_group(qid).groupby("char_index")
        guess_at = question_guesses.aggregate(lambda x: x.head(1))
        guess_at = guess_at.to_dict()["guess"]
        question = questions[qid]
        length = len(question.text)
        for i, char_index in enumerate(char_indices):
            oracle = bool(guess_at[char_index] == question.page)
            buzzer = bool(scores[i][1] > scores[i][0])

            rel_position = np.round(char_index / length, decimals=1)
            per_position[rel_position] += 1

            observed_positions.append(rel_position)
            observed_labels.append(labels[2 * oracle + buzzer])

    result = pd.DataFrame(
        {"Position": observed_positions, "Buzzing": observed_labels}
    )
    result = result.groupby(["Position", "Buzzing"]).size()
    result = result.reset_index().rename(columns={0: "Frequency"})
    # Turn raw counts into the share of positions within each bucket.
    result["Frequency"] = result.apply(
        lambda row: row["Frequency"] / per_position[row["Position"]], axis=1
    )
    result["Model"] = pd.Series([model_name] * len(result))

    out_path = os.path.join(model_dir, "{}_stack.pkl".format(fold))
    with open(out_path, "wb") as f:
        pickle.dump(result, f)

    return result
Beispiel #15
0
import pickle
from qanta.guesser.abstract import AbstractGuesser
from qanta.guesser.dan import DanGuesser
from qanta.util.constants import BUZZER_TRAIN_FOLD, BUZZER_DEV_FOLD

# Locate and load the saved DanGuesser (configuration 0).
guesser_directory = AbstractGuesser.output_path(
    "qanta.guesser.dan", "DanGuesser", 0, ""
)
guesser = DanGuesser.load(guesser_directory)  # type: AbstractGuesser
# Shrink the batch size by a factor of 8.  Floor division keeps it an
# integer (``/=`` would silently turn it into a float under Python 3), and
# the clamp prevents a batch size of zero.
guesser.batch_size = max(1, guesser.batch_size // 8)

# Generate guesses for both buzzer folds and pickle each result next to the
# guesser, at the path given by AbstractGuesser.guess_path.
word_skip = 2
folds = [BUZZER_TRAIN_FOLD, BUZZER_DEV_FOLD]
for fold in folds:
    df = guesser.generate_guesses(1, [fold], word_skip=word_skip)
    output_path = AbstractGuesser.guess_path(guesser_directory, fold)
    with open(output_path, "wb") as f:
        pickle.dump(df, f)
Beispiel #16
-1
def read_data(
        fold,
        output_type='char',
        guesser_module='qanta.guesser.rnn',
        guesser_class='RnnGuesser',
        guesser_config_num=0,
        vector_converter=vector_converter_0):
    """Build (or load from cache) the buzzer dataset for ``fold``.

    If ``dataset_dir.format(fold)`` already exists, its pickled content is
    returned directly.  Otherwise the guesses of the selected guesser are
    loaded, grouped by 'qanta_id', and each group is passed to
    ``process_question(questions, vector_converter, group)`` in a pool of 8
    worker processes.  The result list is pickled to the cache path and
    returned.

    Args:
        fold: name of the fold to read.
        output_type: third argument to ``AbstractGuesser.guess_path``.
        guesser_module: module of the guesser whose output is read.
        guesser_class: class name of that guesser.
        guesser_config_num: configuration number of that guesser.
        vector_converter: callable forwarded to ``process_question``.

    Returns:
        The dataset, whether it came from the cache or was just computed.
    """
    cache_path = dataset_dir.format(fold)
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)

    g_dir = AbstractGuesser.output_path(
        guesser_module, guesser_class, guesser_config_num, '')
    g_path = AbstractGuesser.guess_path(g_dir, fold, output_type)
    with open(g_path, 'rb') as f:
        df = pickle.load(f)
    df_groups = df.groupby('qanta_id')

    questions = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    questions = {q.qanta_id: q for q in questions[fold]}

    worker = partial(process_question, questions, vector_converter)
    pool = Pool(8)
    try:
        dataset = pool.map(worker, df_groups)
    finally:
        # Release the worker processes instead of leaking them.
        pool.close()
        pool.join()

    with open(cache_path, 'wb') as f:
        pickle.dump(dataset, f)

    return dataset
Beispiel #17
-2
def protobowl(model, fold=BUZZER_DEV_FOLD):
    """Apply ``simulate_game`` to each question of ``fold`` and pickle the output.

    ``simulate_game`` receives the char-level RnnGuesser guesses grouped by
    'qanta_id', the buzzes from ``get_buzzes(model, fold)``, the frame from
    ``load_protobowl()`` grouped by 'qid', and the question.  The two
    sequences it returns are accumulated into the 'Possibility' and
    'Outcome' columns of a DataFrame that is pickled to
    ``<model.model_dir>/<fold>_protobowl.pkl``.  Returns None.
    """
    buzzes = get_buzzes(model, fold)

    # eval
    guesser_root = AbstractGuesser.output_path(
        'qanta.guesser.rnn', 'RnnGuesser', 0, '')
    guesses_path = AbstractGuesser.guess_path(guesser_root, fold, 'char')
    with open(guesses_path, 'rb') as f:
        guesses = pickle.load(f)
    guesses = guesses.groupby('qanta_id')

    by_fold = QuizBowlDataset(buzzer_train=True).questions_by_fold()
    fold_questions = by_fold[fold]

    protobowl_records = load_protobowl()
    protobowl_records = protobowl_records.groupby('qid')

    possibility = []
    outcome = []
    for question in tqdm(fold_questions):
        pos, out = simulate_game(guesses, buzzes, protobowl_records, question)
        possibility += pos
        outcome += out

    result_df = pd.DataFrame(
        {'Possibility': possibility, 'Outcome': outcome})

    file_name = '{}_protobowl.pkl'.format(fold)
    with open(os.path.join(model.model_dir, file_name), 'wb') as f:
        pickle.dump(result_df, f)