Example #1
0
File: util.py Project: Agnon1573/qb
def load_quizbowl(folds=c.BUZZER_INPUT_FOLDS) \
                    -> Tuple[Dict[str, int], Dict[str, list]]:
    """Load the answer-option index and the processed guesses for each fold.

    Both results are cached on disk as pickles: the option list at
    ``bc.OPTIONS_DIR`` and the per-fold guesses under ``bc.GUESSES_DIR`` as
    ``<fold>_processed.pickle``. A cache file that already exists is loaded
    instead of being recomputed.

    Args:
        folds: names of the folds to load.

    Returns:
        A pair ``(option2id, guesses_by_fold)``. ``option2id`` maps each
        option to its position in the cached option list. ``guesses_by_fold``
        maps each fold to the list of non-``None`` results produced by
        ``_process_question`` over that fold's guesses grouped by ``qnum``.
    """
    # merge_dfs()
    log.info('Loading data')
    question_db = QuestionDatabase()
    quizbowl_db = QuizBowlDataset(bc.MIN_ANSWERS,
                                  guesser_train=True,
                                  buzzer_train=True)
    all_questions = question_db.all_questions()

    if not os.path.isfile(bc.OPTIONS_DIR):
        log.info('Loading the set of options')
        # NOTE(review): the order of a set is arbitrary, so the ids assigned
        # below are only stable because the list is cached right away.
        id2option = list(set(quizbowl_db.training_data()[1]))
        with open(safe_path(bc.OPTIONS_DIR), 'wb') as outfile:
            pickle.dump(id2option, outfile)
    else:
        with open(safe_path(bc.OPTIONS_DIR), 'rb') as infile:
            id2option = pickle.load(infile)
    option2id = {o: i for i, o in enumerate(id2option)}
    log.info('Number of options {0}'.format(len(id2option)))

    guesses_by_fold = dict()
    for fold in folds:
        save_dir = '%s_processed.pickle' % (os.path.join(bc.GUESSES_DIR, fold))
        if os.path.isfile(save_dir):
            # Cached result from an earlier run: reuse it as is.
            with open(safe_path(save_dir), 'rb') as infile:
                guesses_by_fold[fold] = pickle.load(infile)
            log.info('Loading {0} guesses'.format(fold))
            continue

        log.info('Processing {0} guesses'.format(fold))
        guesses = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[fold])

        worker = partial(_process_question, option2id, all_questions)
        processed = _multiprocess(worker,
                                  guesses.groupby('qnum'),
                                  info='df data',
                                  multi=True)
        # Workers signal "skip this question" by returning None.
        processed = [x for x in processed if x is not None]
        guesses_by_fold[fold] = processed
        log.info('{0} questions kept for fold {1}'.format(
            len(processed), fold))

        with open(safe_path(save_dir), 'wb') as outfile:
            pickle.dump(processed, outfile)

        log.info('Processed {0} guesses saved to {1}'.format(fold, save_dir))

    return option2id, guesses_by_fold
Example #2
0
 def load_qbml(self, dir, pkl_dir):
     """Gather processed bonus questions from ``*.qbml`` files and pickle them.

     Every path matching ``dir + '*.qbml'`` is parsed as XML. For each
     ``QUESTION`` element whose ``KIND`` attribute is ``BONUS``, the pair
     (``ID`` attribute, title-cased first child) is handed to
     ``self._process_question``; results that are ``None`` are dropped.

     The accumulated list is pickled to ``pkl_dir`` and also returned.
     """
     collected = []
     for path in tqdm(glob.glob(dir + '*.qbml')):
         with open(path) as handle:
             markup = handle.read()
         soup = BeautifulSoup(markup, 'xml')

         raw_bonuses = []
         for node in soup.find_all('QUESTION'):
             if node.attrs['KIND'] != 'BONUS':
                 continue
             raw_bonuses.append(
                 (node.attrs['ID'], next(node.children).title()))

         processed = _multiprocess(self._process_question,
                                   raw_bonuses,
                                   progress=False)
         collected.extend([item for item in processed if item is not None])

     with open(pkl_dir, 'wb') as sink:
         pickle.dump(collected, sink)
     return collected
Example #3
0
def buzzer2vwexpo(guesses_df: pd.DataFrame,
                  buzzes: Dict[int, List[List[float]]], fold: str) -> None:
    """Given buzzing positions, generate vw_pred, vw_meta, buzz and final files.

    TODO: Will be deprecated after VW stuff is removed from the pipeline.

    Args:
        guesses_df: pd.DataFrame of guesses; it is grouped by ``qnum``.
        buzzes: dictionary of qnum -> buzzing position.
        fold: string indicating the data fold; it is substituted into the
            four output path templates.

    The four files are always created. If no question yields any rows, the
    buzz and final files contain only their header line and the two VW files
    are empty.
    """
    warnings.warn(
        "buzzer2vwexpo will be deprecated after VW stuff is completely removed from the pipeline",
        DeprecationWarning)

    inputs = guesses_df.groupby('qnum')
    worker = partial(_buzzer2vwexpo, buzzes)
    result = _multiprocess(worker, inputs, info='buzzer2vwexpo')
    result = [x for x in result if x is not None]
    if result:
        # Transpose the per-question 4-tuples into four parallel lists.
        buzzf, predf, metaf, finalf = map(list, zip(*result))
    else:
        # zip(*[]) yields nothing, so unpacking it into four names would
        # raise ValueError; fall back to empty outputs instead.
        buzzf, predf, metaf, finalf = [], [], [], []

    with codecs.open(safe_path(c.PRED_TARGET.format(fold)), 'w', 'utf-8') as pred_file, \
         codecs.open(safe_path(c.META_TARGET.format(fold)), 'w', 'utf-8') as meta_file, \
         codecs.open(safe_path(c.EXPO_BUZZ.format(fold)), 'w', 'utf-8') as buzz_file, \
         codecs.open(safe_path(c.EXPO_FINAL.format(fold)), 'w', 'utf-8') as final_file:

        buzz_file.write('question|sentence|word|page|evidence|final|weight\n')
        final_file.write('question,answer\n')

        log.info('\n\n[buzzer2vwexpo] writing to files')

        # One pipe-separated slot per column of the buzz header above.
        buzz_template = '|'.join(['{}'] * 7)
        buzz_out = '\n'.join(
            buzz_template.format(*r) for r in itertools.chain(*buzzf))
        buzz_file.write(buzz_out)
        log.info('buzz file written')

        final_out = '\n'.join('{0},{1}'.format(*r)
                              for r in itertools.chain(*finalf))
        final_file.write(final_out)
        log.info('final file written')

        pred_out = '\n'.join('{0} {1}_{2}_{3}'.format(*r)
                             for r in itertools.chain(*predf))
        pred_file.write(pred_out)
        log.info('vw_pred file written')

        meta_out = '\n'.join('{0} {1} {2} {3}'.format(*r)
                             for r in itertools.chain(*metaf))
        meta_file.write(meta_out)
        log.info('vw_meta file written')
Example #4
0
def main(folds, model_name):
    """Compute per-fold performance statistics and hand them to ``report``.

    For every fold this loads the guesses and the pickled buzzes stored at
    ``bc.BUZZES_DIR.format(fold, model_name)``, derives the top guesses per
    question, and then runs the histogram and protobowl steps. The collected
    ``variables`` (feature -> fold -> value) are converted to plain dicts
    before being passed to ``report``.
    """
    question_by_id = QuestionDatabase().all_questions()
    answers = {qid: q.page for qid, q in question_by_id.items()}
    question_texts = {qid: q.text for qid, q in question_by_id.items()}
    protobowl_ids = {
        qid: q.protobowl
        for qid, q in question_by_id.items() if q.protobowl != ""
    }
    protobowl_df = load_protobowl().groupby("qid")

    save_dir = "output/summary/new_performance/"
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # feature -> fold -> value
    variables = defaultdict(defaultdict)
    for fold in folds:
        questions = AbstractGuesser.load_guesses(
            bc.GUESSES_DIR, folds=[fold]).groupby("qnum")

        buzzes_dir = bc.BUZZES_DIR.format(fold, model_name)
        with open(buzzes_dir, "rb") as infile:
            buzzes = pickle.load(infile)
        log.info("Buzzes loaded from {}.".format(buzzes_dir))

        # qnum -> n_guessers * length
        top_guesses = dict(
            _multiprocess(_get_top_guesses,
                          questions,
                          info="Top guesses",
                          multi=True))
        shared = [top_guesses, buzzes, answers, variables, fold, save_dir]

        # get_eop_stats and get_hyper_search take the same arguments but
        # are currently switched off.
        get_his_stats(top_guesses, buzzes, answers, variables, fold, save_dir)

        # get_protobowl takes its arguments as one list.
        get_protobowl(
            [question_texts, protobowl_ids, protobowl_df, questions] + shared)

    plain_variables = {
        feature: dict(by_fold) for feature, by_fold in variables.items()
    }
    report(plain_variables, save_dir, folds)
Example #5
0
 def load_qbml(self, dir, pkl_dir):
     """Build the list of processed bonus questions and cache it as a pickle.

     Files are found with the glob pattern ``dir + '*.qbml'`` and parsed as
     XML. Only ``QUESTION`` elements with ``KIND == 'BONUS'`` are used: each
     one becomes an (``ID``, title-cased first child) pair that is passed to
     ``self._process_question``. ``None`` results are discarded.

     Returns the combined list, which is also written to ``pkl_dir``.
     """
     all_bonuses = []
     qbml_paths = glob.glob(dir + '*.qbml')
     for qbml_path in tqdm(qbml_paths):
         with open(qbml_path) as source:
             document = BeautifulSoup(source.read(), 'xml')

         bonus_nodes = [
             node for node in document.find_all('QUESTION')
             if node.attrs['KIND'] == 'BONUS'
         ]
         candidates = [(node.attrs['ID'], next(node.children).title())
                       for node in bonus_nodes]

         outcomes = _multiprocess(self._process_question,
                                  candidates,
                                  progress=False)
         kept = [outcome for outcome in outcomes if outcome is not None]
         all_bonuses.extend(kept)

     with open(pkl_dir, 'wb') as out:
         pickle.dump(all_bonuses, out)
     return all_bonuses
Example #6
0
    def create_batches(self):
        """Rebuild ``self.batches`` from ``self.dataset``.

        Each dataset item is passed through ``self._process_example``, which
        yields an ``(example, padded_length)`` pair. Examples are grouped by
        padded length, and every group is cut into consecutive slices of at
        most ``self.batch_size`` examples; each slice becomes one ``Batch``.
        """
        self.batches = []
        buckets = defaultdict(list)
        returns = _multiprocess(self._process_example,
                                self.dataset,
                                info="creat batches",
                                multi=False)
        for example, padded_length in returns:
            buckets[padded_length].append(example)

        for examples in buckets.values():
            for start in range(0, len(examples), self.batch_size):
                chunk = examples[start:start + self.batch_size]
                # Transpose the per-example 5-tuples into five parallel fields.
                qids, answers, mask, vecs, results = zip(*chunk)
                self.batches.append(
                    Batch(qids, answers, mask, vecs, results))
Example #7
0
File: dataset.py Project: NPSDC/qb
 def load_qbml(self, dir, pkl_dir):
     """Parse ``*.qbml`` files into processed bonus questions and pickle them.

     Each path matched by ``dir + "*.qbml"`` is read and parsed as XML.
     ``QUESTION`` elements whose ``KIND`` is ``BONUS`` are reduced to
     (``ID``, title-cased first child) pairs and processed with
     ``self._process_question``; ``None`` results are skipped.

     The resulting list is dumped to ``pkl_dir`` and returned.
     """

     def bonus_pairs(soup):
         # (ID, title-cased first child) for every BONUS question in a file.
         return [(tag.attrs["ID"], next(tag.children).title())
                 for tag in soup.find_all("QUESTION")
                 if tag.attrs["KIND"] == "BONUS"]

     gathered = []
     for filename in tqdm(glob.glob(dir + "*.qbml")):
         with open(filename) as fh:
             parsed = BeautifulSoup(fh.read(), "xml")
         results = _multiprocess(self._process_question,
                                 bonus_pairs(parsed),
                                 progress=False)
         gathered += [res for res in results if res is not None]

     with open(pkl_dir, "wb") as fh:
         pickle.dump(gathered, fh)
     return gathered
Example #8
0
def get_eop_stats(top_guesses, buzzes, answers, variables, fold, save_dir):
    """Aggregate the end-of-pipeline statistics over all questions of a fold.

    Args:
        top_guesses: mapping whose items are passed one by one to
            ``_get_eop_stats``.
        buzzes: bound as the first argument of ``_get_eop_stats``.
        answers: bound as the second argument of ``_get_eop_stats``.
        variables: optional feature -> fold -> value mapping; when not
            ``None`` the result is stored at ``variables["eop_stats"][fold]``.
        fold: name of the fold, used for logging and as the storage key.
        save_dir: not used by this function.

    Returns:
        A mapping from stat key to its aggregate. Keys in ``EOP_STAT_KEYS_0``
        map to the mean of the collected values (0 when none were collected).
        Keys in ``EOP_STAT_KEYS_1`` map to a dict guesser -> number of
        questions whose value equals that guesser's index in ``GUESSERS``.
        Per-question values of -1 are ignored.
    """
    log.info("[{}] End-of-pipelin reporting".format(fold))

    worker = partial(_get_eop_stats, buzzes, answers)
    eop_stats = _multiprocess(worker,
                              top_guesses.items(),
                              info="End-of-pipeline stats",
                              multi=True)

    # qnum -> key -> int
    eop_stats = {k: v for k, v in eop_stats}
    # key -> collected values. The factory is `list` rather than a lambda so
    # that the returned mapping can be pickled.
    _eop_stats = defaultdict(list)

    for stat in eop_stats.values():
        for key in EOP_STAT_KEYS_0 + EOP_STAT_KEYS_1:
            if stat[key] != -1:
                _eop_stats[key].append(stat[key])

    for key in EOP_STAT_KEYS_0:
        values = _eop_stats[key]
        _eop_stats[key] = sum(values) / len(values) if values else 0

    for key in EOP_STAT_KEYS_1:
        values = _eop_stats[key]
        _eop_stats[key] = {
            guesser: values.count(i)
            for i, guesser in enumerate(GUESSERS)
        }

    if variables is not None:
        variables["eop_stats"][fold] = _eop_stats

    return _eop_stats
Example #9
0
def main(folds, model_name):
    """Run the histogram and protobowl evaluation per fold, then report.

    Per fold: load the guesses, unpickle the buzzes found at
    ``bc.BUZZES_DIR.format(fold, model_name)``, compute the top guesses per
    question and call ``get_his_stats`` and ``get_protobowl``. Results
    accumulate in ``variables`` (feature -> fold -> value), which is turned
    into plain dicts and passed to ``report`` with the output directory.
    """
    all_questions = QuestionDatabase().all_questions()
    answers, question_texts, protobowl_ids = {}, {}, {}
    for qid, question in all_questions.items():
        answers[qid] = question.page
        question_texts[qid] = question.text
        if question.protobowl != '':
            protobowl_ids[qid] = question.protobowl
    protobowl_df = load_protobowl().groupby('qid')

    save_dir = 'output/summary/new_performance/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # feature -> fold -> value
    variables = defaultdict(defaultdict)
    for fold in folds:
        guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR,
                                                  folds=[fold])
        questions = guesses_df.groupby('qnum')

        buzzes_dir = bc.BUZZES_DIR.format(fold, model_name)
        with open(buzzes_dir, 'rb') as infile:
            buzzes = pickle.load(infile)
        log.info('Buzzes loaded from {}.'.format(buzzes_dir))

        # qnum -> n_guessers * length
        guess_pairs = _multiprocess(_get_top_guesses,
                                    questions,
                                    info='Top guesses',
                                    multi=True)
        top_guesses = dict(guess_pairs)
        inputs = [top_guesses, buzzes, answers, variables, fold, save_dir]

        # get_eop_stats(*inputs) and get_hyper_search(*inputs) are disabled.
        get_his_stats(*inputs)

        get_protobowl(
            [question_texts, protobowl_ids, protobowl_df, questions] + inputs)

    report({key: dict(value) for key, value in variables.items()},
           save_dir,
           folds)
Example #10
0
def get_eop_stats(top_guesses, buzzes, answers, variables, fold, save_dir):
    """Aggregate the end-of-pipeline statistics over all questions of a fold.

    Args:
        top_guesses: mapping whose items are passed one by one to
            ``_get_eop_stats``.
        buzzes: bound as the first argument of ``_get_eop_stats``.
        answers: bound as the second argument of ``_get_eop_stats``.
        variables: optional feature -> fold -> value mapping; when not
            ``None`` the result is stored at ``variables['eop_stats'][fold]``.
        fold: name of the fold, used for logging and as the storage key.
        save_dir: not used by this function.

    Returns:
        A mapping from stat key to its aggregate. Keys in ``EOP_STAT_KEYS_0``
        map to the mean of the collected values (0 when none were collected).
        Keys in ``EOP_STAT_KEYS_1`` map to a dict guesser -> number of
        questions whose value equals that guesser's index in ``GUESSERS``.
        Per-question values of -1 are ignored.
    """
    log.info('[{}] End-of-pipelin reporting'.format(fold))

    inputs = top_guesses.items()
    worker = partial(_get_eop_stats, buzzes, answers)
    eop_stats = _multiprocess(worker, inputs, info='End-of-pipeline stats',
            multi=True)

    # qnum -> key -> int
    eop_stats = {k: v for k, v in eop_stats}
    # key -> collected values. A named factory (not a lambda) keeps the
    # returned mapping picklable.
    _eop_stats = defaultdict(list)

    stat_keys = EOP_STAT_KEYS_0 + EOP_STAT_KEYS_1
    for stat in eop_stats.values():
        for key in stat_keys:
            if stat[key] != -1:
                _eop_stats[key].append(stat[key])

    # Mean-valued stats.
    for key in EOP_STAT_KEYS_0:
        values = _eop_stats[key]
        _eop_stats[key] = sum(values) / len(values) if values else 0

    # Stats holding a guesser index: count occurrences per guesser.
    for key in EOP_STAT_KEYS_1:
        values = _eop_stats[key]
        per_guesser = dict()
        for i, guesser in enumerate(GUESSERS):
            per_guesser[guesser] = values.count(i)
        _eop_stats[key] = per_guesser

    if variables is not None:
        variables['eop_stats'][fold] = _eop_stats

    return _eop_stats
Example #11
0
def report_ultimate():
    """Print protobowl statistics of ``ultimate_buzzer`` on the dev fold.

    The buzzes come from ``ultimate_buzzer`` run over the dev-fold guesses
    returned by ``load_quizbowl``. ``get_protobowl`` is then evaluated once
    per threshold, each time on the protobowl rows whose ``user_answers``
    exceeds that threshold. Each result is printed, followed by the list of
    their ``'reward'`` entries.
    """
    question_by_id = QuestionDatabase().all_questions()
    answers = {qid: q.page for qid, q in question_by_id.items()}
    question_texts = {qid: q.text for qid, q in question_by_id.items()}
    protobowl_ids = {
        qid: q.protobowl
        for qid, q in question_by_id.items() if q.protobowl != ''
    }
    # The second element (user count) is not needed here.
    protobowl_df, _ = load_protobowl()

    questions = AbstractGuesser.load_guesses(
        bc.GUESSES_DIR, folds=[c.BUZZER_DEV_FOLD]).groupby('qnum')
    top_guesses = dict(
        _multiprocess(_get_top_guesses,
                      questions,
                      info='Top guesses',
                      multi=True))

    option2id, all_guesses = load_quizbowl()
    test_iter = QuestionIterator(all_guesses[c.BUZZER_DEV_FOLD],
                                 option2id,
                                 batch_size=128)
    buzzes = ultimate_buzzer(test_iter)

    # Trailing arguments shared by every get_protobowl call below.
    shared = [
        top_guesses, buzzes, answers, None, c.BUZZER_DEV_FOLD,
        'output/summary/new_performance/'
    ]
    threshold_stats = []
    for threshold in (1, 10, 50, 100, 500, 1000, 2000):
        frequent = protobowl_df[protobowl_df.user_answers > threshold]
        pstats = get_protobowl([
            question_texts, protobowl_ids,
            frequent.groupby('qid'), questions
        ] + shared)
        threshold_stats.append(pstats)
        print('ultimate', threshold, pstats)
    rewards = [stats['reward'] for stats in threshold_stats]
    print('ultimate', rewards)
Example #12
0
def report(buzzes_dir):
    """Evaluate pickled buzzes against protobowl data on the dev fold.

    The buzzes are unpickled from ``buzzes_dir``. ``get_protobowl`` is run
    once per threshold on the protobowl rows whose ``user_answers`` exceeds
    that threshold. Each result is printed, the full list is pickled to
    ``buzzes_dir + '.pstats'``, and the ``'reward'`` entries are printed.
    """
    question_by_id = QuestionDatabase().all_questions()
    answers = {}
    question_texts = {}
    protobowl_ids = {}
    for qid, question in question_by_id.items():
        answers[qid] = question.page
        question_texts[qid] = question.text
        if question.protobowl != '':
            protobowl_ids[qid] = question.protobowl

    # The second element (user count) is not needed here.
    protobowl_df, _ = load_protobowl()
    dev_guesses = AbstractGuesser.load_guesses(bc.GUESSES_DIR,
                                               folds=[c.BUZZER_DEV_FOLD])
    questions = dev_guesses.groupby('qnum')
    guess_pairs = _multiprocess(_get_top_guesses,
                                questions,
                                info='Top guesses',
                                multi=True)
    top_guesses = dict(guess_pairs)

    with open(buzzes_dir, 'rb') as infile:
        buzzes = pickle.load(infile)

    save_dir = 'output/summary/new_performance/'
    tail = [top_guesses, buzzes, answers, None, c.BUZZER_DEV_FOLD, save_dir]
    threshold_stats = []
    for threshold in (1, 10, 50, 100, 500, 1000, 2000):
        above = protobowl_df[protobowl_df.user_answers > threshold]
        head = [question_texts, protobowl_ids, above.groupby('qid'),
                questions]
        pstats = get_protobowl(head + tail)
        threshold_stats.append(pstats)
        print(threshold, pstats)

    with open(buzzes_dir + '.pstats', 'wb') as f:
        pickle.dump(threshold_stats, f)
    print([stats['reward'] for stats in threshold_stats])
Example #13
0
def get_his_stats(top_guesses, buzzes, answers, variables, fold, save_dir):
    """Compute the histogram statistics of a fold and plot them.

    Args:
        top_guesses: mapping whose items are passed one by one to
            ``_get_his_stats``.
        buzzes: bound as the first argument of ``_get_his_stats``.
        answers: bound as the second argument of ``_get_his_stats``.
        variables: optional feature -> fold -> value mapping. When not
            ``None`` it receives the stats and the paths of the two figures.
            The paths are ``None`` when ``save_dir`` is ``None``.
        fold: name of the fold, used in log messages, titles and file names.
        save_dir: directory in which the two PDF figures are saved. When
            ``None`` the figures are shown with ``plt.show()`` instead.

    Returns:
        A dict mapping every key in ``HISTO_KEYS_0 + HISTO_KEYS_1`` to a
        list with one mean per entry of ``HISTO_RATIOS``. Per-question
        values of -1 are ignored, and a slot with no values has a mean of 0.
    """
    log.info("[{}] Histogram reporting".format(fold))

    worker = partial(_get_his_stats, buzzes, answers)
    his_stats = _multiprocess(worker,
                              top_guesses.items(),
                              info="Histogram stats",
                              multi=True)
    # qnum -> key -> list(int)
    his_stats = {k: v for k, v in his_stats}
    # key -> list(int)
    _his_stats = defaultdict(lambda: [[] for _ in HISTO_RATIOS])
    histo_keys = HISTO_KEYS_0 + HISTO_KEYS_1

    for stats in his_stats.values():
        for key in histo_keys:
            for i, _ in enumerate(HISTO_RATIOS):
                if stats[key][i] != -1:
                    _his_stats[key][i].append(stats[key][i])

    # Collapse every per-ratio list of values into its mean.
    for key in histo_keys:
        _his_stats[key] = [
            sum(s) / len(s) if s else 0 for s in _his_stats[key]
        ]

    _his_stats = dict(_his_stats)

    # Stay None when the figures are shown rather than saved, so that the
    # `variables` bookkeeping at the end never hits an unbound name.
    his_lines_dir = None
    his_stacked_dir = None

    ##### plot lines #####
    _, ax = plt.subplots()
    lines = [
        plt.plot(HISTO_RATIOS, _his_stats[k], LINE_STYLES[k], label=k)[0]
        for k in HISTO_KEYS_0
    ]

    ax.set_xticks(HISTO_RATIOS)
    plt.legend(handles=lines)
    plt.title("{} histogram lines chart".format(fold))
    if save_dir is not None:
        his_lines_dir = os.path.join(save_dir, "his_{}_lines.pdf".format(fold))
        plt.savefig(his_lines_dir, bbox_inches="tight")
    else:
        plt.show()
    plt.close()

    ##### plot stacked area chart #####
    stacked = [
        ("buzz_correct", "c"),
        ("buzz_miss", "y"),
        ("buzz_wrong", "r"),
        ("buzz_impossible", "k"),
        ("wait_wrong", "m"),
        ("wait_correct", "g"),
        ("wait_impossible", "w"),
    ]
    # Empty plots that only provide one labelled legend entry per area.
    for key, color in stacked:
        plt.plot([], [], color=color, alpha=0.5, label=key)

    plt.stackplot(
        list(range(len(HISTO_RATIOS))),
        *[_his_stats[key] for key, _ in stacked],
        colors=[color for _, color in stacked],
        alpha=0.5,
    )
    plt.legend()
    plt.title("{} stacked area chart".format(fold))
    if save_dir is not None:
        his_stacked_dir = os.path.join(save_dir,
                                       "his_{}_stacked.pdf".format(fold))
        plt.savefig(his_stacked_dir, bbox_inches="tight")
    else:
        plt.show()
    plt.close()

    if variables is not None:
        variables["his_stats"][fold] = _his_stats
        variables["his_lines"][fold] = his_lines_dir
        variables["his_stacked"][fold] = his_stacked_dir

    return _his_stats
Example #14
0
def get_his_stats(top_guesses, buzzes, answers, variables, fold, save_dir):
    """Compute the histogram statistics of a fold and plot them.

    Args:
        top_guesses: mapping whose items are passed one by one to
            ``_get_his_stats``.
        buzzes: bound as the first argument of ``_get_his_stats``.
        answers: bound as the second argument of ``_get_his_stats``.
        variables: optional feature -> fold -> value mapping. When not
            ``None`` it receives the stats and the paths of the two figures.
            The paths are ``None`` when ``save_dir`` is ``None``.
        fold: name of the fold, used in log messages, titles and file names.
        save_dir: directory in which the two PDF figures are saved. When
            ``None`` the figures are shown with ``plt.show()`` instead.

    Returns:
        A dict mapping every key in ``HISTO_KEYS_0 + HISTO_KEYS_1`` to a
        list with one mean per entry of ``HISTO_RATIOS``. Per-question
        values of -1 are ignored, and a slot with no values has a mean of 0.
    """
    log.info('[{}] Histogram reporting'.format(fold))

    inputs = top_guesses.items()
    worker = partial(_get_his_stats, buzzes, answers)
    his_stats = _multiprocess(worker, inputs, info='Histogram stats',
            multi=True)
    # qnum -> key -> list(int)
    his_stats = {k: v for k, v in his_stats}
    # key -> list(int)
    _his_stats = defaultdict(lambda: [[] for _ in HISTO_RATIOS])
    all_keys = HISTO_KEYS_0 + HISTO_KEYS_1

    for stats in his_stats.values():
        for key in all_keys:
            for i, _ in enumerate(HISTO_RATIOS):
                if stats[key][i] != -1:
                    _his_stats[key][i].append(stats[key][i])

    # Replace each per-ratio list of values by its mean.
    for key in all_keys:
        for i, _ in enumerate(HISTO_RATIOS):
            s = _his_stats[key][i]
            _his_stats[key][i] = sum(s) / len(s) if s else 0

    _his_stats = dict(_his_stats)

    # Without a save_dir the figures are only shown, so no path exists.
    # Default to None rather than leaving the names unbound for the
    # `variables` bookkeeping below.
    his_lines_dir = None
    his_stacked_dir = None

    ##### plot lines #####
    _, ax = plt.subplots()
    lines = []
    for k in HISTO_KEYS_0:
        line = plt.plot(HISTO_RATIOS, _his_stats[k], LINE_STYLES[k],
                        label=k)[0]
        lines.append(line)

    ax.set_xticks(HISTO_RATIOS)
    plt.legend(handles=lines)
    plt.title('{} histogram lines chart'.format(fold))
    if save_dir is not None:
        his_lines_dir = os.path.join(save_dir, 'his_{}_lines.pdf'.format(fold))
        plt.savefig(his_lines_dir, bbox_inches='tight')
    else:
        plt.show()
    plt.close()

    ##### plot stacked area chart #####
    area_keys = ['buzz_correct', 'buzz_miss', 'buzz_wrong',
                 'buzz_impossible', 'wait_wrong', 'wait_correct',
                 'wait_impossible']
    area_colors = ['c', 'y', 'r', 'k', 'm', 'g', 'w']
    # Empty plots that only provide one labelled legend entry per area.
    for key, color in zip(area_keys, area_colors):
        plt.plot([], [], color=color, alpha=0.5, label=key)

    plt.stackplot(list(range(len(HISTO_RATIOS))),
                  *[_his_stats[key] for key in area_keys],
                  colors=area_colors, alpha=0.5)
    plt.legend()
    plt.title('{} stacked area chart'.format(fold))
    if save_dir is not None:
        his_stacked_dir = os.path.join(save_dir, 'his_{}_stacked.pdf'.format(fold))
        plt.savefig(his_stacked_dir, bbox_inches='tight')
    else:
        plt.show()
    plt.close()

    if variables is not None:
        variables['his_stats'][fold] = _his_stats
        variables['his_lines'][fold] = his_lines_dir
        variables['his_stacked'][fold] = his_stacked_dir

    return _his_stats
Example #15
0
def generate(buzzes,
             answers,
             guesses_df,
             fold,
             checkpoint_dir=None,
             plot_dir=None,
             multiprocessing=True):
    """Produce the end-of-pipeline and histogram reports as text.

    Args:
        buzzes: bound as the first argument of both per-question workers.
        answers: bound as the second argument of both per-question workers.
        guesses_df: guesses; grouped by ``qnum`` to get the top guesses.
        fold: not used by this function.
        checkpoint_dir: when given, path of a pickle receiving the buzzes,
            top guesses, key lists and computed statistics.
        plot_dir: when given, path of a PNG with one line per histogram key.
        multiprocessing: forwarded to ``_multiprocess`` as ``multi``.

    Returns:
        ``(eop_output, his_output)``, the two reports as newline-terminated
        text. Every report line is also printed as it is produced.
    """
    # qnum -> n_guessers * length
    top_guesses = dict(
        _multiprocess(get_top_guesses,
                      guesses_df.groupby('qnum'),
                      info='Top guesses',
                      multi=multiprocessing))

    ############# end-of-pipeline stats #############

    # qnum -> key -> int
    eop_stats = dict(
        _multiprocess(partial(end_of_pipeline, buzzes, answers),
                      top_guesses.items(),
                      info='End-of-pipeline stats',
                      multi=multiprocessing))

    # key -> values collected over all questions (-1 entries are skipped)
    _eop_stats = defaultdict(list)
    eop_keys = EOP_STAT_KEYS_0 + EOP_STAT_KEYS_1
    for stat in eop_stats.values():
        for key in eop_keys:
            value = stat[key]
            if value != -1:
                _eop_stats[key].append(value)

    eop_lines = []
    for key in EOP_STAT_KEYS_0:
        values = _eop_stats[key]
        mean = sum(values) / len(values) if values else 0
        line = "{0} {1:.3f}".format(key, mean)
        eop_lines.append(line)
        print(line)

    for key in EOP_STAT_KEYS_1:
        values = _eop_stats[key]
        line = key
        for idx, guesser in enumerate(GUESSERS):
            line += " {0} {1}".format(guesser, values.count(idx))
        eop_lines.append(line)
        print(line)

    eop_output = ''.join(line + '\n' for line in eop_lines)

    ############# histogram stats #############

    # qnum -> key -> list(int)
    his_stats = dict(
        _multiprocess(partial(histogram, buzzes, answers),
                      top_guesses.items(),
                      info='Histogram stats',
                      multi=multiprocessing))

    # key -> one slot per ratio, first holding values, then their mean
    _his_stats = defaultdict(lambda: [[] for _ in HISTO_RATIOS])
    for per_question in his_stats.values():
        for key in HISTO_KEYS:
            for idx, _ in enumerate(HISTO_RATIOS):
                value = per_question[key][idx]
                if value != -1:
                    _his_stats[key][idx].append(value)

    for key in HISTO_KEYS:
        for idx, _ in enumerate(HISTO_RATIOS):
            bucket = _his_stats[key][idx]
            _his_stats[key][idx] = sum(bucket) / len(bucket) if bucket else 0

    _his_stats = dict(_his_stats)

    his_lines = []
    for idx, ratio in enumerate(HISTO_RATIOS):
        line = "{}:".format(ratio)
        for key in HISTO_KEYS:
            line += "  {0} {1:.2f}".format(key, _his_stats[key][idx])
        his_lines.append(line)
        print(line)
    his_output = ''.join(line + '\n' for line in his_lines)

    if plot_dir is not None:
        handles = [
            plt.plot(HISTO_RATIOS, means, LINE_STYLES[key], label=key)[0]
            for key, means in _his_stats.items()
        ]
        plt.legend(handles=handles)
        plt.savefig(plot_dir, dpi=200, format='png')
        plt.clf()

    if checkpoint_dir is not None:
        checkpoint = dict()
        checkpoint['buzzes'] = buzzes
        checkpoint['top_guesses'] = top_guesses
        checkpoint['eop_keys'] = eop_keys
        checkpoint['his_keys'] = HISTO_KEYS
        checkpoint['eop_stats'] = eop_stats
        checkpoint['his_stats'] = his_stats
        checkpoint['_his_stats'] = _his_stats
        with open(checkpoint_dir, 'wb') as outfile:
            pickle.dump(checkpoint, outfile)

    return eop_output, his_output
Example #16
0
def get_his_stats(top_guesses, buzzes, answers, variables, fold, save_dir):
    """Compute the histogram statistics of a fold and plot them.

    Args:
        top_guesses: mapping whose items are passed one by one to
            ``_get_his_stats``.
        buzzes: bound as the first argument of ``_get_his_stats``.
        answers: bound as the second argument of ``_get_his_stats``.
        variables: optional feature -> fold -> value mapping. When not
            ``None`` it receives the stats and the paths of the two figures.
            The paths are ``None`` when ``save_dir`` is ``None``.
        fold: name of the fold, used in log messages, titles and file names.
        save_dir: directory in which the two PDF figures are saved. When
            ``None`` the figures are shown with ``plt.show()`` instead.

    Returns:
        A dict mapping every key in ``HISTO_KEYS_0 + HISTO_KEYS_1`` to a
        list with one mean per entry of ``HISTO_RATIOS``. Per-question
        values of -1 are ignored, and a slot with no values has a mean of 0.
    """
    log.info('[{}] Histogram reporting'.format(fold))

    worker = partial(_get_his_stats, buzzes, answers)
    his_stats = _multiprocess(worker,
                              top_guesses.items(),
                              info='Histogram stats',
                              multi=True)
    # qnum -> key -> list(int)
    his_stats = {k: v for k, v in his_stats}
    # key -> list(int)
    _his_stats = defaultdict(lambda: [[] for _ in HISTO_RATIOS])
    histo_keys = HISTO_KEYS_0 + HISTO_KEYS_1

    for stats in his_stats.values():
        for key in histo_keys:
            for i, _ in enumerate(HISTO_RATIOS):
                if stats[key][i] != -1:
                    _his_stats[key][i].append(stats[key][i])

    # Collapse every per-ratio list of values into its mean.
    for key in histo_keys:
        _his_stats[key] = [
            sum(s) / len(s) if s else 0 for s in _his_stats[key]
        ]

    _his_stats = dict(_his_stats)

    # Both paths default to None so the `variables` bookkeeping at the end
    # works even when the figures are shown instead of saved.
    his_lines_dir = None
    his_stacked_dir = None

    ##### plot lines #####
    _, ax = plt.subplots()
    lines = [
        plt.plot(HISTO_RATIOS, _his_stats[k], LINE_STYLES[k], label=k)[0]
        for k in HISTO_KEYS_0
    ]

    ax.set_xticks(HISTO_RATIOS)
    plt.legend(handles=lines)
    plt.title('{} histogram lines chart'.format(fold))
    if save_dir is not None:
        his_lines_dir = os.path.join(save_dir, 'his_{}_lines.pdf'.format(fold))
        plt.savefig(his_lines_dir, bbox_inches='tight')
    else:
        plt.show()
    plt.close()

    ##### plot stacked area chart #####
    stacked = [
        ('buzz_correct', 'c'),
        ('buzz_miss', 'y'),
        ('buzz_wrong', 'r'),
        ('buzz_impossible', 'k'),
        ('wait_wrong', 'm'),
        ('wait_correct', 'g'),
        ('wait_impossible', 'w'),
    ]
    # Empty plots that only provide one labelled legend entry per area.
    for key, color in stacked:
        plt.plot([], [], color=color, alpha=0.5, label=key)

    plt.stackplot(list(range(len(HISTO_RATIOS))),
                  *[_his_stats[key] for key, _ in stacked],
                  colors=[color for _, color in stacked],
                  alpha=0.5)
    plt.legend()
    plt.title('{} stacked area chart'.format(fold))
    if save_dir is not None:
        his_stacked_dir = os.path.join(save_dir,
                                       'his_{}_stacked.pdf'.format(fold))
        plt.savefig(his_stacked_dir, bbox_inches='tight')
    else:
        plt.show()
    plt.close()

    if variables is not None:
        variables['his_stats'][fold] = _his_stats
        variables['his_lines'][fold] = his_lines_dir
        variables['his_stacked'][fold] = his_stacked_dir

    return _his_stats