def audit_report(df, output):
    """Dump the audit frame to CSV and render the audit-regressor report.

    :param df: DataFrame with at least 'feature' and 'value' columns; its
        first 25 rows are plotted and its first 100 rows are embedded as text.
    :param output: destination of the CSV dump only. The rendered report is
        written to ``safe_path(VW_AUDIT_REGRESSOR_REPORT)``, not to this path.
    """
    plot_path = '/tmp/feature_importance.png'

    df.to_csv(output)

    # Bar chart of the leading rows, saved for inclusion in the report.
    leading_rows = df.head(25)
    leading_rows.plot.bar('feature', 'value')
    plt.title('Feature Magnitudes')
    plt.xlabel('Magnitude')
    plt.savefig(plot_path, dpi=200, format='png')

    # Widen the pandas display limits before stringifying the table. These
    # are global options and stay set after this function returns.
    display_options = (
        ('display.width', 1000),
        ('display.max_rows', 100),
        ('display.max_colwidth', 30),
    )
    for option_name, option_value in display_options:
        pd.set_option(option_name, option_value)
    top_features = str(df.head(100))

    template_variables = {
        'feature_importance_plot': plot_path,
        'top_features': top_features,
    }
    report = ReportGenerator(template_variables, 'audit_regressor.md')
    report_path = safe_path(VW_AUDIT_REGRESSOR_REPORT)
    report.create(report_path)

    # Reset and release the current matplotlib figure.
    plt.clf()
    plt.cla()
    plt.close()
def report(variables, save_dir, folds):
    """Render the 'new_performance.md' template to a PDF under ``save_dir``.

    :param variables: template variables; they override the empty defaults.
    :param save_dir: directory in which the PDF is written.
    :param folds: fold names. Exactly one fold yields ``report_<fold>.pdf``,
        anything else yields ``report_all.pdf``.
    """
    # Every optional key starts as an empty dict so jinja skips features
    # the caller did not provide.
    optional_keys = (
        'his_lines',
        'his_stacked',
        'rush_late_plot',
        'choice_plot',
        'hype_configs',
        'protobowl_plot',
        'protobowl_stats',
    )
    template_vars = {}
    for key in optional_keys:
        template_vars[key] = {}
    template_vars.update(variables)

    single_fold = len(folds) == 1
    filename = 'report_{}.pdf'.format(folds[0]) if single_fold else 'report_all.pdf'
    output = os.path.join(save_dir, filename)

    ReportGenerator('new_performance.md').create(template_vars, output)
def report(variables, save_dir, folds):
    """Generate the performance report PDF from 'new_performance.md'.

    The output file is ``report_<fold>.pdf`` when ``folds`` holds exactly one
    fold and ``report_all.pdf`` otherwise; both are placed in ``save_dir``.
    Entries in ``variables`` take precedence over the empty defaults.
    """
    # Pre-populate the optional template keys with empty dicts so that jinja
    # skips any feature the caller did not supply.
    merged = {
        name: {}
        for name in (
            'his_lines',
            'his_stacked',
            'rush_late_plot',
            'choice_plot',
            'hype_configs',
            'protobowl_plot',
            'protobowl_stats',
        )
    }
    merged.update(variables)

    pdf_name = 'report_all.pdf'
    if len(folds) == 1:
        pdf_name = 'report_{}.pdf'.format(folds[0])
    output = os.path.join(save_dir, pdf_name)

    generator = ReportGenerator('new_performance.md')
    generator.create(merged, output)
def report(variables, save_dir, folds):
    """Write the 'new_performance.md' report as a PDF into ``save_dir``.

    ``variables`` is layered over a set of empty placeholder entries, and the
    file name depends on whether a single fold or several were requested.
    """
    placeholder_keys = [
        "his_lines",
        "his_stacked",
        "rush_late_plot",
        "choice_plot",
        "hype_configs",
        "protobowl_plot",
        "protobowl_stats",
    ]
    # Empty dict placeholders let jinja skip non-existent features.
    context = {}
    for placeholder in placeholder_keys:
        context[placeholder] = {}
    context.update(variables)

    if len(folds) != 1:
        output = os.path.join(save_dir, "report_all.pdf")
    else:
        output = os.path.join(save_dir, "report_{}.pdf".format(folds[0]))

    ReportGenerator("new_performance.md").create(context, output)
def n_guesser_report(report_path, fold, n_samples=10):
    """Build the 'compare_guessers.md' report for one fold.

    For every enabled guesser the top-scoring guess at each
    (qnum, sentence, token) position is kept. For each question the number of
    guessers that are correct is then measured at five sentence positions
    (start, 25%, 50%, 75%, end) and plotted as stacked bars, one subplot per
    position.

    :param report_path: destination of the rendered report (passed through
        ``safe_path``).
    :param fold: fold whose questions and guesses are analysed; also used in
        the name of the plot written under ``/tmp``.
    :param n_samples: forwarded to ``sample_n_guesser_correct_questions``.
    """
    qdb = QuestionDatabase()
    question_lookup = qdb.all_questions()
    questions = [q for q in question_lookup.values() if q.fold == fold]

    # Load the guesses of every enabled guesser for this fold.
    folds = [fold]
    guess_dataframes = []
    for g_spec in AbstractGuesser.list_enabled_guessers():
        path = AbstractGuesser.output_path(
            g_spec.guesser_module, g_spec.guesser_class, '')
        guess_dataframes.append(AbstractGuesser.load_guesses(path, folds=folds))
    df = pd.concat(guess_dataframes)  # type: pd.DataFrame
    n_guessers = len(set(df['guesser'].unique()))

    # Keep only the highest-scoring guess per guesser and position.
    top_guesses = []
    for _, group in df.groupby(['guesser', 'qnum', 'sentence', 'token']):
        top_guesses.append(group.sort_values('score', ascending=False).iloc[0])
    top_df = pd.DataFrame.from_records(top_guesses)

    # (qnum, sentence, token) -> frame holding each guesser's top guess.
    guess_lookup = {}
    for name, group in top_df.groupby(['qnum', 'sentence', 'token']):
        guess_lookup[name] = group

    performance = {}
    question_positions = {}
    n_correct_samples = defaultdict(list)
    for q in questions:
        page = q.page
        sentence_positions = [(sent, token) for sent, token, _ in q.partials()]
        # Since partials() passes word_skip=-1 each entry is guaranteed to be
        # a sentence
        n_sentences = len(sentence_positions)
        question_positions[q.qnum] = {
            'start': 1,
            'p_25': max(1, round(n_sentences * .25)),
            'p_50': max(1, round(n_sentences * .5)),
            'p_75': max(1, round(n_sentences * .75)),
            'end': n_sentences
        }
        for sent, token in sentence_positions:
            key = (q.qnum, sent, token)
            if key in guess_lookup:
                position_guesses = guess_lookup[key]
                is_correct = position_guesses.guess == page
                n_correct = is_correct.sum()
                n_correct_samples[n_correct].append(key)
                if n_correct == 0:
                    correct_guessers = 'None'
                elif n_correct == n_guessers:
                    correct_guessers = 'All'
                else:
                    correct_guessers = '/'.join(
                        sorted(position_guesses[is_correct].guesser.values))
            else:
                n_correct = 0
                correct_guessers = 'None'
            performance[key] = (n_correct, correct_guessers)

    # One row per (position, question), grouped by position in the order
    # start, p_25, p_50, p_75, end.
    # NOTE(review): the lookup uses token 0, which assumes partials() yields
    # token == 0 for every sentence -- confirm against partials().
    position_names = ['start', 'p_25', 'p_50', 'p_75', 'end']
    all_accuracies = []
    for position_name in position_names:
        for q in questions:
            sentence = question_positions[q.qnum][position_name]
            all_accuracies.append(
                (*performance[(q.qnum, sentence, 0)], position_name))

    perf_df = pd.DataFrame.from_records(
        all_accuracies,
        columns=['n_guessers_correct', 'correct_guessers', 'position'])
    perf_df['count'] = 1
    n_questions = len(questions)
    aggregate_df = (perf_df.groupby(
        ['position', 'n_guessers_correct', 'correct_guessers']).count()
                    / n_questions).reset_index()

    fig, ax = plt.subplots(figsize=(12, 8), nrows=2, ncols=3,
                           sharey=True, sharex=True)
    # Grid cell of each position; cell (0, 2) is left empty for the legend.
    subplot_index = {
        'start': (0, 0),
        'p_25': (0, 1),
        'p_50': (1, 0),
        'p_75': (1, 1),
        'end': (1, 2)
    }
    position_labels = {
        'start': 'Start',
        'p_25': '25%',
        'p_50': '50%',
        'p_75': '75%',
        'end': '100%'
    }
    ax[(0, 2)].axis('off')

    for p, key in subplot_index.items():
        data = aggregate_df[aggregate_df.position == p].pivot(
            index='n_guessers_correct',
            columns='correct_guessers').fillna(0)['count']
        plot_ax = ax[key]
        data.plot.bar(stacked=True, ax=plot_ax,
                      title='Question Position: {}'.format(position_labels[p]))
        # The figure-level legend below uses the handles of the last subplot
        # drawn; the per-axes legends are hidden.
        handles, labels = plot_ax.get_legend_handles_labels()
        ax_legend = plot_ax.legend()
        ax_legend.set_visible(False)
        plot_ax.set(xlabel='Number of Correct Guessers', ylabel='Accuracy')

    # Shared axes hide inner tick labels; re-enable them on every subplot.
    for plot_ax in ax.flatten():
        for tk in plot_ax.get_yticklabels():
            tk.set_visible(True)
        for tk in plot_ax.get_xticklabels():
            tk.set_rotation('horizontal')
    fig.legend(handles, labels, bbox_to_anchor=(.8, .75))
    fig.suptitle('Accuracy Breakdown by Guesser')
    accuracy_by_n_correct_plot_path = '/tmp/accuracy_by_n_correct_{}.png'.format(
        fold)
    fig.savefig(accuracy_by_n_correct_plot_path, dpi=200)
    # pyplot keeps every figure alive until it is closed; release it now that
    # it has been written to disk.
    plt.close(fig)

    sampled_questions_by_correct = sample_n_guesser_correct_questions(
        question_lookup, guess_lookup, n_correct_samples, n_samples=n_samples)

    report = ReportGenerator('compare_guessers.md')
    report.create(
        {
            'dev_accuracy_by_n_correct_plot': accuracy_by_n_correct_plot_path,
            'sampled_questions_by_correct': sampled_questions_by_correct
        },
        safe_path(report_path))
def create_report(self, directory: str):
    """Render the guesser report (``guesser.md``) for the guesser stored in ``directory``.

    Reads ``guesser_params.pickle`` and the dev-fold guesses from
    ``directory``, computes dev accuracy/recall and answer-coverage
    statistics, writes several plots under ``/tmp``, dumps a summary to
    ``guesser_report.pickle`` and renders ``guesser_report.pdf``; both
    outputs are placed in ``directory``.

    :param directory: directory holding this guesser's saved parameters and
        guesses; also receives the generated report files.
    """
    # NOTE(review): pickle.load can execute arbitrary code; `directory` must
    # only ever point at trusted, locally produced output.
    with open(os.path.join(directory, 'guesser_params.pickle'), 'rb') as f:
        params = pickle.load(f)
    dev_guesses = AbstractGuesser.load_guesses(directory, folds=[c.GUESSER_DEV_FOLD])

    qdb = QuestionDatabase()
    questions = qdb.all_questions()

    # Compute recall and accuracy
    dev_recall = compute_fold_recall(dev_guesses, questions)
    dev_question_lookup = {
        qnum: q for qnum, q in questions.items()
        if q.fold == c.GUESSER_DEV_FOLD
    }
    dev_recall_stats = compute_recall_at_positions(dev_recall)
    dev_summary_accuracy = compute_summary_accuracy(
        dev_question_lookup, dev_recall_stats)
    dev_summary_recall = compute_summary_recall(
        dev_question_lookup, dev_recall_stats)

    accuracy_plot('/tmp/dev_accuracy.png', dev_summary_accuracy, 'Guesser Dev')
    recall_plot('/tmp/dev_recall.png', dev_question_lookup, dev_summary_recall,
                'Guesser Dev')

    # Obtain metrics on number of answerable questions based on the dataset
    # requested. The question and answer tables are each loaded once and
    # reused rather than queried again.
    answers_by_qnum = qdb.all_answers()
    all_answers = set(answers_by_qnum.values())
    answer_lookup = dict(answers_by_qnum.items())
    all_questions = list(questions.values())

    dataset = self.qb_dataset()
    training_data = dataset.training_data()
    # Answers present in the training data the dataset actually provides.
    min_n_answers = {g for g in training_data[1]}

    train_questions = [
        q for q in all_questions if q.fold == c.GUESSER_TRAIN_FOLD
    ]
    train_answers = {q.page for q in train_questions}

    dev_questions = [
        q for q in all_questions if q.fold == c.GUESSER_DEV_FOLD
    ]
    dev_answers = {q.page for q in dev_questions}

    min_n_train_questions = [
        q for q in train_questions if q.page in min_n_answers
    ]

    all_common_train_dev = train_answers.intersection(dev_answers)
    min_common_train_dev = min_n_answers.intersection(dev_answers)

    all_train_answerable_questions = [
        q for q in train_questions if q.page in train_answers
    ]
    all_dev_answerable_questions = [
        q for q in dev_questions if q.page in train_answers
    ]

    # Same filter as min_n_train_questions; reported under a second key.
    min_train_answerable_questions = min_n_train_questions
    min_dev_answerable_questions = [
        q for q in dev_questions if q.page in min_n_answers
    ]

    # The next section of code generates the percent of questions correct by
    # the number of training examples.
    Row = namedtuple('Row', [
        'fold', 'guess', 'guesser', 'qnum', 'score', 'sentence', 'token',
        'correct', 'answerable_1', 'answerable_2', 'n_examples'
    ])

    train_example_count_lookup = (
        seq(train_questions)
        .group_by(lambda q: q.page)
        .smap(lambda page, group: (page, len(group)))
        .dict()
    )

    def guess_to_row(*args):
        # args holds the first seven Row fields in order, so index 1 is the
        # guess and index 3 the question number.
        guess = args[1]
        qnum = args[3]
        answer = answer_lookup[qnum]

        return Row(
            *args,
            answer == guess,
            answer in train_answers,
            answer in min_n_answers,
            train_example_count_lookup.get(answer, 0)
        )

    # NOTE(review): each group already shares one sentence, so
    # max_by(sentence) cannot discriminate between its rows; selecting by
    # score may have been intended -- confirm before changing.
    dev_data = (
        seq(dev_guesses)
        .smap(guess_to_row)
        .group_by(lambda r: (r.qnum, r.sentence))
        .smap(lambda key, group: seq(group).max_by(lambda q: q.sentence))
        .to_pandas(columns=Row._fields)
    )
    dev_data['correct_int'] = dev_data['correct'].astype(int)
    dev_data['ones'] = 1
    dev_counts = (
        dev_data
        .groupby('n_examples')
        .agg({'correct_int': np.mean, 'ones': np.sum})
        .reset_index()
    )
    correct_by_n_count_plot('/tmp/dev_correct_by_count.png', dev_counts,
                            'Guesser Dev')
    n_train_vs_fold_plot('/tmp/n_train_vs_dev.png', dev_counts, 'Guesser Dev')

    with open(os.path.join(directory, 'guesser_report.pickle'), 'wb') as f:
        pickle.dump(
            {
                'dev_accuracy': dev_summary_accuracy,
                'guesser_name': self.display_name(),
                'guesser_params': params
            },
            f)

    # Every ratio guards its denominator with max(1, ...) so an empty fold
    # yields 0.0 instead of raising ZeroDivisionError.
    n_train = max(1, len(train_questions))
    n_dev = max(1, len(dev_questions))
    n_dev_answers = max(1, len(dev_answers))

    output = safe_path(os.path.join(directory, 'guesser_report.pdf'))
    report = ReportGenerator('guesser.md')
    report.create(
        {
            'dev_recall_plot': '/tmp/dev_recall.png',
            'dev_accuracy_plot': '/tmp/dev_accuracy.png',
            'dev_accuracy': dev_summary_accuracy,
            'guesser_name': self.display_name(),
            'guesser_params': params,
            'n_answers_all_folds': len(all_answers),
            'n_total_train_questions': len(train_questions),
            'n_train_questions': len(min_n_train_questions),
            'n_dev_questions': len(dev_questions),
            'n_total_train_answers': len(train_answers),
            'n_train_answers': len(min_n_answers),
            'n_dev_answers': len(dev_answers),
            'all_n_common_train_dev': len(all_common_train_dev),
            'all_p_common_train_dev': len(all_common_train_dev) / n_dev_answers,
            'min_n_common_train_dev': len(min_common_train_dev),
            'min_p_common_train_dev': len(min_common_train_dev) / n_dev_answers,
            'all_n_answerable_train': len(all_train_answerable_questions),
            'all_p_answerable_train': len(all_train_answerable_questions) / n_train,
            'all_n_answerable_dev': len(all_dev_answerable_questions),
            'all_p_answerable_dev': len(all_dev_answerable_questions) / n_dev,
            'min_n_answerable_train': len(min_train_answerable_questions),
            'min_p_answerable_train': len(min_train_answerable_questions) / n_train,
            'min_n_answerable_dev': len(min_dev_answerable_questions),
            'min_p_answerable_dev': len(min_dev_answerable_questions) / n_dev,
            'dev_correct_by_count_plot': '/tmp/dev_correct_by_count.png',
            'n_train_vs_dev_plot': '/tmp/n_train_vs_dev.png',
        },
        output)
def create_report(classifier, class_type, question_db=None):
    """Evaluate ``classifier`` on train/dev features and render 'classifier.md'.

    Writes three plots under ``/tmp`` (row-normalized confusion matrix,
    unnormalized confusion matrix, and correct/incorrect counts per sentence
    number) and renders the report to
    ``safe_path(CLASSIFIER_REPORT_PATH.format(class_type))``.

    :param classifier: object exposing ``score(x, y)`` and ``predict(x)``.
    :param class_type: classification target; forwarded to
        ``compute_features`` and used in plot titles and the report path.
    :param question_db: question database to read from; defaults to
        ``QuestionDatabase(QB_QUESTION_DB)`` when None.
    """
    if question_db is None:
        question_db = QuestionDatabase(QB_QUESTION_DB)

    all_questions = question_db.questions_with_pages()
    train = compute_features(all_questions, 'train', class_type)
    train_x = train['text']
    train_y = train['label']
    dev = compute_features(all_questions, 'dev', class_type)
    dev_x = dev['text']
    dev_y = dev['label']
    train_score = classifier.score(train_x, train_y)
    dev_score = classifier.score(dev_x, dev_y)

    true_labels = dev['label'].values
    predicted_labels = classifier.predict(dev_x)

    cf_norm = '/tmp/norm_confusion.png'
    plot_confusion(
        'Row Normalized Confusion Matrix of {} Classification'.format(
            class_type),
        true_labels, predicted_labels, normalized=True)
    plt.savefig(cf_norm, format='png', dpi=200)
    plt.clf()
    plt.cla()
    plt.close()

    cf_unnorm = '/tmp/unnorm_confusion.png'
    plot_confusion(
        'Unnormalized Confusion Matrix of {} Classification'.format(
            class_type),
        true_labels, predicted_labels, normalized=False)
    plt.savefig(cf_unnorm, format='png', dpi=200)
    # Release this figure as well: the pandas plot below opens a new figure,
    # so without this the unnormalized confusion figure is never closed.
    plt.clf()
    plt.cla()
    plt.close()

    correct_by_position = '/tmp/correct_by_position.png'

    # Assign positionally. Wrapping the predictions in pd.Series would align
    # them on a fresh 0..n-1 index and give NaN wherever dev's index differs.
    dev['prediction'] = predicted_labels
    dev['correct'] = dev['prediction'] == dev['label']
    pd.pivot_table(
        dev,
        values=['text'],
        index=['sentence', 'correct'],
        aggfunc=lambda x: len(x)
    ).unstack(fill_value=0).plot.bar(
        title='Number of Questions Correct vs Sentence Number')
    plt.xlabel('Sentence Number')
    plt.ylabel('Number Correct')
    handles, labels = plt.gca().get_legend_handles_labels()
    # The relabelling assumes the unstacked columns are ordered
    # (False, True), i.e. incorrect first.
    plt.gca().legend(handles, ['Number Incorrect', 'Number Correct'])
    plt.savefig(correct_by_position, format='png', dpi=200)

    report = ReportGenerator(
        {
            'unnormalized_confusion_plot': cf_unnorm,
            'normalized_confusion_plot': cf_norm,
            'correct_by_position_plot': correct_by_position,
            'train_score': train_score,
            'dev_score': dev_score,
            'class_type': class_type
        },
        'classifier.md')
    output = safe_path(CLASSIFIER_REPORT_PATH.format(class_type))
    report.create(output)
    plt.clf()
    plt.cla()
    plt.close()
if __name__ == '__main__':
    # Script entry point: generate per-fold buzzer performance summaries and
    # render them together into a single 'new_performance.md' PDF.
    args = parse_args()
    # Compare against None by identity; `!= None` can be overridden by a
    # custom __ne__.
    if args.fold is not None:
        folds = [args.fold]
    else:
        folds = c.BUZZ_FOLDS

    all_questions = QuestionDatabase().all_questions()
    answers = {k: v.page for k, v in all_questions.items()}

    variables = dict()
    for fold in folds:
        guesses_df = AbstractGuesser.load_guesses(bc.GUESSES_DIR, folds=[fold])

        # bc.BUZZES_DIR is a format string that resolves to the pickle file
        # holding this fold's buzzes.
        buzzes_path = bc.BUZZES_DIR.format(fold)
        with open(buzzes_path, 'rb') as infile:
            buzzes = pickle.load(infile)
        log.info('Buzzes loaded from {}.'.format(buzzes_path))

        checkpoint_path = "output/summary/performance_{}.pkl".format(fold)
        plot_path = "output/summary/performance_{}_his.png".format(fold)
        eop_output, his_output = generate(buzzes, answers, guesses_df, fold,
                                          checkpoint_path, plot_path)
        variables['eop_{}_output'.format(fold)] = eop_output
        variables['his_{}_output'.format(fold)] = his_output
        variables['his_{}_plot'.format(fold)] = plot_path

    output = 'output/summary/new_performance.pdf'
    report_generator = ReportGenerator('new_performance.md')
    report_generator.create(variables, output)