def plot(self, variable, label=None, color=None, ref_val=None, alpha=.25, bins=None, title=None, hdi=None, outfile=None, ref_color=None):
    """
    Plot the histogram of a variable trace

    Parameters
    ----------
    variable : str
        The name of one of self.variables to plot
    label : str
        Alternative label for the legend; falls back to `variable`
    color : optional
        Color passed through to `trace.hist` and to `plot_interval`
    ref_val : float
        A reference value location at which to draw a vertical line
    alpha : float in [0 1)
        The transparency of the histogram
    bins : int
        The number of histogram bins. When omitted, one bin per 50 trace
        samples is used, with a minimum of one bin.
    title : str
        The title of the plot. When omitted and `ref_val` is given, a title
        showing the rounded percentage of the trace above and below
        `ref_val` is generated; otherwise the title is empty.
    hdi : float in [0, 1]
        The amount of probability mass within the Highest Density Interval
        to display on the histogram.
    outfile : str
        The name of an output file to save the figure to.
    ref_color : optional
        Accepted for interface compatibility; not used by this method.

    Raises
    ------
    ValueError
        If `variable` is not one of `self.variables`.
    """
    # Validate first so a bad variable name fails fast, before the
    # (comparatively expensive) lazy plotting imports below.
    if variable not in self.variables:
        raise ValueError(
            'Variable `{}` not available (available: {})'.format(variable, self.variables)
        )

    from matplotlib import pyplot as plt  # lazy import
    from abra.vis import plot_interval

    label = label if label else variable
    trace = getattr(self, variable)

    if bins is None:
        # Short traces (< 50 samples) would otherwise yield zero bins.
        bins = max(1, int(len(trace.data) / 50.))

    trace.hist(color=color, alpha=alpha, bins=bins, ref_val=ref_val, label=label)

    if hdi is not None:  # highest density interval
        median = round(trace.percentiles(50), 3)
        # `trace.hdi` is called with the mass left outside the interval.
        _hdi = [round(h, 3) for h in trace.hdi(1 - hdi)]
        plot_interval(*_hdi, middle=median, display_text=True, color=color, offset=5)

    if title is None:
        if ref_val is not None:
            gt = round(100 * trace.prob_greater_than(ref_val))
            title = " {}% < {} = {} < {}%".format(100 - gt, variable, ref_val, gt)
        else:
            title = ''

    plt.title(title, fontsize=16)

    if outfile:
        plt.savefig(outfile)
def visualize_rates_results(results, figsize=(15, 10), outfile=None, *args, **kwargs):
    """
    Draw a three-panel summary figure for a rates comparison.

    The top panel shows a Poisson PMF built from each sample's mean, the
    middle panel shows each sample's mean with its standard-error interval,
    and the bottom panel shows `results.delta` with the interval from
    `results.ci[0]` next to a dashed reference line at 1.

    Parameters
    ----------
    results : object
        Test results exposing `control`, `variation`, `ci`, `delta` and
        `comparison_type`.
    figsize : tuple
        Figure size handed to `plt.subplots`.
    outfile : str
        If given, the figure is saved to this path.
    *args, **kwargs
        Accepted but ignored.
    """
    _, (ax_samples, ax_rates, ax_delta) = plt.subplots(3, 1, figsize=figsize)

    # Sample Comparison plot
    plt.sca(ax_samples)
    colored_samples = (
        (results.control, CONTROL_COLOR),
        (results.variation, VARIATION_COLOR),
    )
    pmfs = [
        Poisson(sample.mean, color=color, label=sample.name)
        for sample, color in colored_samples
    ]
    for pmf in pmfs:
        pmf.plot(plot_type='bar', alpha=.5)
    plt.legend()
    plt.title("Sample Comparison")

    # Rates +/- standard error plot
    plt.sca(ax_rates)
    y_lo, y_hi = plt.ylim()
    y_shift = (y_hi - y_lo) / LABEL_Y_OFFSET_FACTOR
    # (sample, vertical position, text offset, color); control sits above
    # the axis midline and variation below it.
    interval_rows = (
        (results.control, y_shift, -0.015, CONTROL_COLOR),
        (results.variation, -y_shift, 0.005, VARIATION_COLOR),
    )
    for sample, y_pos, text_offset, color in interval_rows:
        plot_interval(
            *sample.std_err(),
            middle=sample.mean,
            y=y_pos,
            offset=text_offset,
            color=color,
            display_text=True,
            label=sample.name
        )
    plt.legend()
    plt.gca().get_yaxis().set_ticks([])
    plt.title("Rates +/- Standard Error")

    # Differences plot
    plt.sca(ax_delta)
    delta_interval = results.ci[0]
    plot_interval(*delta_interval, middle=results.delta, color=DIFF_COLOR, display_text=True)
    plt.axvline(1., color=DIFF_COLOR, linestyle='--', linewidth=1.5)
    plt.gca().get_yaxis().set_ticks([])
    plt.title(results.comparison_type)

    if outfile:
        plt.savefig(outfile, bbox_inches='tight', dpi=300)
def visualize_bootstrap_results(results, figsize=(15, 10), outfile=None, plot_type='bar', *args, **kwargs):
    """
    Draw a three-panel summary figure for a bootstrap comparison.

    The top panel compares the two samples, either as histograms
    (`plot_type='bar'`) or as KDE curves (any other value). The middle panel
    shows the mean and HDI of the bootstrapped statistic stored under
    `results.aux['control']` and `results.aux['variation']`. The bottom
    panel shows `results.delta` with the interval from `results.ci[0]` next
    to a dashed reference line at 0.

    Parameters
    ----------
    results : object
        Test results exposing `control`, `variation`, `aux`, `ci`, `delta`,
        `test_statistic` and `comparison_type`.
    figsize : tuple
        Figure size handed to `plt.subplots`.
    outfile : str
        If given, the figure is saved to this path.
    plot_type : str
        `'bar'` for histograms; anything else selects KDE curves.
    *args, **kwargs
        Accepted but ignored.
    """
    _, (ax_samples, ax_stat, ax_delta) = plt.subplots(3, 1, figsize=figsize)

    colored_samples = (
        (results.control, CONTROL_COLOR),
        (results.variation, VARIATION_COLOR),
    )

    # Sample Comparison plot
    plt.sca(ax_samples)
    if plot_type == 'bar':
        # Use finer bins as soon as either sample has at least 100 observations.
        if results.control.nobs >= 100 or results.variation.nobs >= 100:
            n_bins = 50
        else:
            n_bins = 20
        for sample, color in colored_samples:
            sample.hist(bins=n_bins, color=color, alpha=.5, label=sample.name)
    else:
        kdes = [
            KdePdf(samples=sample.data, color=color, label=sample.name)
            for sample, color in colored_samples
        ]
        for kde in kdes:
            kde.plot(alpha=.5)
    plt.legend()
    plt.title("Sample Comparison")

    # Bootstrapped statistic +/- HDI
    plt.sca(ax_stat)
    y_lo, y_hi = plt.ylim()
    y_shift = (y_hi - y_lo) / LABEL_Y_OFFSET_FACTOR
    # (aux key, sample, vertical position, text offset, color)
    interval_rows = (
        ('control', results.control, y_shift, -0.015, CONTROL_COLOR),
        ('variation', results.variation, -y_shift, 0.005, VARIATION_COLOR),
    )
    for aux_key, sample, y_pos, text_offset, color in interval_rows:
        plot_interval(
            *results.aux[aux_key].hdi(),
            middle=results.aux[aux_key].mean,
            y=y_pos,
            offset=text_offset,
            color=color,
            display_text=True,
            label=sample.name
        )
    plt.legend()
    plt.gca().get_yaxis().set_ticks([])
    plt.title(f"Bootstrap({results.test_statistic}) +/- 95% HDI")

    # Differences plot
    plt.sca(ax_delta)
    delta_interval = results.ci[0]
    plot_interval(*delta_interval, middle=results.delta, color=DIFF_COLOR, display_text=True)
    plt.axvline(0., color=DIFF_COLOR, linestyle='--', linewidth=1.5)
    plt.gca().get_yaxis().set_ticks([])
    plt.title(f"{results.comparison_type}({results.test_statistic})")

    if outfile:
        plt.savefig(outfile, bbox_inches='tight', dpi=300)
def visualize_binomial_results(results, figsize=(15, 10), outfile=None, *args, **kwargs):
    """
    Visualize the results of a binomial (proportions) comparison.

    Three stacked panels are drawn: the Binomial PMF of each sample, each
    sample's proportion with its standard-error interval, and the difference
    in proportions with the interval from `results.ci[0]` next to a dashed
    reference line at 0.

    Parameters
    ----------
    results : object
        Test results exposing `control`, `variation`, `ci`,
        `inference_procedure.hypothesis` and `comparison_type`.
    figsize : tuple
        Figure size handed to `plt.subplots`.
    outfile : str
        If given, the figure is saved to this path.
    *args, **kwargs
        Accepted but ignored.
    """
    # PMF values below this are treated as negligible when cropping the x-axis.
    tol = 1e-4

    pmf_control = Binomial(
        p=results.control.mean, n=results.control.nobs,
        label=results.control.name, color=CONTROL_COLOR
    )
    pmf_variation = Binomial(
        p=results.variation.mean, n=results.variation.nobs,
        label=results.variation.name, color=VARIATION_COLOR
    )

    # Grid locations where either PMF is non-negligible. This can be empty
    # when every PMF value falls below `tol`, in which case the x-axis is
    # left uncropped rather than failing.
    support = [
        x
        for pmf in (pmf_control, pmf_variation)
        for x, density in zip(pmf.xgrid(), pmf.density(pmf.xgrid()))
        if density >= tol
    ]

    mean_diff = results.variation.mean - results.control.mean

    fig, axs = plt.subplots(3, 1, figsize=figsize)

    # Sample Comparison plot
    plt.sca(axs[0])
    # make plotting more scalable
    if pmf_control.n > 1000 or pmf_variation.n > 1000:
        plot_type = 'step'
    else:
        plot_type = 'bar'
    pmf_control.plot(plot_type=plot_type, alpha=.5)
    pmf_variation.plot(plot_type=plot_type, alpha=.5)
    raise_y(axs[0])
    if support:
        plt.xlim(int(min(support)), int(max(support)))
    plt.legend()
    plt.title("Sample Comparison")

    # Proportions +/- standard error plot
    plt.sca(axs[1])
    y_min, y_max = plt.ylim()
    y_dist = (y_max - y_min) / LABEL_Y_OFFSET_FACTOR
    plot_interval(
        *results.control.std_err(),
        middle=results.control.mean,
        y=y_dist,
        offset=-0.015,
        color=CONTROL_COLOR,
        display_text=True,
        label=results.control.name
    )
    plot_interval(
        *results.variation.std_err(),
        middle=results.variation.mean,
        y=-y_dist,
        offset=0.005,
        color=VARIATION_COLOR,
        display_text=True,
        label=results.variation.name
    )
    plt.legend()
    plt.gca().get_yaxis().set_ticks([])
    plt.title("Proportions +/- Standard Error")

    # Differences plot
    plt.sca(axs[2])
    plt.axvline(0., color=DIFF_COLOR, linestyle='--', linewidth=1.5)
    hypothesis = results.inference_procedure.hypothesis
    if hypothesis == 'larger':
        # One-sided test: the interval is open towards +infinity.
        left_bound = results.ci[0][0]
        right_bound = np.inf
    elif hypothesis == 'smaller':
        # One-sided test: the interval is open towards -infinity.
        left_bound = -np.inf
        right_bound = results.ci[0][1]
    else:
        left_bound = results.ci[0][0]
        right_bound = results.ci[0][1]
    plot_interval(left_bound, right_bound, mean_diff, color=DIFF_COLOR, display_text=True)
    plt.gca().get_yaxis().set_ticks([])
    plt.title(results.comparison_type)

    if outfile:
        plt.savefig(outfile, bbox_inches='tight', dpi=300)
def visualize_gaussian_results(results, figsize=(15, 10), outfile=None, *args, **kwargs):
    """
    Visualize the results that use Gaussian approximation.

    Three stacked panels are drawn: a Gaussian PDF per sample (built from
    the sample mean and standard deviation), each sample's mean with its
    standard-error interval on the same x-range as the first panel, and the
    difference in means with the interval from `results.ci[0]` next to a
    dashed reference line at 0.

    Parameters
    ----------
    results : object
        Test results exposing `control`, `variation`, `ci`,
        `inference_procedure.hypothesis` and `comparison_type`.
    figsize : tuple
        Figure size handed to `plt.subplots`.
    outfile : str
        If given, the figure is saved to this path.
    *args, **kwargs
        Accepted but ignored.
    """
    pdf_control = Gaussian(
        mean=results.control.mean, std=results.control.std,
        label=results.control.name, color=CONTROL_COLOR
    )
    pdf_variation = Gaussian(
        mean=results.variation.mean, std=results.variation.std,
        label=results.variation.name, color=VARIATION_COLOR
    )
    pdfs = Pdfs([pdf_control, pdf_variation])

    mean_diff = results.variation.mean - results.control.mean

    fig, axs = plt.subplots(3, 1, figsize=figsize)

    # Sample Comparison plot
    plt.sca(axs[0])
    pdfs.plot()
    raise_y(axs[0])
    plt.gca().get_yaxis().set_ticks([])
    plt.title("Sample Comparison")
    # Remember the x-range so the next panel can be aligned with this one.
    x_min, x_max = plt.xlim()

    # Mean +/- standard error plot
    plt.sca(axs[1])
    y_min, y_max = plt.ylim()
    y_dist = (y_max - y_min) / LABEL_Y_OFFSET_FACTOR
    plot_interval(
        *results.control.std_err(),
        middle=results.control.mean,
        y=y_dist,
        offset=-.015,
        color=CONTROL_COLOR,
        display_text=True,
        label=results.control.name
    )
    plot_interval(
        *results.variation.std_err(),
        middle=results.variation.mean,
        y=-y_dist,
        offset=0.005,
        color=VARIATION_COLOR,
        display_text=True,
        label=results.variation.name
    )
    plt.legend()
    plt.xlim(x_min, x_max)
    plt.gca().get_yaxis().set_ticks([])
    plt.title("Mean +/- Standard Error")

    # plot differences distribution
    plt.sca(axs[2])
    plt.axvline(0., color=DIFF_COLOR, linestyle='--', linewidth=1.5)
    hypothesis = results.inference_procedure.hypothesis
    if hypothesis == 'larger':
        # One-sided test: the interval is open towards +infinity.
        left_bound = results.ci[0][0]
        right_bound = np.inf
    elif hypothesis == 'smaller':
        # One-sided test: the interval is open towards -infinity.
        left_bound = -np.inf
        right_bound = results.ci[0][1]
    else:
        left_bound = results.ci[0][0]
        right_bound = results.ci[0][1]
    plot_interval(left_bound, right_bound, mean_diff, color=DIFF_COLOR, display_text=True)
    plt.gca().get_yaxis().set_ticks([])
    plt.title(results.comparison_type)

    if outfile:
        plt.savefig(outfile, bbox_inches='tight', dpi=300)