Ejemplo n.º 1
0
    def plot(self, variable, label=None,
             color=None, ref_val=None, alpha=.25,
             bins=None, title=None,
             hdi=None, outfile=None,
             ref_color=None):
        """
        Plot the histogram of a variable trace.

        Parameters
        ----------
        variable : str
            The name of one of self.variables to plot
        label : str
            Alternative label for the legend. Defaults to `variable`.
        color : optional
            Color forwarded to the histogram and to the interval marker.
        ref_val : float
            A reference value location at which to draw a vertical line
        alpha : float in [0, 1)
            The transparency of the histogram
        bins : int
            The number of histogram bins. Defaults to one bin per 50 trace
            samples, with a minimum of one bin.
        title : str
            The title of the plot. When omitted and `ref_val` is given, a
            title reporting the percentage of probability mass on either
            side of `ref_val` is generated; otherwise the title is empty.
        hdi : float in [0, 1]
            The amount of probability mass within the Highest Density Interval
            to display on the histogram. The value `1 - hdi` is what gets
            passed to the trace's `hdi` method.
        outfile : str
            The name of an output file to save the figure to.
        ref_color : optional
            Accepted for interface compatibility; not used by this method.

        Raises
        ------
        ValueError
            If `variable` is not one of self.variables.
        """
        # Validate before the lazy imports so a bad name fails fast, without
        # paying for (or depending on) the plotting stack.
        if variable not in self.variables:
            print(self.variables)
            raise ValueError('Variable `{}` not available'.format(variable))

        from matplotlib import pyplot as plt  # lazy import
        from abra.vis import plot_interval

        label = label or variable
        trace = getattr(self, variable)

        if bins is None:
            # Short traces (< 50 samples) would otherwise yield zero bins,
            # which is not a valid bin count for a histogram.
            bins = max(1, len(trace.data) // 50)

        trace.hist(color=color, alpha=alpha, bins=bins, ref_val=ref_val, label=label)

        if hdi is not None:  # highest density interval
            median = round(trace.percentiles(50), 3)
            _hdi = [round(h, 3) for h in trace.hdi(1 - hdi)]
            plot_interval(*_hdi, middle=median, display_text=True, color=color, offset=5)

        if title is None:
            if ref_val is not None:
                # Percentage of mass above the reference value; the remainder
                # is reported as the mass below it.
                gt = round(100 * trace.prob_greater_than(ref_val))
                title = " {}% < {} = {} < {}%".format(100 - gt, variable, ref_val, gt)
            else:
                title = ''
        plt.title(title, fontsize=16)

        if outfile:
            plt.savefig(outfile)
Ejemplo n.º 2
0
def visualize_rates_results(results,
                            figsize=(15, 10),
                            outfile=None,
                            *args,
                            **kwargs):
    """
    Draw a three-panel summary figure for a rates comparison.

    From top to bottom the panels show: the Poisson PMFs built from each
    sample's mean, each sample's mean with its standard-error interval,
    and the interval around the comparison statistic with a dashed
    reference line at 1.

    Parameters
    ----------
    results : object
        Test results exposing `control`, `variation`, `ci`, `delta` and
        `comparison_type`.
    figsize : tuple
        Figure size forwarded to `plt.subplots`.
    outfile : str
        If truthy, the figure is saved to this path.
    *args, **kwargs
        Accepted but not used.
    """
    _, (ax_samples, ax_rates, ax_diff) = plt.subplots(3, 1, figsize=figsize)

    # Panel 1: sample comparison
    plt.sca(ax_samples)
    pmfs = [
        Poisson(sample.mean, color=shade, label=sample.name)
        for sample, shade in ((results.control, CONTROL_COLOR),
                              (results.variation, VARIATION_COLOR))
    ]
    for pmf in pmfs:
        pmf.plot(plot_type='bar', alpha=.5)
    plt.legend()
    plt.title("Sample Comparison")

    # Panel 2: rates +/- standard error, drawn on opposite sides of y=0
    plt.sca(ax_rates)
    y_low, y_high = plt.ylim()
    label_shift = (y_high - y_low) / LABEL_Y_OFFSET_FACTOR
    interval_specs = (
        (results.control, label_shift, -0.015, CONTROL_COLOR),
        (results.variation, -label_shift, 0.005, VARIATION_COLOR),
    )
    for sample, y_pos, text_offset, shade in interval_specs:
        plot_interval(*sample.std_err(),
                      middle=sample.mean,
                      y=y_pos,
                      offset=text_offset,
                      color=shade,
                      display_text=True,
                      label=sample.name)
    plt.legend()
    plt.gca().get_yaxis().set_ticks([])
    plt.title("Rates +/- Standard Error")

    # Panel 3: comparison statistic and its interval
    plt.sca(ax_diff)
    diff_interval = results.ci[0]
    plot_interval(*diff_interval,
                  middle=results.delta,
                  color=DIFF_COLOR,
                  display_text=True)
    plt.axvline(1., color=DIFF_COLOR, linestyle='--', linewidth=1.5)
    plt.gca().get_yaxis().set_ticks([])
    plt.title(results.comparison_type)

    if not outfile:
        return
    plt.savefig(outfile, bbox_inches='tight', dpi=300)
Ejemplo n.º 3
0
def visualize_bootstrap_results(results,
                                figsize=(15, 10),
                                outfile=None,
                                plot_type='bar',
                                *args,
                                **kwargs):
    """
    Draw a three-panel summary figure for a bootstrap comparison.

    From top to bottom the panels show: the two samples (histograms when
    `plot_type` is 'bar', KDE curves otherwise), the bootstrapped statistic
    of each group with its HDI, and the interval around the comparison
    statistic with a dashed reference line at 0.

    Parameters
    ----------
    results : object
        Test results exposing `control`, `variation`, `aux`, `ci`, `delta`,
        `test_statistic` and `comparison_type`.
    figsize : tuple
        Figure size forwarded to `plt.subplots`.
    outfile : str
        If truthy, the figure is saved to this path.
    plot_type : str
        'bar' selects histograms; any other value selects KDE curves.
    *args, **kwargs
        Accepted but not used.
    """
    _, (ax_samples, ax_stat, ax_diff) = plt.subplots(3, 1, figsize=figsize)

    # Panel 1: sample comparison
    plt.sca(ax_samples)
    if plot_type != 'bar':
        kde_control = KdePdf(samples=results.control.data,
                             color=CONTROL_COLOR,
                             label=results.control.name)
        kde_variation = KdePdf(samples=results.variation.data,
                               color=VARIATION_COLOR,
                               label=results.variation.name)
        for kde in (kde_control, kde_variation):
            kde.plot(alpha=.5)
    else:
        # Use finer bins once either sample has at least 100 observations
        has_large_sample = (results.control.nobs >= 100
                            or results.variation.nobs >= 100)
        n_bins = 50 if has_large_sample else 20
        for sample, shade in ((results.control, CONTROL_COLOR),
                              (results.variation, VARIATION_COLOR)):
            sample.hist(bins=n_bins,
                        color=shade,
                        alpha=.5,
                        label=sample.name)

    plt.legend()
    plt.title("Sample Comparison")

    # Panel 2: bootstrapped statistic +/- HDI, drawn on opposite sides of y=0
    plt.sca(ax_stat)
    y_low, y_high = plt.ylim()
    label_shift = (y_high - y_low) / LABEL_Y_OFFSET_FACTOR
    interval_specs = (
        ('control', results.control, label_shift, -0.015, CONTROL_COLOR),
        ('variation', results.variation, -label_shift, 0.005, VARIATION_COLOR),
    )
    for aux_key, sample, y_pos, text_offset, shade in interval_specs:
        bootstrap = results.aux[aux_key]
        plot_interval(*bootstrap.hdi(),
                      middle=bootstrap.mean,
                      y=y_pos,
                      offset=text_offset,
                      color=shade,
                      display_text=True,
                      label=sample.name)
    plt.legend()
    plt.gca().get_yaxis().set_ticks([])
    plt.title(f"Bootstrap({results.test_statistic}) +/- 95% HDI")

    # Panel 3: comparison statistic and its interval
    plt.sca(ax_diff)
    diff_interval = results.ci[0]
    plot_interval(*diff_interval,
                  middle=results.delta,
                  color=DIFF_COLOR,
                  display_text=True)
    plt.axvline(0., color=DIFF_COLOR, linestyle='--', linewidth=1.5)
    plt.gca().get_yaxis().set_ticks([])
    plt.title(f"{results.comparison_type}({results.test_statistic})")

    if not outfile:
        return
    plt.savefig(outfile, bbox_inches='tight', dpi=300)
Ejemplo n.º 4
0
def visualize_binomial_results(results,
                               figsize=(15, 10),
                               outfile=None,
                               *args,
                               **kwargs):
    """
    Visualize the results of a comparison of proportions.

    Draws a three-panel figure. From top to bottom: the Binomial PMFs of
    the control and variation samples, each sample's proportion with its
    standard-error interval, and the interval around the difference in
    proportions with a dashed reference line at 0.

    Parameters
    ----------
    results : object
        Test results exposing `control`, `variation`, `ci`,
        `inference_procedure.hypothesis` and `comparison_type`.
    figsize : tuple
        Figure size forwarded to `plt.subplots`.
    outfile : str
        If truthy, the figure is saved to this path.
    *args, **kwargs
        Accepted but not used.
    """
    # Minimum PMF density for a grid point to count toward the x-axis range
    tol = 1e-4

    pmf_control = Binomial(p=results.control.mean,
                           n=results.control.nobs,
                           label=results.control.name,
                           color=CONTROL_COLOR)

    pmf_variation = Binomial(p=results.variation.mean,
                             n=results.variation.nobs,
                             label=results.variation.name,
                             color=VARIATION_COLOR)

    # Restrict the x-axis to the region where either PMF has visible mass
    lows, highs = [], []
    for pmf in (pmf_control, pmf_variation):
        grid = pmf.xgrid()
        points = list(zip(grid, pmf.density(grid)))
        support = [x for x, density in points if density >= tol]
        if not support:
            # A very flat PMF may have no point reaching `tol`; fall back to
            # the whole grid instead of failing on an empty selection.
            support = [x for x, _ in points]
        lows.append(min(support))
        highs.append(max(support))

    x_min = int(min(lows))
    x_max = int(max(highs))

    mean_diff = results.variation.mean - results.control.mean

    fig, axs = plt.subplots(3, 1, figsize=figsize)
    plt.sca(axs[0])

    # make plotting more scalable: bars get expensive for large n
    if pmf_control.n > 1000 or pmf_variation.n > 1000:
        plot_type = 'step'
    else:
        plot_type = 'bar'

    pmf_control.plot(plot_type=plot_type, alpha=.5)
    pmf_variation.plot(plot_type=plot_type, alpha=.5)
    raise_y(axs[0])
    plt.xlim(x_min, x_max)
    plt.legend()
    plt.title("Sample Comparison")

    # Proportions +/- standard error, drawn on opposite sides of y=0
    plt.sca(axs[1])
    y_min, y_max = plt.ylim()
    y_dist = (y_max - y_min) / LABEL_Y_OFFSET_FACTOR
    plot_interval(*results.control.std_err(),
                  middle=results.control.mean,
                  y=y_dist,
                  offset=-0.015,
                  color=CONTROL_COLOR,
                  display_text=True,
                  label=results.control.name)
    plot_interval(*results.variation.std_err(),
                  middle=results.variation.mean,
                  y=-y_dist,
                  offset=0.005,
                  color=VARIATION_COLOR,
                  display_text=True,
                  label=results.variation.name)

    plt.legend()
    plt.gca().get_yaxis().set_ticks([])
    plt.title("Proportions +/- Standard Error")

    # Differences plot
    plt.sca(axs[2])
    plt.axvline(0., color=DIFF_COLOR, linestyle='--', linewidth=1.5)

    # One-sided hypotheses leave the opposite side of the interval open
    if results.inference_procedure.hypothesis == 'larger':
        left_bound = results.ci[0][0]
        right_bound = np.inf
    elif results.inference_procedure.hypothesis == 'smaller':
        right_bound = results.ci[0][1]
        # The open lower end is negative infinity, not positive
        left_bound = -np.inf
    else:
        left_bound = results.ci[0][0]
        right_bound = results.ci[0][1]

    plot_interval(left_bound,
                  right_bound,
                  mean_diff,
                  color=DIFF_COLOR,
                  display_text=True)
    plt.gca().get_yaxis().set_ticks([])
    plt.title(results.comparison_type)
    if outfile:
        plt.savefig(outfile, bbox_inches='tight', dpi=300)
Ejemplo n.º 5
0
def visualize_gaussian_results(results,
                               figsize=(15, 10),
                               outfile=None,
                               *args,
                               **kwargs):
    """
    Visualize the results that use Gaussian approximation.

    Draws a three-panel figure. From top to bottom: the Gaussian PDFs of
    the control and variation samples, each sample's mean with its
    standard-error interval (sharing the x-range of the first panel), and
    the interval around the difference in means with a dashed reference
    line at 0.

    Parameters
    ----------
    results : object
        Test results exposing `control`, `variation`, `ci`,
        `inference_procedure.hypothesis` and `comparison_type`.
    figsize : tuple
        Figure size forwarded to `plt.subplots`.
    outfile : str
        If truthy, the figure is saved to this path.
    *args, **kwargs
        Accepted but not used.
    """
    pdf_control = Gaussian(mean=results.control.mean,
                           std=results.control.std,
                           label=results.control.name,
                           color=CONTROL_COLOR)
    pdf_variation = Gaussian(mean=results.variation.mean,
                             std=results.variation.std,
                             label=results.variation.name,
                             color=VARIATION_COLOR)
    pdfs = Pdfs([pdf_control, pdf_variation])

    mean_diff = results.variation.mean - results.control.mean

    fig, axs = plt.subplots(3, 1, figsize=figsize)
    plt.sca(axs[0])
    pdfs.plot()
    raise_y(axs[0])
    plt.gca().get_yaxis().set_ticks([])
    plt.title("Sample Comparison")
    # Remember the x-range so the second panel lines up with this one
    x_min, x_max = plt.xlim()

    # Means +/- standard error, drawn on opposite sides of y=0
    plt.sca(axs[1])
    y_min, y_max = plt.ylim()
    y_dist = (y_max - y_min) / LABEL_Y_OFFSET_FACTOR
    plot_interval(*results.control.std_err(),
                  middle=results.control.mean,
                  y=y_dist,
                  offset=-.015,
                  color=CONTROL_COLOR,
                  display_text=True,
                  label=results.control.name)
    plot_interval(*results.variation.std_err(),
                  middle=results.variation.mean,
                  y=-y_dist,
                  offset=0.005,
                  color=VARIATION_COLOR,
                  display_text=True,
                  label=results.variation.name)
    plt.legend()
    plt.xlim(x_min, x_max)
    plt.gca().get_yaxis().set_ticks([])
    plt.title("Mean +/- Standard Error")

    # plot the interval around the difference in means
    plt.sca(axs[2])
    plt.axvline(0., color=DIFF_COLOR, linestyle='--', linewidth=1.5)

    # One-sided hypotheses leave the opposite side of the interval open
    if results.inference_procedure.hypothesis == 'larger':
        left_bound = results.ci[0][0]
        right_bound = np.inf
    elif results.inference_procedure.hypothesis == 'smaller':
        right_bound = results.ci[0][1]
        # The open lower end is negative infinity, not positive
        left_bound = -np.inf
    else:
        left_bound = results.ci[0][0]
        right_bound = results.ci[0][1]

    plot_interval(left_bound,
                  right_bound,
                  mean_diff,
                  color=DIFF_COLOR,
                  display_text=True)
    plt.gca().get_yaxis().set_ticks([])
    plt.title(results.comparison_type)
    if outfile:
        plt.savefig(outfile, bbox_inches='tight', dpi=300)