Example #1
def test_filter_outliers():
    data = np.arange(100) + 1

    filtered = filter_outliers(data, 0.99)
    assert len(filtered) == 99
    assert filtered.max() == 99
    assert data.max() == 100
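The ``filter_outliers`` helper under test is not shown in this dump. A
minimal sketch consistent with both tests (this one and
``test_filter_outliers_2`` in Example #3), assuming it keeps values at
or below the requested quantile of the data:

import numpy as np

def filter_outliers(data, threshold_quantile):
    # Hypothetical sketch; the real implementation is not included in
    # these examples. Keep values <= the threshold quantile of data.
    threshold = np.quantile(data, threshold_quantile)
    return data[data <= threshold]

Under this reading, np.quantile(np.arange(100) + 1, 0.99) is 99.01, so
only the maximum value 100 is dropped; for np.ones(100) the quantile is
1.0 and every value is kept.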
Example #2
def get_bootstrap_samples(
    data,
    stat_fn=np.mean,
    num_samples=10000,
    seed_start=None,
    threshold_quantile=None,
    sc=None,
):
    """Return ``stat_fn`` evaluated on resampled and original data.

    If a Spark context is provided, do the resampling in parallel over
    the cluster; otherwise resample serially in this process.

    Args:
        data: The data as a list, 1D numpy array, or pandas Series
        stat_fn: Either a function that aggregates each resampled
            population to a scalar (e.g. the default value ``np.mean``
            lets you bootstrap means), or a function that aggregates
            each resampled population to a dict of scalars. In both
            cases, this function must accept a one-dimensional ndarray
            as its input.
        num_samples: The number of samples to return
        seed_start: A seed for the random number generator; this
            function will use seeds in the range::

                [seed_start, seed_start + num_samples)

            and these particular seeds must not be used elsewhere
            in this calculation. By default, use a random seed.
        threshold_quantile (float, optional): An optional threshold
            quantile, above which to discard outliers. E.g. ``0.9999``.
        sc (optional): The Spark context, if available

    Returns:
        ``stat_fn`` evaluated over ``num_samples`` samples.

            * By default, a pandas Series of sampled means
            * If ``stat_fn`` returns a scalar, a pandas Series
            * If ``stat_fn`` returns a dict, a pandas DataFrame
              with columns set to the dict keys.
    """
    if not isinstance(data, np.ndarray):
        data = np.array(data)

    if np.isnan(data).any():
        raise ValueError("'data' contains null values")

    if threshold_quantile is not None:
        data = filter_outliers(data, threshold_quantile)

    if seed_start is None:
        seed_start = np.random.randint(np.iinfo(np.uint32).max)

    # Deterministic "randomness" requires careful state handling :(
    # Need to ensure every call has a unique, deterministic seed.
    seed_range = range(seed_start, seed_start + num_samples)

    if sc is None:
        summary_stat_samples = [
            _resample_and_agg_once(data, stat_fn, unique_seed)
            for unique_seed in seed_range
        ]

    else:
        # Broadcast before entering the try block so the finally clause
        # never references an unbound name if the broadcast itself fails.
        broadcast_data = sc.broadcast(data)

        try:
            summary_stat_samples = sc.parallelize(seed_range).map(
                lambda seed: _resample_and_agg_once_bcast(
                    broadcast_data=broadcast_data,
                    stat_fn=stat_fn,
                    unique_seed=seed % np.iinfo(np.uint32).max,
                )).collect()

        finally:
            broadcast_data.unpersist()

    summary_df = pd.DataFrame(summary_stat_samples)
    if len(summary_df.columns) == 1:
        # Return a Series if stat_fn returns a scalar
        return summary_df.iloc[:, 0]

    # Else return a DataFrame if stat_fn returns a dict
    return summary_df
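The ``_resample_and_agg_once`` helper that the serial branch calls is
not included in this dump. A minimal sketch, assuming the classic
bootstrap: each seed deterministically draws one resample of the data
(with replacement, same size) and reduces it with ``stat_fn``:

import numpy as np

def _resample_and_agg_once(data, stat_fn, unique_seed):
    # Hypothetical sketch; the real helper is not shown in these
    # examples. A private RandomState keeps each sample reproducible.
    random_state = np.random.RandomState(unique_seed)
    n = len(data)
    # Classic bootstrap: draw n indices with replacement.
    resampled = data[random_state.randint(0, n, size=n)]
    return stat_fn(resampled)

Per the docstring, a dict-returning ``stat_fn`` yields a DataFrame with
one column per key; for example (a hypothetical call):

samples = get_bootstrap_samples(
    np.arange(100),
    stat_fn=lambda arr: {"mean": arr.mean(), "median": np.median(arr)},
    num_samples=1000,
    seed_start=42,
)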
Example #3
def test_filter_outliers_2():
    data = np.ones(100)

    filtered = filter_outliers(data, 0.99)
    assert len(filtered) == 100
Example #4
def get_bootstrap_samples(data,
                          stat_fn=bb_mean,
                          num_samples=10000,
                          seed_start=None,
                          threshold_quantile=None,
                          sc=None):
    """Return ``stat_fn`` evaluated on resampled data.

    Args:
        data: The data as a list, 1D numpy array, or pandas Series
        stat_fn (callable, optional): A function that either:

            * Aggregates each resampled population to a scalar (e.g.
              the default, ``bb_mean``), or
            * Aggregates each resampled population to a dict of
              scalars (e.g. the function returned by
              ``make_bb_quantile_closure`` when given multiple
              quantiles).

            In both cases, this function must accept two parameters:

            * a one-dimensional ndarray or pandas Series of values,
            * an identically shaped object of weights for these values

        num_samples: The number of samples to return
        seed_start: A seed for the random number generator; this
            function will use seeds in the range::

                [seed_start, seed_start + num_samples)

            and these particular seeds must not be used elsewhere
            in this calculation. By default, use a random seed.

        threshold_quantile (float, optional): An optional threshold
            quantile, above which to discard outliers. E.g. ``0.9999``.
        sc (optional): The Spark context, if available

    Returns:
        A Series or DataFrame with one row per sample and one column
        per output of ``stat_fn``.

    References:
        Rubin, Donald B. The Bayesian Bootstrap. Ann. Statist. 9 (1981),
            no. 1, 130--134. https://dx.doi.org/10.1214/aos/1176345338
    """
    if not isinstance(data, np.ndarray):
        data = np.array(data)

    if np.isnan(data).any():
        raise ValueError("'data' contains null values")

    if threshold_quantile is not None:
        data = filter_outliers(data, threshold_quantile)

    # For computational efficiency, tally the data into unique values
    # and counts. Drawing the Dirichlet weights with these counts as the
    # concentration parameters is equivalent to drawing one weight per
    # original data point (i.e. passing np.ones(len(data)) as the
    # counts) and summing the weights over duplicate values.
    data_values, data_counts = np.unique(data, return_counts=True)

    if seed_start is None:
        seed_start = np.random.randint(np.iinfo(np.uint32).max)

    # Deterministic "randomness" requires careful state handling :(
    # Need to ensure every call has a unique, deterministic seed.
    seed_range = range(seed_start, seed_start + num_samples)

    if sc is None:
        summary_stat_samples = [
            _resample_and_agg_once(data_values, data_counts, stat_fn,
                                   unique_seed) for unique_seed in seed_range
        ]

    else:
        # Broadcast before entering the try block so the finally clause
        # never references unbound names if a broadcast itself fails.
        broadcast_data_values = sc.broadcast(data_values)
        broadcast_data_counts = sc.broadcast(data_counts)

        try:
            summary_stat_samples = sc.parallelize(seed_range).map(
                lambda seed: _resample_and_agg_once_bcast(
                    broadcast_data_values=broadcast_data_values,
                    broadcast_data_counts=broadcast_data_counts,
                    stat_fn=stat_fn,
                    unique_seed=seed % np.iinfo(np.uint32).max,
                )).collect()

        finally:
            broadcast_data_values.unpersist()
            broadcast_data_counts.unpersist()

    summary_df = pd.DataFrame(summary_stat_samples)
    if len(summary_df.columns) == 1:
        # Return a Series if stat_fn returns a scalar
        return summary_df.iloc[:, 0]

    # Else return a DataFrame if stat_fn returns a dict
    return summary_df
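Likewise, the weighted ``_resample_and_agg_once`` that this
Bayesian-bootstrap variant assumes is not shown here. A minimal sketch
under that assumption: one Dirichlet draw over the unique values, with
the counts as concentration parameters, passed to ``stat_fn`` together
with the values:

import numpy as np

def _resample_and_agg_once(data_values, data_counts, stat_fn, unique_seed):
    # Hypothetical sketch; the real helper is not shown in these
    # examples.
    random_state = np.random.RandomState(unique_seed)
    # Dirichlet(counts) over the unique values is equivalent to
    # Dirichlet(1, ..., 1) over the raw data points with the weights
    # summed over duplicates (Rubin 1981).
    weights = random_state.dirichlet(data_counts)
    return stat_fn(data_values, weights)

If ``bb_mean`` is the weighted mean, e.g. np.dot(values, weights) (an
assumption; it is not defined in this dump), each seed then produces
one posterior draw of the mean.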