import numpy as np
import pandas as pd


def test_filter_outliers():
    data = np.arange(100) + 1
    filtered = filter_outliers(data, 0.99)
    assert len(filtered) == 99
    assert filtered.max() == 99
    # The original data must not be mutated.
    assert data.max() == 100
def get_bootstrap_samples(
    data,
    stat_fn=np.mean,
    num_samples=10000,
    seed_start=None,
    threshold_quantile=None,
    sc=None,
):
    """Return ``stat_fn`` evaluated on resampled and original data.

    If a Spark context is supplied, do the resampling in parallel
    over the cluster.

    Args:
        data: The data as a list, 1D numpy array, or pandas series
        stat_fn: Either a function that aggregates each resampled
            population to a scalar (e.g. the default value ``np.mean``
            lets you bootstrap means), or a function that aggregates
            each resampled population to a dict of scalars. In both
            cases, this function must accept a one-dimensional ndarray
            as its input.
        num_samples: The number of samples to return
        seed_start: A seed for the random number generator; this
            function will use seeds in the range::

                [seed_start, seed_start + num_samples)

            and these particular seeds must not be used elsewhere in
            this calculation. By default, use a random seed.
        threshold_quantile (float, optional): An optional threshold
            quantile, above which to discard outliers. E.g. ``0.9999``.
        sc (optional): The Spark context, if available

    Returns:
        ``stat_fn`` evaluated over ``num_samples`` samples.

            * By default, a pandas Series of sampled means
            * if ``stat_fn`` returns a scalar, a pandas Series
            * if ``stat_fn`` returns a dict, a pandas DataFrame
              with columns set to the dict keys.
    """
    if not isinstance(data, np.ndarray):
        data = np.array(data)

    if np.isnan(data).any():
        raise ValueError("'data' contains null values")

    if threshold_quantile:
        data = filter_outliers(data, threshold_quantile)

    if seed_start is None:
        seed_start = np.random.randint(np.iinfo(np.uint32).max)

    # Deterministic "randomness" requires careful state handling :(
    # Need to ensure every call has a unique, deterministic seed.
    seed_range = range(seed_start, seed_start + num_samples)

    if sc is None:
        summary_stat_samples = [
            _resample_and_agg_once(data, stat_fn, unique_seed)
            for unique_seed in seed_range
        ]

    else:
        # Broadcast before entering the ``try`` so that the
        # ``finally`` block can never hit an unbound name.
        broadcast_data = sc.broadcast(data)

        try:
            summary_stat_samples = (
                sc.parallelize(seed_range)
                .map(
                    lambda seed: _resample_and_agg_once_bcast(
                        broadcast_data=broadcast_data,
                        stat_fn=stat_fn,
                        # Keep each seed within the range RandomState accepts
                        unique_seed=seed % np.iinfo(np.uint32).max,
                    )
                )
                .collect()
            )
        finally:
            broadcast_data.unpersist()

    summary_df = pd.DataFrame(summary_stat_samples)
    if len(summary_df.columns) == 1:
        # Return a Series if stat_fn returns a scalar
        return summary_df.iloc[:, 0]

    # Else return a DataFrame if stat_fn returns a dict
    return summary_df
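# The helpers `_resample_and_agg_once` and `_resample_and_agg_once_bcast`
# are not shown in this section. What follows is a minimal sketch
# consistent with how they are called above, assuming an ordinary
# resample-with-replacement bootstrap; the bodies are assumptions, not
# the canonical implementations.


def _resample_and_agg_once(data, stat_fn, unique_seed):
    # Seed a private RandomState so each replicate is deterministic,
    # resample the data with replacement, then aggregate with stat_fn.
    random_state = np.random.RandomState(unique_seed)
    num_obs = len(data)
    resampled_data = data[random_state.randint(0, num_obs, num_obs)]
    return stat_fn(resampled_data)


def _resample_and_agg_once_bcast(broadcast_data, stat_fn, unique_seed):
    # Runs on Spark executors: unwrap the broadcast variable, then
    # delegate to the serial implementation.
    return _resample_and_agg_once(broadcast_data.value, stat_fn, unique_seed)


# Example usage (hypothetical data): bootstrap a mean and read off a
# 95% equal-tailed interval from the sampled statistics.
#
#     samples = get_bootstrap_samples(some_data, np.mean, num_samples=1000)
#     ci_low, ci_high = samples.quantile([0.025, 0.975])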
def test_filter_outliers_2():
    data = np.ones(100)
    filtered = filter_outliers(data, 0.99)
    assert len(filtered) == 100
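# `filter_outliers` itself does not appear in this section. Below is a
# minimal sketch that satisfies both tests above, assuming the threshold
# is computed with numpy's default linear-interpolation quantile; it is
# an illustration, not the canonical implementation.


def filter_outliers(data, threshold_quantile):
    # Drop values strictly above the threshold quantile. For
    # data = 1..100, np.quantile(data, 0.99) == 99.01, so only the
    # value 100 is discarded; for constant data the quantile equals
    # every value and nothing is dropped.
    threshold = np.quantile(data, threshold_quantile)
    return data[data <= threshold]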
def get_bootstrap_samples(
    data,
    stat_fn=bb_mean,
    num_samples=10000,
    seed_start=None,
    threshold_quantile=None,
    sc=None,
):
    """Return ``stat_fn`` evaluated on resampled data.

    Args:
        data: The data as a list, 1D numpy array, or pandas series
        stat_fn (callable, optional): A function that either:

            * Aggregates each resampled population to a scalar (e.g.
              the default, ``bb_mean``), or
            * Aggregates each resampled population to a dict of
              scalars (e.g. the func returned by
              ``make_bb_quantile_closure`` when given multiple
              quantiles).

            In both cases, this function must accept two parameters:

            * a one-dimensional ndarray or pandas Series of values,
            * an identically shaped object of weights for these values

        num_samples: The number of samples to return
        seed_start: A seed for the random number generator; this
            function will use seeds in the range::

                [seed_start, seed_start + num_samples)

            and these particular seeds must not be used elsewhere in
            this calculation. By default, use a random seed.
        threshold_quantile (float, optional): An optional threshold
            quantile, above which to discard outliers. E.g. ``0.9999``.
        sc (optional): The Spark context, if available

    Returns:
        A Series or DataFrame with one row per sample and one column
        per output of ``stat_fn``.

    References:
        Rubin, Donald B. The Bayesian Bootstrap. Ann. Statist. 9
        (1981), no. 1, 130--134.
        https://dx.doi.org/10.1214/aos/1176345338
    """
    if not isinstance(data, np.ndarray):
        data = np.array(data)

    if np.isnan(data).any():
        raise ValueError("'data' contains null values")

    if threshold_quantile:
        data = filter_outliers(data, threshold_quantile)

    # For computational efficiency, tally the data. If you are careful
    # with the resulting draws from the dirichlet then this should be
    # equivalent to not doing this step (and passing np.ones() as the
    # counts)
    data_values, data_counts = np.unique(data, return_counts=True)

    if seed_start is None:
        seed_start = np.random.randint(np.iinfo(np.uint32).max)

    # Deterministic "randomness" requires careful state handling :(
    # Need to ensure every call has a unique, deterministic seed.
    seed_range = range(seed_start, seed_start + num_samples)

    if sc is None:
        summary_stat_samples = [
            _resample_and_agg_once(data_values, data_counts, stat_fn, unique_seed)
            for unique_seed in seed_range
        ]

    else:
        # Broadcast before entering the ``try`` so that the
        # ``finally`` block can never hit an unbound name.
        broadcast_data_values = sc.broadcast(data_values)
        broadcast_data_counts = sc.broadcast(data_counts)

        try:
            summary_stat_samples = (
                sc.parallelize(seed_range)
                .map(
                    lambda seed: _resample_and_agg_once_bcast(
                        broadcast_data_values=broadcast_data_values,
                        broadcast_data_counts=broadcast_data_counts,
                        stat_fn=stat_fn,
                        # Keep each seed within the range RandomState accepts
                        unique_seed=seed % np.iinfo(np.uint32).max,
                    )
                )
                .collect()
            )
        finally:
            broadcast_data_values.unpersist()
            broadcast_data_counts.unpersist()

    summary_df = pd.DataFrame(summary_stat_samples)
    if len(summary_df.columns) == 1:
        # Return a Series if stat_fn returns a scalar
        return summary_df.iloc[:, 0]

    # Else return a DataFrame if stat_fn returns a dict
    return summary_df
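# Neither `bb_mean` nor this module's `_resample_and_agg_once` is shown
# in this section. Under Rubin's Bayesian bootstrap, each replicate
# reweights the observations with a draw from a flat Dirichlet; a
# minimal sketch consistent with the tallied representation above
# follows (names and bodies here are assumptions inferred from the
# call sites, not the canonical implementations).


def bb_mean(values, weights):
    # Weighted mean; the Dirichlet weights sum to 1 over the unique values.
    return np.dot(values, weights)


def _resample_and_agg_once(data_values, data_counts, stat_fn, unique_seed):
    # A Dirichlet draw with concentration equal to each unique value's
    # count is equivalent to drawing one weight per original observation
    # from Dirichlet(1, ..., 1) and summing the weights of duplicates;
    # that equivalence is what justifies tallying the data above.
    random_state = np.random.RandomState(unique_seed)
    weights = random_state.dirichlet(data_counts)
    return stat_fn(data_values, weights)


def _resample_and_agg_once_bcast(
    broadcast_data_values, broadcast_data_counts, stat_fn, unique_seed
):
    # Runs on Spark executors: unwrap the broadcast variables, then
    # delegate to the serial implementation.
    return _resample_and_agg_once(
        broadcast_data_values.value,
        broadcast_data_counts.value,
        stat_fn,
        unique_seed,
    )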