def compute_ci(data, outcome, estimates, ci_method="percentile", alpha=0.05, n_cores=1):
    """Compute confidence interval of bootstrap estimates.

    Parts of the code of the subfunctions of this function are taken from
    Daniel Saxton's resample library, as found on
    https://github.com/dsaxton/resample/ .

    Args:
        data (pandas.DataFrame): original dataset.
        outcome (callable): function of the data calculating statistic of
            interest. Needs to return a pandas Series.
        estimates (pandas.DataFrame): DataFrame of estimates in the bootstrap
            samples.
        ci_method (str): method of choice for confidence interval computation.
        alpha (float): significance level of choice.
        n_cores (int): number of jobs for parallelization.

    Returns:
        cis (pandas.DataFrame): DataFrame where k'th row contains CI for k'th
            parameter.

    """
    check_inputs(data=data, alpha=alpha, ci_method=ci_method)

    # Dispatch to the module-level implementation for the requested method,
    # e.g. ci_method="percentile" resolves to _ci_percentile.
    ci_func = globals()["_ci_" + ci_method]
    raw_cis = ci_func(data, outcome, estimates, alpha, n_cores)

    return pd.DataFrame(
        raw_cis,
        index=estimates.columns.tolist(),
        columns=["lower_ci", "upper_ci"],
    )
def test_check_inputs_cluster_by(setup):
    """check_inputs rejects a cluster_by value that is not a column of data."""
    bad_column = "this is not a column name of df"
    expected_msg = "Input 'cluster_by' must be None or a column name of 'data'."

    with pytest.raises(ValueError) as error:
        check_inputs(data=setup["df"], cluster_by=bad_column)

    assert str(error.value) == expected_msg
def test_check_inputs_data():
    """check_inputs raises a TypeError when data is not a DataFrame/Series."""
    not_a_frame = "this is not a data frame"
    expected_msg = "Data must be a pandas.DataFrame or pandas.Series."

    with pytest.raises(TypeError) as error:
        check_inputs(data=not_a_frame)

    assert str(error.value) == expected_msg
def test_check_inputs_ci_level(setup):
    """check_inputs rejects a ci_level outside the unit interval."""
    invalid_level = 666
    expected_msg = "Input 'ci_level' must be in [0,1]."

    with pytest.raises(ValueError) as error:
        check_inputs(data=setup["df"], ci_level=invalid_level)

    assert str(error.value) == expected_msg
def test_check_inputs_ci_method(setup, expected):
    """check_inputs rejects an unknown ci_method and names the bad value."""
    # ci_method is interpolated into the error message below, so the name
    # must match the f-string placeholder.
    ci_method = 4

    with pytest.raises(ValueError) as excinfo:
        check_inputs(data=setup["df"], ci_method=ci_method)

    expected_msg = (
        "ci_method must be 'percentile', 'bc',"
        f" 'bca', 't', 'basic' or 'normal', '{ci_method}'"
        f" was supplied"
    )
    assert str(excinfo.value) == expected_msg
def get_bootstrap_outcomes(
    data,
    outcome,
    outcome_kwargs=None,
    cluster_by=None,
    seed=None,
    n_draws=1000,
    n_cores=1,
    error_handling="continue",
    batch_evaluator=joblib_batch_evaluator,
):
    """Draw bootstrap samples and calculate outcomes.

    Args:
        data (pandas.DataFrame): original dataset.
        outcome (callable): function of the dataset calculating statistic of
            interest. Needs to return array-like object or pd.Series.
        outcome_kwargs (dict or None): additional keyword arguments that are
            bound to ``outcome`` before evaluation.
        cluster_by (str): column name of the variable to cluster by.
        seed (int): Random seed.
        n_draws (int): number of draws, only relevant if seeds is None.
        n_cores (int): number of jobs for parallelization.
        error_handling (str): One of "continue", "raise". Default "continue"
            which means that bootstrap estimates are only calculated for those
            samples where no errors occur and a warning is produced if any
            error occurs.
        batch_evaluator (str or Callable): Name of a pre-implemented batch
            evaluator (currently 'joblib' and 'pathos_mp') or Callable with
            the same interface as the estimagic batch_evaluators. See
            :ref:`batch_evaluators`.

    Returns:
        estimates (pandas.DataFrame): Outcomes for different bootstrap
            samples. The columns are the index of the result of ``outcome``.

    """
    check_inputs(data=data, cluster_by=cluster_by)

    if outcome_kwargs is not None:
        # BUGFIX: the dict must be unpacked as keyword arguments. The
        # previous ``partial(outcome, *outcome_kwargs)`` iterated the dict
        # and passed its *keys* as positional arguments.
        outcome = partial(outcome, **outcome_kwargs)

    indices = get_bootstrap_indices(
        data=data,
        cluster_by=cluster_by,
        seed=seed,
        n_draws=n_draws,
    )

    estimates = _get_bootstrap_outcomes_from_indices(
        indices=indices,
        data=data,
        outcome=outcome,
        n_cores=n_cores,
        error_handling=error_handling,
        batch_evaluator=batch_evaluator,
    )
    return estimates
def get_bootstrap_outcomes(
    data,
    outcome,
    cluster_by=None,
    rng=None,
    n_draws=1000,
    n_cores=1,
    error_handling="continue",
    batch_evaluator="joblib",
):
    """Draw bootstrap samples and calculate outcomes.

    Args:
        data (pandas.DataFrame): original dataset.
        outcome (callable): function of the dataset calculating statistic of
            interest. Returns a general pytree (e.g. pandas Series, dict,
            numpy array, etc.).
        cluster_by (str): column name of the variable to cluster by.
        rng (numpy.random.Generator): A random number generator.
        n_draws (int): number of bootstrap draws.
        n_cores (int): number of jobs for parallelization.
        error_handling (str): One of "continue", "raise". Default "continue"
            which means that bootstrap estimates are only calculated for those
            samples where no errors occur and a warning is produced if any
            error occurs.
        batch_evaluator (str or Callable): Name of a pre-implemented batch
            evaluator (currently 'joblib' and 'pathos_mp') or Callable with
            the same interface as the estimagic batch_evaluators. See
            :ref:`batch_evaluators`.

    Returns:
        estimates (list): List of pytrees of estimated bootstrap outcomes.

    """
    check_inputs(data=data, cluster_by=cluster_by)

    # Resolve a string name like "joblib" to the actual evaluator callable.
    batch_evaluator = process_batch_evaluator(batch_evaluator)

    bootstrap_indices = get_bootstrap_indices(
        data=data,
        rng=rng,
        cluster_by=cluster_by,
        n_draws=n_draws,
    )

    return _get_bootstrap_outcomes_from_indices(
        indices=bootstrap_indices,
        data=data,
        outcome=outcome,
        n_cores=n_cores,
        error_handling=error_handling,
        batch_evaluator=batch_evaluator,
    )
def calculate_ci(
    base_outcome,
    estimates,
    ci_method="percentile",
    ci_level=0.95,
):
    """Compute confidence interval of bootstrap estimates.

    Parts of the code of the subfunctions of this function are taken from
    Daniel Saxton's resample library, as found on
    https://github.com/dsaxton/resample/

    Args:
        base_outcome (list): List of flat base outcomes, i.e. the outcome
            statistic(s) evaluated on the original data set.
        estimates (np.ndarray): Array of estimates computed on the
            bootstrapped samples.
        ci_method (str): Method of choice for computing confidence intervals.
            The default is "percentile".
        ci_level (float): Confidence level for the calculation of confidence
            intervals. The default is 0.95.

    Returns:
        np.ndarray: 1d array of the lower confidence interval, where the k'th
            entry contains the lower confidence interval for the k'th
            parameter.
        np.ndarray: 1d array of the upper confidence interval, where the k'th
            entry contains the upper confidence interval for the k'th
            parameter.

    Raises:
        ValueError: If ci_method has no implementation in this function.

    """
    check_inputs(ci_method=ci_method, ci_level=ci_level, skipdata=True)

    alpha = 1 - ci_level

    if ci_method == "percentile":
        cis = _ci_percentile(estimates, alpha)
    elif ci_method == "bc":
        cis = _ci_bc(estimates, base_outcome, alpha)
    elif ci_method == "t":
        cis = _ci_t(estimates, base_outcome, alpha)
    elif ci_method == "basic":
        cis = _ci_basic(estimates, base_outcome, alpha)
    elif ci_method == "normal":
        cis = _ci_normal(estimates, base_outcome, alpha)
    else:
        # BUGFIX: previously a method accepted by check_inputs but not
        # implemented here (e.g. "bca") fell through and crashed with a
        # confusing UnboundLocalError on `cis`. Fail with a clear message.
        raise ValueError(f"Invalid ci_method: {ci_method}")

    return cis[:, 0], cis[:, 1]
def bootstrap_from_outcomes(
    data, outcome, bootstrap_outcomes, ci_method="percentile", alpha=0.05, n_cores=1
):
    """Set up results table containing mean, standard deviation and
    confidence interval for each estimated parameter.

    Args:
        data (pandas.DataFrame): original dataset.
        outcome (callable): function of the data calculating statistic of
            interest. Needs to return a pandas Series.
        bootstrap_outcomes (pandas.DataFrame): DataFrame of bootstrap_outcomes
            in the bootstrap samples.
        ci_method (str): method of choice for confidence interval computation.
        n_cores (int): number of jobs for parallelization.
        alpha (float): significance level of choice.

    Returns:
        results (pandas.DataFrame): table of results.

    """
    check_inputs(data=data, ci_method=ci_method, alpha=alpha)

    # Point estimates and their spread across bootstrap samples.
    summary = pd.DataFrame(bootstrap_outcomes.mean(axis=0), columns=["mean"])
    summary["std"] = bootstrap_outcomes.std(axis=0)

    cis = compute_ci(data, outcome, bootstrap_outcomes, ci_method, alpha, n_cores)
    summary["lower_ci"] = cis["lower_ci"]
    summary["upper_ci"] = cis["upper_ci"]

    return {
        "summary": summary,
        "cov": bootstrap_outcomes.cov(),
        "outcomes": bootstrap_outcomes,
    }
seeds (numpy.array): array of seeds for bootstrap samples, default is none. n_cores (int): number of jobs for parallelization. error_handling (str): One of "continue", "raise". Default "continue" which means that bootstrap estimates are only calculated for those samples where no errors occur and a warning is produced if any error occurs. batch_evaluator (str or Callable): Name of a pre-implemented batch evaluator (currently 'joblib' and 'pathos_mp') or Callable with the same interface as the estimagic batch_evaluators. See :ref:`batch_evaluators`. Returns: results (pandas.DataFrame): DataFrame where k'th row contains mean estimate, standard error, and confidence interval of k'th parameter. """ check_inputs(data, cluster_by, ci_method, alpha) estimates = get_bootstrap_outcomes( data=data, outcome=outcome, outcome_kwargs=outcome_kwargs, cluster_by=cluster_by, seed=seed, n_draws=n_draws, n_cores=n_cores, error_handling=error_handling, batch_evaluator=batch_evaluator, ) out = bootstrap_from_outcomes(data, outcome, estimates, ci_method, alpha, n_cores)
def test_check_inputs_alpha(setup, expected):
    """check_inputs rejects an alpha outside the unit interval."""
    invalid_alpha = 666

    with pytest.raises(ValueError) as excinfo:
        check_inputs(data=setup["df"], alpha=invalid_alpha)

    assert str(excinfo.value) == "Input 'alpha' must be in [0,1]."
def test_check_inputs_cluster_by(setup, expected):
    """check_inputs rejects a cluster_by value that is not a column of data."""
    bad_column = "this is not a column name of df"

    with pytest.raises(ValueError) as excinfo:
        check_inputs(data=setup["df"], cluster_by=bad_column)

    assert str(excinfo.value) == (
        "Input 'cluster_by' must be None or a column name of DataFrame."
    )
def test_check_inputs_data(setup, expected):
    """check_inputs rejects data that is not a DataFrame."""
    not_a_frame = "this is not a data frame"

    with pytest.raises(ValueError) as excinfo:
        check_inputs(data=not_a_frame)

    assert str(excinfo.value) == "Input 'data' must be DataFrame."
Generator instance then that instance is used. n_cores (int): number of jobs for parallelization. error_handling (str): One of "continue", "raise". Default "continue" which means that bootstrap estimates are only calculated for those samples where no errors occur and a warning is produced if any error occurs. batch_evaluator (str or Callable): Name of a pre-implemented batch evaluator (currently 'joblib' and 'pathos_mp') or Callable with the same interface as the estimagic batch_evaluators. See :ref:`batch_evaluators`. Returns: BootstrapResult: A BootstrapResult object storing information on summary statistics, the covariance matrix, and estimated boostrap outcomes. """ if callable(outcome): check_inputs(data=data, cluster_by=cluster_by) if outcome_kwargs is not None: outcome = functools.partial(outcome, **outcome_kwargs) else: raise ValueError("outcome must be a callable.") if existing_result is None: base_outcome = outcome(data) existing_outcomes = [] elif isinstance(existing_result, BootstrapResult): base_outcome = existing_result.base_outcome existing_outcomes = existing_result.outcomes else: raise ValueError("existing_result must be None or a BootstrapResult.")