def test_compute_violation_ratio_exact(self):
    """
    Test the value of the violation ratio given some exact CDFs.
    """
    test_dists = [
        (
            np.random.normal,
            norm.ppf,
            {"loc": 0.275, "scale": 1.5},
            {"loc": 0.25, "scale": 1},
        ),
        (
            np.random.laplace,
            laplace.ppf,
            {"loc": 0.275, "scale": 1.5},
            {"loc": 0.25, "scale": 1},
        ),
        (np.random.rayleigh, rayleigh.ppf, {"scale": 1.05}, {"scale": 1}),
    ]

    for sample_func, ppf, params_a, params_b in test_dists:
        quantile_func_a = lambda x: ppf(x, **params_a)
        quantile_func_b = lambda x: ppf(x, **params_b)

        violation_ratio_ab_exact = compute_violation_ratio(
            quantile_func_a=quantile_func_a, quantile_func_b=quantile_func_b
        )
        violation_ratio_ba_exact = compute_violation_ratio(
            quantile_func_a=quantile_func_b, quantile_func_b=quantile_func_a
        )

        samples_a = sample_func(size=self.num_samples, **params_a)
        samples_b = sample_func(size=self.num_samples, **params_b)
        violation_ratio_ab_sampled = compute_violation_ratio(
            scores_a=samples_a, scores_b=samples_b
        )
        violation_ratio_ba_sampled = compute_violation_ratio(
            scores_a=samples_b, scores_b=samples_a
        )

        # Check symmetries
        self.assertAlmostEqual(
            violation_ratio_ab_exact, 1 - violation_ratio_ba_exact, delta=0.05
        )
        self.assertAlmostEqual(
            violation_ratio_ab_sampled, 1 - violation_ratio_ba_sampled, delta=0.05
        )

        # Check closeness to exact value
        self.assertAlmostEqual(
            violation_ratio_ab_exact, violation_ratio_ab_sampled, delta=0.05
        )
        self.assertAlmostEqual(
            violation_ratio_ba_exact, violation_ratio_ba_sampled, delta=0.05
        )
def _bootstrap_iter(seed: Optional[int] = None):
    """
    One bootstrap iteration. Wrapped in a function so it can be handed to joblib.Parallel.
    """
    # When running multiple jobs, these modules have to be re-imported for some reason to avoid an error.
    # Use dir() to check whether the module is available in the local scope:
    # https://stackoverflow.com/questions/30483246/how-to-check-if-a-module-has-been-imported
    if "np" not in dir() or "deepsig" not in dir():
        import numpy as np
        from deepsig.aso import compute_violation_ratio

    if seed is not None:
        np.random.seed(seed)

    sampled_scores_a = quantile_func_a(np.random.uniform(0, 1, len(scores_a)))
    sampled_scores_b = quantile_func_b(np.random.uniform(0, 1, len(scores_b)))
    sample = compute_violation_ratio(
        scores_a=sampled_scores_a,
        scores_b=sampled_scores_b,
        dt=dt,
    )

    return sample
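# --- Illustrative sketch, not part of the library -----------------------------------------
# The bootstrap iteration above relies on inverse transform sampling: draw u ~ Uniform(0, 1)
# and map it through a quantile function (inverse CDF) to obtain samples from the corresponding
# distribution. The helper below is a hypothetical, minimal demonstration of that idea using
# scipy's norm.ppf; the function name, sample size, and seed are assumptions for illustration.
def _demo_inverse_transform_sampling(num_samples: int = 1000, seed: int = 0):
    import numpy as np
    from scipy.stats import norm

    rng = np.random.default_rng(seed)
    u = rng.uniform(0, 1, size=num_samples)  # uniform samples on (0, 1)
    # Pushing uniform samples through the standard normal quantile function yields
    # (approximately) N(0, 1) samples - the same mechanism _bootstrap_iter() uses with the
    # empirical quantile functions of scores_a and scores_b.
    return norm.ppf(u, loc=0, scale=1)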
def test_compute_violation_ratio_correlation(self):
    """
    Test whether the violation ratio is being computed correctly.
    """
    samples_normal2 = np.random.normal(
        scale=2, size=self.num_samples
    )  # Scores for algorithm B
    violation_ratios = []
    inv_sqw_dists = []

    # Shift the distribution of A away (algorithm A becomes better and better)
    for loc in np.arange(0, 1, 0.05):
        samples_normal1 = np.random.normal(loc=loc, size=self.num_samples)
        violation_ratio = compute_violation_ratio(
            samples_normal1, samples_normal2, dt=0.05
        )
        w_dist = wasserstein_distance(samples_normal1, samples_normal2)
        violation_ratios.append(violation_ratio)
        inv_sqw_dists.append(1 / w_dist ** 2)

    # I didn't find a closed-form solution for the violation ratio of two Gaussians, so instead I am
    # checking whether it is positively correlated with the inverse squared 1-Wasserstein distance
    # computed via scipy.
    rho, _ = pearsonr(violation_ratios, inv_sqw_dists)
    self.assertGreaterEqual(rho, 0.85)
def test_argument_combos(self):
    """
    Try different combinations of input arguments for compute_violation_ratio().
    """
    scores_a = np.random.normal(size=5)
    scores_b = np.random.normal(size=5)
    quantile_func_a = norm.ppf
    quantile_func_b = norm.ppf

    # All of these should work
    for kwarg1, kwarg2 in product(
        [{"scores_a": scores_a}, {"quantile_func_a": quantile_func_a}],
        [{"scores_b": scores_b}, {"quantile_func_b": quantile_func_b}],
    ):
        compute_violation_ratio(**{**kwarg1, **kwarg2})

    # These should create errors
    with self.assertRaises(AssertionError):
        compute_violation_ratio(scores_a=scores_a, quantile_func_a=quantile_func_a)

    with self.assertRaises(AssertionError):
        compute_violation_ratio(scores_b=scores_b, quantile_func_b=quantile_func_b)
def aso(
    scores_a: ArrayLike,
    scores_b: ArrayLike,
    confidence_level: float = 0.95,
    num_comparisons: int = 1,
    num_samples: int = 1000,
    num_bootstrap_iterations: int = 1000,
    dt: float = 0.005,
    num_jobs: int = 1,
    show_progress: bool = True,
    seed: Optional[int] = None,
    _progress_bar: Optional[tqdm] = None,
) -> float:
    """
    Performs the Almost Stochastic Order test by Dror et al. (2019). The function takes two lists of scores as input
    (they do not have to be of the same length) and returns an upper bound to the violation ratio - the minimum epsilon
    threshold. `scores_a` should contain scores of the algorithm which we suspect to be better (in this setup,
    higher = better).

    The null hypothesis (which we would like to reject) is that the algorithm that generated `scores_a` is *not* better
    than the one `scores_b` originated from. If the violation ratio is below 0.5, the null hypothesis can be rejected
    safely (and the model scores_a belongs to is deemed better than the model of scores_b). Intuitively, the violation
    ratio denotes the degree to which total stochastic order (algorithm A is *always* better than B) is being violated.
    The more scores and the higher num_samples / num_bootstrap_iterations, the more reliable the result.

    Parameters
    ----------
    scores_a: List[float]
        Scores of algorithm A.
    scores_b: List[float]
        Scores of algorithm B.
    confidence_level: float
        Desired confidence level of test. Set to 0.95 by default.
    num_comparisons: int
        Number of comparisons that the test is being used for. Is used to perform a Bonferroni correction.
    num_samples: int
        DEPRECATED: Number of samples from the score distributions during every bootstrap iteration when estimating
        sigma. Currently ignored, and will be deprecated in the next major release.
    num_bootstrap_iterations: int
        Number of bootstrap iterations when estimating sigma.
    dt: float
        Differential for t during integral calculation.
    num_jobs: int
        Number of threads that bootstrap iterations are divided among.
    show_progress: bool
        Show progress bar. Default is True.
    seed: Optional[int]
        Set seed for reproducibility purposes. Default is None (meaning no seed is used).
    _progress_bar: Optional[tqdm]
        Hands over a progress bar object when called by multi_aso(). Only for internal use.

    Returns
    -------
    float
        Upper bound to the violation ratio. If it falls below 0.5, the null hypothesis can be rejected.
    """
    assert (
        len(scores_a) > 0 and len(scores_b) > 0
    ), "Both lists of scores must be non-empty."
    assert (
        num_bootstrap_iterations > 0
    ), "num_bootstrap_iterations must be positive, {} found.".format(num_bootstrap_iterations)
    assert (
        num_jobs > 0 or num_jobs == -1
    ), "Number of jobs has to be at least 1 or -1, {} found.".format(num_jobs)
    assert (
        num_comparisons > 0
    ), "Number of comparisons has to be at least 1, {} found.".format(num_comparisons)

    # Determine the maximum number of jobs possible
    if num_jobs == -1:
        num_jobs = psutil.cpu_count(logical=True)

        if num_jobs is None:
            warn("Number of available CPUs could not be determined, setting num_jobs=1.")
            num_jobs = 1

    # TODO: Remove in future version
    if num_samples != 1000:
        warn(
            "'num_samples' argument is being ignored in the current version and will be deprecated in version 1.3!",
            DeprecationWarning,
        )

    # TODO: Remove in future version
    if confidence_level < 0.95:
        warn(
            "'confidence_level' was refactored in version 1.2.4 to be more intuitive and usually should be in the .95-"
            f".99 range, but {confidence_level} was found. If you tried to adjust the confidence level for multiple "
            f"comparisons, try the new num_comparisons argument instead.",
            UserWarning,
        )

    if num_comparisons > 1:
        confidence_level += (1 - confidence_level) / num_comparisons

    violation_ratio = compute_violation_ratio(
        scores_a=scores_a, scores_b=scores_b, dt=dt
    )  # Based on the actual number of samples
    quantile_func_a = get_quantile_function(scores_a)
    quantile_func_b = get_quantile_function(scores_b)

    samples = get_bootstrapped_violation_ratios(
        scores_a,
        scores_b,
        quantile_func_a,
        quantile_func_b,
        num_bootstrap_iterations,
        dt,
        num_jobs,
        show_progress,
        seed,
        _progress_bar,
    )
    samples = np.array(samples)

    const = np.sqrt(len(scores_a) * len(scores_b) / (len(scores_a) + len(scores_b)))
    sigma_hat = np.std(const * (samples - violation_ratio))

    # Compute eps_min and make sure it stays in [0, 1]
    min_epsilon = np.clip(
        violation_ratio - (1 / const) * sigma_hat * normal.ppf(1 - confidence_level),
        0,
        1,
    )

    return min_epsilon
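# --- Illustrative usage sketch, not part of the library -----------------------------------
# A minimal example of calling aso(); the scores, sample sizes, and seed below are made up.
# In practice, scores_a and scores_b would be evaluation metrics from multiple training runs
# of two models, with scores_a belonging to the model suspected to be better.
def _demo_aso():
    import numpy as np

    rng = np.random.default_rng(42)
    scores_a = rng.normal(loc=0.9, scale=0.8, size=50)  # runs of the presumably better model
    scores_b = rng.normal(loc=0.0, scale=1.0, size=50)  # runs of the baseline
    min_eps = aso(scores_a, scores_b, seed=42, show_progress=False)
    # min_eps < 0.5 would let us reject the null hypothesis that A is not better than B.
    return min_eps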
def multi_aso(
    scores: ScoreCollection,
    confidence_level: float = 0.95,
    use_bonferroni: bool = True,
    use_symmetry: bool = True,
    num_samples: int = 1000,
    num_bootstrap_iterations: int = 1000,
    dt: float = 0.005,
    num_jobs: int = 1,
    return_df: bool = False,
    show_progress: bool = True,
    seed: Optional[int] = None,
) -> Union[np.array, pd.DataFrame]:
    """
    Provides an easy function to compare the scores of multiple models at once. Scores can be supplied in various
    forms (dictionary, nested list, 2D arrays or tensors). Returns a matrix (or pandas.DataFrame) with results.
    Applies Bonferroni correction to the confidence level by default, but this can be disabled with
    use_bonferroni=False.

    Parameters
    ----------
    scores: ScoreCollection
        Collection of model scores. Should be either a dictionary of model name to model scores, nested Python list,
        2D numpy or Jax array, or 2D Tensorflow or PyTorch tensor.
    confidence_level: float
        Desired confidence level of test. Set to 0.95 by default.
    use_bonferroni: bool
        Indicate whether Bonferroni correction should be applied to the confidence level in order to adjust for the
        number of comparisons. Default is True.
    use_symmetry: bool
        DEPRECATED: Use the fact that ASO(A, B, alpha) = 1 - ASO(B, A, alpha)
        `del Barrio et al. (2018) <https://arxiv.org/pdf/1705.01788.pdf>`_ to save half of the computations.
        Default is True. Currently ignored, and will be deprecated in the next major release.
    num_samples: int
        DEPRECATED: Number of samples from the score distributions during every bootstrap iteration when estimating
        sigma. Currently ignored, and will be deprecated in the next major release.
    num_bootstrap_iterations: int
        Number of bootstrap iterations when estimating sigma.
    dt: float
        Differential for t during integral calculation.
    num_jobs: int
        Number of threads that bootstrap iterations are divided among.
    return_df: bool
        Indicate whether the result should be returned as a pandas DataFrame. Only possible if scores is a dictionary
        of model names to model scores. Otherwise, a 2D numpy array with eps_min scores is returned. Default is False.
    show_progress: bool
        Show progress bar. Default is True.
    seed: Optional[int]
        Set seed for reproducibility purposes. Default is None (meaning no seed is used).

    Returns
    -------
    Union[np.array, pd.DataFrame]
        2D numpy array or pandas DataFrame (if scores is a dictionary and return_df=True) with the result of ASO.
    """
    assert (
        num_jobs > 0 or num_jobs == -1
    ), "Number of jobs has to be at least 1 or -1, {} found.".format(num_jobs)

    # Determine the maximum number of jobs possible
    if num_jobs == -1:
        num_jobs = psutil.cpu_count(logical=True)

        if num_jobs is None:
            warn("Number of available CPUs could not be determined, setting num_jobs=1.")
            num_jobs = 1

    # TODO: Remove in future version
    if num_samples != 1000:
        warn(
            "'num_samples' argument is being ignored in the current version and will be deprecated in version 1.3!",
            DeprecationWarning,
        )

    # TODO: Remove in future version
    if not use_symmetry:
        warn(
            "'use_symmetry' argument is being ignored in the current version and will be deprecated in version 1.3!",
            DeprecationWarning,
        )

    # TODO: Remove in future version
    if confidence_level < 0.95:
        warn(
            "'confidence_level' was refactored in version 1.2.4 to be more intuitive and usually should be in the .95-"
            f".99 range, but {confidence_level} was found.",
            UserWarning,
        )

    num_models = _get_num_models(scores)
    num_comparisons = num_models * (num_models - 1) / 2
    eps_min = np.eye(num_models)  # Initialize score matrix

    if use_bonferroni:
        # Increase the confidence level in order to mitigate the multiple comparisons problem
        confidence_level += (1 - confidence_level) / num_comparisons

    # Iterate over simple indices or dictionary keys depending on type of scores argument
    indices = list(range(num_models)) if type(scores) != dict else list(scores.keys())

    # Add progress bar if applicable
    progress_bar = None
    if show_progress:
        progress_bar = tqdm(
            range(int(num_comparisons * num_bootstrap_iterations))
            if use_symmetry
            else range(int(num_comparisons * num_bootstrap_iterations * 2)),
            desc="Model comparisons",
        )

    for i, key_i in enumerate(indices):
        for j, key_j in enumerate(indices[(i + 1):], start=i + 1):
            scores_a, scores_b = scores[key_i], scores[key_j]
            quantile_func_a = get_quantile_function(scores_a)
            quantile_func_b = get_quantile_function(scores_b)
            const = np.sqrt(
                len(scores_a) * len(scores_b) / (len(scores_a) + len(scores_b))
            )

            violation_ratio_ab = compute_violation_ratio(
                dt=dt,
                quantile_func_a=quantile_func_a,
                quantile_func_b=quantile_func_b,
            )
            violation_ratio_ba = 1 - violation_ratio_ab  # Exploit symmetry of violation ratio here

            samples_ab = get_bootstrapped_violation_ratios(
                scores_a,
                scores_b,
                quantile_func_a,
                quantile_func_b,
                num_bootstrap_iterations,
                dt,
                num_jobs,
                show_progress,
                seed,
                progress_bar,
            )
            samples_ab = np.array(samples_ab)

            # This quantity is the same for both directions, so we only have to compute it once:
            # (samples_ab - violation_ratio_ab)
            # = (1 - samples_ba) - (1 - violation_ratio_ba)
            # = -(samples_ba - violation_ratio_ba),
            # and the standard deviation is invariant to the sign flip.
            sigma_hat = np.std(const * (samples_ab - violation_ratio_ab))

            # Compute eps_min and make sure it stays in [0, 1]
            min_epsilon_ab = np.clip(
                violation_ratio_ab
                - (1 / const) * sigma_hat * normal.ppf(1 - confidence_level),
                0,
                1,
            )
            min_epsilon_ba = np.clip(
                violation_ratio_ba
                - (1 / const) * sigma_hat * normal.ppf(1 - confidence_level),
                0,
                1,
            )

            # Set values
            eps_min[i, j] = min_epsilon_ab
            eps_min[j, i] = min_epsilon_ba

    if type(scores) == dict and return_df:
        eps_min = pd.DataFrame(data=eps_min, index=list(scores.keys()))
        eps_min = eps_min.rename(dict(enumerate(scores.keys())), axis=1)

    return eps_min
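# --- Illustrative usage sketch, not part of the library -----------------------------------
# A minimal example of calling multi_aso(); the model names and scores below are made up.
# With a dictionary input and return_df=True, the result is a DataFrame whose entry in row A
# and column B is the eps_min upper bound for "A is better than B".
def _demo_multi_aso():
    import numpy as np

    rng = np.random.default_rng(0)
    scores = {
        "model_a": rng.normal(loc=0.3, scale=0.8, size=40),
        "model_b": rng.normal(loc=0.0, scale=1.0, size=40),
        "model_c": rng.normal(loc=-0.3, scale=1.2, size=40),
    }
    eps_min_df = multi_aso(scores, return_df=True, show_progress=False, seed=0)
    return eps_min_df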
def aso_bootstrap_comparisons(
    scores_a: ArrayLike,
    scores_b: ArrayLike,
    confidence_level: float = 0.05,
    num_samples: int = 1000,
    num_bootstrap_iterations: int = 1000,
    dt: float = 0.005,
    num_jobs: int = 2,
    show_progress: bool = False,
    seed: Optional[int] = None,
    _progress_bar: Optional[tqdm] = None,
) -> Dict[str, float]:
    """
    Like the package's aso() function, but compares different choices of bootstrap estimator.

    Parameters
    ----------
    scores_a: List[float]
        Scores of algorithm A.
    scores_b: List[float]
        Scores of algorithm B.
    confidence_level: float
        Desired confidence level of test. Set to 0.05 by default.
    num_samples: int
        Number of samples from the score distributions during every bootstrap iteration when estimating sigma.
    num_bootstrap_iterations: int
        Number of bootstrap iterations when estimating sigma.
    dt: float
        Differential for t during integral calculation.
    num_jobs: int
        Number of threads that bootstrap iterations are divided among.
    show_progress: bool
        Show progress bar. Default is False.
    seed: Optional[int]
        Set seed for reproducibility purposes. Default is None (meaning no seed is used).
    _progress_bar: Optional[tqdm]
        Hands over a progress bar object when called by multi_aso(). Only for internal use.

    Returns
    -------
    Dict[str, float]
        Dictionary mapping the name of each bootstrap estimator variant to its eps_min value.
    """
    assert (
        len(scores_a) > 0 and len(scores_b) > 0
    ), "Both lists of scores must be non-empty."
    assert num_samples > 0, "num_samples must be positive, {} found.".format(num_samples)
    assert (
        num_bootstrap_iterations > 0
    ), "num_bootstrap_iterations must be positive, {} found.".format(num_bootstrap_iterations)
    assert num_jobs > 0, "Number of jobs has to be at least 1, {} found.".format(num_jobs)

    violation_ratio = compute_violation_ratio(
        scores_a, scores_b, dt
    )  # Based on the actual number of samples
    const1 = np.sqrt(len(scores_a) * len(scores_b) / (len(scores_a) + len(scores_b)))
    quantile_func_a = get_quantile_function(scores_a)
    quantile_func_b = get_quantile_function(scores_b)

    def _progress_iter(high: int, progress_bar: tqdm):
        """
        This function is used when a shared progress bar is passed from multi_aso() - every time the iterator yields
        an element, the progress bar is updated by one. It essentially behaves like a simplified range() function.

        Parameters
        ----------
        high: int
            Number of elements in iterator.
        progress_bar: tqdm
            Shared progress bar.
        """
        current = 0

        while current < high:
            yield current
            current += 1
            progress_bar.update(1)

    # Add progress bar if applicable
    if show_progress and _progress_bar is None:
        iters = tqdm(range(num_bootstrap_iterations), desc="Bootstrap iterations")

    # Shared progress bar when called from multi_aso()
    elif _progress_bar is not None:
        iters = _progress_iter(num_bootstrap_iterations, _progress_bar)

    else:
        iters = range(num_bootstrap_iterations)

    # Set seeds for different jobs if applicable
    # "Sub-seeds" for jobs are just seed argument + job index
    seeds = (
        [None] * num_bootstrap_iterations
        if seed is None
        else [seed + offset for offset in range(1, num_bootstrap_iterations + 1)]
    )

    def _bootstrap_iter(seed: Optional[int] = None):
        """
        One bootstrap iteration. Wrapped in a function so it can be handed to joblib.Parallel.
        """
        # When running multiple jobs, these modules have to be re-imported for some reason to avoid an error.
        # Use dir() to check whether the module is available in the local scope:
        # https://stackoverflow.com/questions/30483246/how-to-check-if-a-module-has-been-imported
        if "np" not in dir() or "deepsig" not in dir():
            import numpy as np
            from deepsig.aso import compute_violation_ratio

        if seed is not None:
            np.random.seed(seed)

        sampled_scores_a = quantile_func_a(np.random.uniform(0, 1, len(scores_a)))
        sampled_scores_b = quantile_func_b(np.random.uniform(0, 1, len(scores_b)))
        sample = compute_violation_ratio(
            sampled_scores_a,
            sampled_scores_b,
            dt,
        )

        return sample

    # Initialize worker pool and start iterations
    parallel = Parallel(n_jobs=num_jobs)
    samples = parallel(delayed(_bootstrap_iter)(seed) for seed, _ in zip(seeds, iters))
    samples = np.array(samples)

    # Compute the different variants of the bootstrap estimator

    # 1. Classic bootstrap estimator
    sigma_hat1 = np.std(1 / (num_bootstrap_iterations - 1) * (samples - np.mean(samples)))
    min_epsilon1 = np.clip(
        violation_ratio - (1 / const1) * sigma_hat1 * normal.ppf(confidence_level),
        0,
        1,
    )

    # 2. ASO as implemented by Dror et al. (2019)
    sigma_hat2 = np.std(const1 * (samples - violation_ratio))
    min_epsilon2 = np.clip(
        violation_ratio - (1 / const1) * sigma_hat2 * normal.ppf(confidence_level),
        0,
        1,
    )

    # 3. Like 2., but using the expected violation ratio for sigma
    sigma_hat3 = np.std(const1 * (samples - np.mean(samples)))
    min_epsilon3 = np.clip(
        violation_ratio - (1 / const1) * sigma_hat3 * normal.ppf(confidence_level),
        0,
        1,
    )

    # 4. Like 3., but with the classic bootstrap bias correction
    corrected_bootstrap_violation_ratio = np.clip(2 * violation_ratio - np.mean(samples), 0, 1)
    min_epsilon4 = np.clip(
        corrected_bootstrap_violation_ratio
        - (1 / const1) * sigma_hat3 * normal.ppf(confidence_level),
        0,
        1,
    )

    # 5. Like 4., but with conditionally corrected bootstrap estimate
    bias = np.mean(samples) - violation_ratio
    sigma_hat_corr = np.std(1 / (len(samples) - 1) * (samples - np.mean(samples)))
    min_epsilon5 = np.clip(
        (corrected_bootstrap_violation_ratio if bias >= sigma_hat_corr else violation_ratio)
        - (1 / const1) * sigma_hat3 * normal.ppf(confidence_level),
        0,
        1,
    )

    # 6. Like 5., but conditional correction happens based on the later used sigma hat
    min_epsilon6 = np.clip(
        (corrected_bootstrap_violation_ratio if bias >= sigma_hat3 else violation_ratio)
        - (1 / const1) * sigma_hat3 * normal.ppf(confidence_level),
        0,
        1,
    )

    return {
        "Classic Bootstrap": min_epsilon1,
        "Dror et al. (2019)": min_epsilon2,
        r"Bootstrap $\varepsilon_{\mathcal{W}_2}$ mean": min_epsilon3,
        "Bootstrap correction": min_epsilon4,
        "Cond. Bootstrap corr.": min_epsilon5,
        "Cond. Bootstrap corr. 2": min_epsilon6,
    }
def test_type2_error(
    sample_size: int,
    colors: Dict[str, str],
    name: str,
    num_simulations: int = 200,
    thresholds: List[float] = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
    dist_func: Callable = np.random.normal,
    inv_cdf_func: Callable = scipy.stats.norm.ppf,
    dist_params: Dict[str, Any] = {"loc": 0, "scale": 0.5},
    dist_params2: Dict[str, Any] = {"loc": -0.25, "scale": 1.5},
    save_dir: Optional[str] = None,
):
    """
    Test the rate of type II error (false negative) under different sample sizes.

    Parameters
    ----------
    sample_size: int
        Sample size used in experiments.
    colors: Dict[str, str]
        Colors corresponding to each test for plotting.
    name: str
        Name of the experiment.
    num_simulations: int
        Number of simulations conducted.
    thresholds: List[float]
        Rejection thresholds that error rates are computed for.
    dist_func: Callable
        Distribution function that is used for sampling.
    inv_cdf_func: Callable
        Inverse cumulative distribution function in order to compute the exact violation ratio.
    dist_params: Dict[str, Any]
        Parameters of the distribution function.
    dist_params2: Dict[str, Any]
        Parameters of the comparison distribution function.
    save_dir: Optional[str]
        Directory that plots should be saved to.
    """
    simulation_results = defaultdict(list)

    with tqdm(total=len(colors) * num_simulations) as progress_bar:
        for _ in range(num_simulations):
            # Sample scores for this round
            scores_a = dist_func(**dist_params, size=sample_size)
            scores_b = dist_func(**dist_params2, size=sample_size)

            results = aso_bootstrap_comparisons(scores_a, scores_b)

            for variant, res in results.items():
                simulation_results[variant].append(res)

            progress_bar.update(len(colors))

    # with open(f"{save_dir}/type1_pg_rates.pkl", "wb") as out_file:
    #     pickle.dump(simulation_results, out_file)

    # Plot the distribution of eps_min for each bootstrap variant as box plots
    plt.figure(figsize=(8, 6))
    plt.rcParams.update({"font.size": 18, "text.usetex": True, "legend.loc": "upper right"})

    # Create data structure for box plots
    data = [simulation_results[test_name] for test_name in simulation_results.keys()]

    box_plot = plt.boxplot(
        data,
        widths=0.45,
        patch_artist=True,
    )

    for variant_name, patch, color in zip(
        simulation_results.keys(), box_plot["boxes"], colors.values()
    ):
        patch.set_edgecolor(color)
        patch.set_facecolor("white")
        plt.plot([], color=color, label=variant_name)

    real_violation_ratio = compute_violation_ratio(
        [],
        [],
        dt=0.05,
        quantile_func_a=lambda p: inv_cdf_func(p, **dist_params),
        quantile_func_b=lambda p: inv_cdf_func(p, **dist_params2),
    )

    plt.xticks(
        range(1, len(colors) + 1),
        [
            f"{(np.array(simulation_results[variant_name]) > thresholds[0]).astype(float).mean():.2f}"
            for variant_name in simulation_results.keys()
        ],
    )

    ax = plt.gca()
    ax.set_ylim(0, 1)
    x = np.arange(ax.get_xlim()[0], ax.get_xlim()[1] + 1)
    plt.plot(
        x,
        np.ones(len(x)) * real_violation_ratio,
        alpha=0.8,
        linestyle="--",
        color="black",
    )
    ax.yaxis.grid()
    plt.xlabel("Bootstrap variants")
    plt.ylabel(r"$\varepsilon_\mathrm{min}$")
    plt.legend()

    if save_dir is not None:
        plt.tight_layout()
        plt.savefig(f"{save_dir}/type2_bootstrap_dists_{name}.png")
    else:
        plt.show()

    plt.close()

    if save_dir is not None:
        with open(f"{save_dir}/type2_bootstrap_rates_{name}.txt", "w") as out_file:
            rates_df = pd.DataFrame(index=thresholds, columns=simulation_results.keys())

            for threshold in thresholds:
                for variant_name, data in simulation_results.items():
                    rates_df.at[threshold, variant_name] = (
                        (np.array(data) > threshold).astype(float).mean()
                    )

            out_file.write(rates_df.to_latex())