Example #1
def permutation_tests():
    """Evaluates permutation tests."""
    # Evaluate metrics that only have one value per training run.
    for metric in p.metrics_no_timeframes:
        for algo1 in p.algos:
            for algo2 in p.algos:
                data = data_def.DataDef(p.metric_values_dir, p.algos, p.tasks,
                                        p.n_runs_per_experiment)
                stats_runner = stats.StatsRunner(data, metric, None,
                                                 p.n_random_samples,
                                                 p.pvals_dir,
                                                 p.metric_values_dir_permuted)
                stats_runner.compare_algorithms(algo1, algo2, timeframe=None)

    # Evaluate metrics computed at different points along each training run.
    for metric in p.metrics_with_timeframes:
        for algo1 in p.algos:
            for algo2 in p.algos:
                for timeframe in p.timeframes:
                    data = data_def.DataDef(p.metric_values_dir, p.algos,
                                            p.tasks, p.n_runs_per_experiment)
                    stats_runner = stats.StatsRunner(
                        data, metric, p.n_timeframes, p.n_random_samples,
                        p.pvals_dir, p.metric_values_dir_permuted)
                    stats_runner.compare_algorithms(algo1, algo2, timeframe)
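Both this example and Example #4 below read their configuration from a module-level `p` object that is never shown. A minimal sketch of what such a configuration might hold, using `types.SimpleNamespace`; the attribute names are the ones referenced above, while every value (and the use of SimpleNamespace itself) is a placeholder assumption:

import types

# Hypothetical stand-in for the params object `p`; all values are placeholders.
p = types.SimpleNamespace(
    algos=['algo_a', 'algo_b'],
    tasks=['task_1', 'task_2'],
    n_runs_per_experiment=3,
    n_timeframes=3,
    timeframes=[0, 1, 2],
    n_random_samples=1000,
    metrics_no_timeframes=['MetricA'],
    metrics_with_timeframes=['MetricB'],
    metric_values_dir='/tmp/metric_values',
    metric_values_dir_permuted='/tmp/metric_values_permuted',
    pvals_dir='/tmp/pvals',
)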
Example #2
    def test_resample_metric_results(self, metric_results, result_dims,
                                     algo_ind):
        stats_runner = stats.StatsRunner(data=self.dd,
                                         metric='IqrAcrossRuns',
                                         n_timeframes=3)
        stats_runner.result_dims = result_dims
        metric_results = np.array(metric_results)
        resampled = stats_runner._resample_metric_results(
            metric_results, algo_ind)

        # The resampled subarrays should be drawn from the original subarrays.
        n_task = metric_results.shape[1]
        for itask in range(n_task):
            algo_task_results = metric_results[algo_ind, itask]
            for run_value in resampled[algo_ind][itask]:
                self.assertIn(run_value, algo_task_results)

        # The resampled array should be unchanged for all other algorithms.
        n_algo = metric_results.shape[0]
        other_algo_inds = list(set(range(n_algo)) - {algo_ind})
        resampled_other_algos = resampled[other_algo_inds]
        original_other_algos = metric_results[other_algo_inds]
        np.testing.assert_array_equal(resampled_other_algos,
                                      original_other_algos)

        if metric_results.shape[2] == 1:
            # Since there is only one run, the resampled array should be the same.
            np.testing.assert_array_equal(resampled, metric_results)
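The decorator and case values that feed this test are not shown. A sketch of how the cases might be supplied, assuming absl's `parameterized` framework; the tuples below are hypothetical, chosen only to match the (algorithm, task, run) layout the assertions expect:

from absl.testing import parameterized


class StatsRunnerTest(parameterized.TestCase):

    @parameterized.parameters(
        # (metric_results, result_dims, algo_ind) -- hypothetical case values.
        ([[[1., 2.], [3., 4.]],
          [[5., 6.], [7., 8.]]], 'ATR', 0),
        ([[[1.]], [[2.]]], 'ATR', 1),  # single run: exercises the final assertion
    )
    def test_resample_metric_results(self, metric_results, result_dims,
                                     algo_ind):
        ...  # body as in Example #2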
Example #3
    def make_plots(self, metric):
        """Make all plots for a given metric.

    Args:
      metric: String name of the metric.
    """
        plot_utils.paper_figure_configs()

        # Create a metric-specific StatsRunner object
        stats_runner = stats.StatsRunner(self.data_def, metric,
                                         self.n_timeframes)

        result_dims = stats_runner.result_dims
        if result_dims == 'ATRP':
            # Within-runs metric with eval points.
            self._make_plots_with_eval_points(metric, stats_runner)
        elif result_dims == 'ATR':
            # Within-runs metrics without eval points (one value per run).
            self._make_plots_no_eval_points(metric, stats_runner)
        elif result_dims == 'ATP':
            # Across-runs metric with eval points.
            self._make_plots_with_eval_points(metric, stats_runner)
        else:
            raise ValueError('plotting not implemented for result_dims: %s' %
                             result_dims)
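The `result_dims` strings appear to encode the axes of the results array: A = algorithm, T = task, R = run, P = evaluation point within a run. That reading is inferred from the comments above and from the indexing in Example #2, so treat it as an assumption. A small sketch of the array shapes it implies, with arbitrary sizes:

import numpy as np

n_algo, n_task, n_run, n_point = 2, 3, 5, 4  # arbitrary illustrative sizes

atrp = np.zeros((n_algo, n_task, n_run, n_point))  # within-runs metric, per eval point
atr = np.zeros((n_algo, n_task, n_run))            # within-runs metric, one value per run
atp = np.zeros((n_algo, n_task, n_point))          # across-runs metric, per eval point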
Example #4
def bootstrap_confidence_intervals():
  """Computes bootstrap confidence intervals for each algorithm."""
  # Metrics that only have one value per training run.
  for metric in p.metrics_no_timeframes:
    for algo in p.algos:
      data = data_def.DataDef(
          p.metric_values_dir, p.algos, p.tasks, p.n_runs_per_experiment)
      stats_runner = stats.StatsRunner(data, metric, None,
                                       p.n_random_samples, p.pvals_dir,
                                       p.metric_values_dir_permuted)
      stats_runner.bootstrap_confidence_interval(algo, timeframe=None)

  # Metrics computed at different points along each training run.
  for metric in p.metrics_with_timeframes:
    for algo in p.algos:
      for timeframe in p.timeframes:
        data = data_def.DataDef(
            p.metric_values_dir, p.algos, p.tasks, p.n_runs_per_experiment)
        stats_runner = stats.StatsRunner(data, metric, p.n_timeframes,
                                         p.n_random_samples, p.pvals_dir,
                                         p.metric_values_dir_permuted)
        stats_runner.bootstrap_confidence_interval(algo, timeframe)
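Assuming `permutation_tests()` (Example #1) and `bootstrap_confidence_intervals()` are defined in the same script, a minimal entry point might simply run both passes; this is a sketch, not part of the original examples:

if __name__ == '__main__':
  permutation_tests()
  bootstrap_confidence_intervals()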
Example #5
    def test_rank_per_task(self, result_dims, bigger_is_better,
                           expected_result):
        results_arrays = {
            'AT': [[3, 1], [2, -2], [4, 7], [0, 9]],
            'ATP': [[[1., 2.], [3., 4.]], [[5., 6.], [7., 8.]]],
        }
        results_array = np.array(results_arrays[result_dims])

        stats_runner = stats.StatsRunner(data=None, metric='IqrAcrossRuns')
        stats_runner.result_dims = result_dims
        stats_runner.bigger_is_better = bigger_is_better
        ranks = stats_runner.rank_per_task(results_array)
        np.testing.assert_array_equal(ranks, expected_result)
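In the 'AT' case each row is an algorithm and each column a task, so ranking happens across algorithms within each task. A hedged illustration of that idea on the same 'AT' array using `scipy.stats.rankdata`; this is not the StatsRunner implementation, and its tie handling or rank direction may differ:

import numpy as np
from scipy.stats import rankdata

results = np.array([[3, 1], [2, -2], [4, 7], [0, 9]])  # shape (n_algo, n_task)

# Rank algorithms within each task; with bigger_is_better, the largest
# value in a column receives rank 1.
ranks = np.apply_along_axis(lambda col: rankdata(-col), axis=0, arr=results)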
Example #6
    def test_get_timeframe_points(self, metric, timeframe, expected):
        stats_runner = stats.StatsRunner(data=self.dd,
                                         metric=metric,
                                         n_timeframes=3)
        timeframe_points = stats_runner.get_timeframe_points(timeframe)
        np.testing.assert_array_equal(list(timeframe_points), expected)