Example #1
def permutation_tests():
    """Evaluates permutation tests."""
    # Evaluate metrics that only have one value per training run.
    for metric in p.metrics_no_timeframes:
        for algo1 in p.algos:
            for algo2 in p.algos:
                data = data_def.DataDef(p.metric_values_dir, p.algos, p.tasks,
                                        p.n_runs_per_experiment)
                stats_runner = stats.StatsRunner(data, metric, None,
                                                 p.n_random_samples,
                                                 p.pvals_dir,
                                                 p.metric_values_dir_permuted)
                stats_runner.compare_algorithms(algo1, algo2, timeframe=None)

    # Evaluate metrics computed at different points along each training run.
    for metric in p.metrics_with_timeframes:
        for algo1 in p.algos:
            for algo2 in p.algos:
                for timeframe in p.timeframes:
                    data = data_def.DataDef(p.metric_values_dir, p.algos,
                                            p.tasks, p.n_runs_per_experiment)
                    stats_runner = stats.StatsRunner(
                        data, metric, p.n_timeframes, p.n_random_samples,
                        p.pvals_dir, p.metric_values_dir_permuted)
                    stats_runner.compare_algorithms(algo1, algo2, timeframe)
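
Example #1 and the other run-script examples below reference a module-level params object `p` and the modules `data_def` and `stats` without showing how they are set up. The sketch below is one way that surrounding setup might look; the import paths and every field of `p` are assumptions inferred from the snippets on this page, not taken from the library.

# Hypothetical setup assumed by the run-script examples; the module paths and
# the shape of `p` are guesses based on the other snippets on this page.
import types

from rl_reliability_metrics.analysis import data_def
from rl_reliability_metrics.analysis import stats

# `p` stands in for a config/flags object holding the run parameters.
p = types.SimpleNamespace(
    metric_values_dir='/tmp/metric_values',
    metric_values_dir_permuted='/tmp/metric_values_permuted',
    pvals_dir='/tmp/pvals',
    confidence_intervals_dir='/tmp/confidence_intervals',
    plots_dir='/tmp/plots',
    algos=['algoA', 'algoB', 'algoC'],
    tasks=['taskX', 'taskY'],
    n_runs_per_experiment=2,
    n_random_samples=1000,
    n_timeframes=3,
    timeframes=[0, 1, 2],
    metrics_no_timeframes=['IqrAcrossRuns'],
    metrics_with_timeframes=['MedianPerfDuringTraining'],
    metrics=['IqrAcrossRuns', 'MedianPerfDuringTraining'],
)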
Example #2
    def setUp(self):
        super(StatsTest, self).setUp()

        results_dir = os.path.join(
            './', 'rl_reliability_metrics/analysis/test_data')
        self.dd = data_def.DataDef(results_dir,
                                   algorithms=['algoA', 'algoB', 'algoC'],
                                   tasks=['taskX', 'taskY'],
                                   n_runs_per_experiment=2)
Example #3
def bootstrap_confidence_intervals():
  """Computes bootstrap confidence intervals for each algorithm."""
  for metric in p.metrics_no_timeframes:
    for algo in p.algos:
      data = data_def.DataDef(
          p.metric_values_dir, p.algos, p.tasks, p.n_runs_per_experiment)
      stats_runner = stats.StatsRunner(data, metric, None,
                                       p.n_random_samples, p.pvals_dir,
                                       p.metric_values_dir_permuted)
      stats_runner.bootstrap_confidence_interval(algo, timeframe=None)

  for metric in p.metrics_with_timeframes:
    for algo in p.algos:
      for timeframe in p.timeframes:
        data = data_def.DataDef(
            p.metric_values_dir, p.algos, p.tasks, p.n_runs_per_experiment)
        stats_runner = stats.StatsRunner(data, metric, p.n_timeframes,
                                         p.n_random_samples, p.pvals_dir,
                                         p.metric_values_dir_permuted)
        stats_runner.bootstrap_confidence_interval(algo, timeframe)
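
Examples #1 and #3 rebuild the same DataDef and StatsRunner inside their inner loops. The hedged sketch below factors that construction into a helper; this is only a refactoring suggestion built from the calls shown above, not part of the library, and it relies on the same assumed `p`, `data_def`, and `stats` globals as the earlier sketch.

# Hypothetical helper factoring out the DataDef/StatsRunner construction
# repeated in permutation_tests() and bootstrap_confidence_intervals().
def _make_stats_runner(metric, n_timeframes=None):
  data = data_def.DataDef(p.metric_values_dir, p.algos, p.tasks,
                          p.n_runs_per_experiment)
  return stats.StatsRunner(data, metric, n_timeframes, p.n_random_samples,
                           p.pvals_dir, p.metric_values_dir_permuted)


def bootstrap_confidence_intervals_refactored():
  """Same behaviour as Example #3, expressed with the helper."""
  for metric in p.metrics_no_timeframes:
    for algo in p.algos:
      _make_stats_runner(metric).bootstrap_confidence_interval(
          algo, timeframe=None)

  for metric in p.metrics_with_timeframes:
    for algo in p.algos:
      for timeframe in p.timeframes:
        runner = _make_stats_runner(metric, p.n_timeframes)
        runner.bootstrap_confidence_interval(algo, timeframe)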
Example #4
    def test_load_empty_results(self):
        """Check for exception if loading with incorrect algorithm names."""
        results_dir = os.path.join(
            './', 'rl_reliability_metrics/analysis/test_data')
        algorithms = ['wrong1', 'wrong2', 'wrong3']
        tasks = ['taskX', 'taskY']

        with self.assertRaises(Exception):
            data_def.DataDef(results_dir,
                             algorithms,
                             tasks,
                             n_runs_per_experiment=2)
Example #5
def make_plots():
  """Makes plots."""
  dd = data_def.DataDef(p.metric_values_dir, p.algos, p.tasks,
                        p.n_runs_per_experiment)
  my_plotter = plotter.Plotter(
      data=dd,
      pvals_dir=p.pvals_dir,
      confidence_intervals_dir=p.confidence_intervals_dir,
      n_timeframes=p.n_timeframes,
      algorithms=p.algos,
      out_dir=p.plots_dir)

  for metric in p.metrics:
    my_plotter.make_plots(metric)
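
Taken together, Examples #1, #3, and #5 form a small analysis pipeline: the statistics are written to disk first, and the plotter then reads them. Below is a minimal sketch of an entry point running them in that order; the ordering and the bare main() are assumptions, not shown in the snippets.

# Hypothetical driver combining the three example functions; assumes the
# p-values and confidence intervals must exist on disk before plotting.
def main():
  permutation_tests()               # Example #1: pairwise algorithm comparisons.
  bootstrap_confidence_intervals()  # Example #3: per-algorithm intervals.
  make_plots()                      # Example #5: plots built from those outputs.


if __name__ == '__main__':
  main()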
Example #6
    def test_create_datadef(self):
        results_dir = os.path.join(
            './', 'rl_reliability_metrics/analysis/test_data')
        algorithms = ['algoA', 'algoB', 'algoC']
        tasks = ['taskX', 'taskY']

        metrics = [
            'IqrWithinRuns', 'MedianPerfDuringTraining', 'IqrAcrossRuns'
        ]

        dd = data_def.DataDef(results_dir,
                              algorithms,
                              tasks,
                              n_runs_per_experiment=2)
        self.assertEqual(dd.algorithms, algorithms)
        self.assertEqual(dd.tasks, tasks)
        self.assertEqual(len(dd.results), len(dd.algorithms) * len(dd.tasks))  # pylint: disable=g-generic-assert
        self.assertCountEqual(dd.metrics, metrics)
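
Examples #2, #4, and #6 are methods of a StatsTest class whose declaration and test runner are not shown on this page. The sketch below is the assumed scaffolding around them; the real base class may differ (for instance a TensorFlow or absl test case), and plain unittest is used here only as a stand-in.

# Hypothetical scaffolding for the test methods shown above.
import unittest


class StatsTest(unittest.TestCase):
    """Container for setUp(), test_load_empty_results(), and
    test_create_datadef() from Examples #2, #4, and #6 (pasted unchanged)."""


if __name__ == '__main__':
    unittest.main()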