Example #1
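# Shared setup assumed by the snippets below (a sketch, not part of the
# original sources): only the standard-library and third-party imports are
# spelled out. The project-specific modules (eval_metrics, metrics_online,
# io_utils) and the parameter namespace `p` are provided by the surrounding
# codebase and are not reproduced here.
import json
import os

from absl import flags
import gin
import numpy as np

FLAGS = flags.FLAGS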
def evaluate_metrics_on_bootstrapped_runs():
  """Evaluates metrics on bootstrapped runs, for across-run metrics only."""
  gin_bindings = [
      'eval_metrics.Evaluator.metrics = [@IqrAcrossRuns/singleton(), '
      '@LowerCVaROnAcross/singleton()]'
  ]
  n_bootstraps_per_worker = int(p.n_random_samples / p.n_worker)

  # Parse gin config.
  gin.parse_config_files_and_bindings([p.gin_file], gin_bindings)

  for algo in p.algos:
    for task in p.tasks:
      for i_worker in range(p.n_worker):
        # Get the subdirectories corresponding to each run.
        summary_path = os.path.join(p.data_dir, algo, task)
        run_dirs = eval_metrics.get_run_dirs(summary_path, 'train', p.runs)

        # Evaluate results.
        outfile_prefix = os.path.join(p.metric_values_dir_bootstrapped, algo,
                                      task) + '/'
        evaluator = eval_metrics.Evaluator(metrics=gin.REQUIRED)
        evaluator.write_metric_params(outfile_prefix)
        evaluator.evaluate_with_bootstraps(
            run_dirs=run_dirs,
            outfile_prefix=outfile_prefix,
            n_bootstraps=n_bootstraps_per_worker,
            bootstrap_start_idx=(n_bootstraps_per_worker * i_worker),
            random_seed=i_worker)
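# A standalone sketch (hypothetical numbers, independent of the evaluation
# code above) of how the per-worker arguments tile the bootstrap resamples:
# each worker handles n_bootstraps_per_worker resamples starting at
# bootstrap_start_idx, so the workers jointly cover resample indices
# 0 .. p.n_random_samples - 1 without overlap.
def _sketch_worker_partition(n_random_samples=12, n_worker=3):
  n_per_worker = n_random_samples // n_worker
  for i_worker in range(n_worker):
    start = n_per_worker * i_worker
    # Worker i_worker handles resample indices [start, start + n_per_worker).
    print('worker %d: indices %d..%d' %
          (i_worker, start, start + n_per_worker - 1))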
    def test_evaluate_with_permutations(self):
        evaluator = eval_metrics.Evaluator([metrics_online.StddevWithinRuns()])
        n_permutations = 3
        permutation_start_idx = 100
        random_seed = 50
        outfile_prefix = os.path.join(FLAGS.test_tmpdir,
                                      'robustness_results_permuted_')
        results = evaluator.evaluate_with_permutations(
            self.run_dirs, self.run_dirs, outfile_prefix, n_permutations,
            permutation_start_idx, random_seed)

        # Check length of results.
        self.assertLen(results, n_permutations)

        # Check a single result.
        one_result = list(results.values())[0]['curves1']
        self.assertEqual(list(one_result.keys()), ['StddevWithinRuns'])
        self.assertTrue(np.greater(list(one_result.values()), 0.).all())

        # Check the output files.
        results_files = io_utils.paths_glob('%s*results.json' % outfile_prefix)
        self.assertLen(results_files, 1)

        # If run again with the same seed, the results should be the same
        results_same = evaluator.evaluate_with_permutations(
            self.run_dirs, self.run_dirs, outfile_prefix, n_permutations,
            permutation_start_idx, random_seed)
        self._assert_results_same(results, results_same)

        # If run again with a different seed, the results should be different
        results_different = evaluator.evaluate_with_permutations(
            self.run_dirs, self.run_dirs, outfile_prefix, n_permutations,
            permutation_start_idx, random_seed + 1)
        self._assert_results_different(results, results_different)
def evaluate_metrics_on_permuted_runs():
  """Evaluates metrics on permuted runs, for across-run metrics only."""
  gin_bindings = [
      ('eval_metrics.Evaluator.metrics = '
       '[@IqrAcrossRuns/singleton(), @LowerCVaROnAcross/singleton()]')
  ]
  n_permutations_per_worker = int(p.n_random_samples / p.n_worker)

  # Parse gin config.
  gin.parse_config_files_and_bindings([p.gin_file], gin_bindings)

  for algo1 in p.algos:
    for algo2 in p.algos:
      for task in p.tasks:
        for i_worker in range(p.n_worker):
          # Get the subdirectories corresponding to each run.
          summary_path_1 = os.path.join(p.data_dir, algo1, task)
          summary_path_2 = os.path.join(p.data_dir, algo2, task)
          run_dirs_1 = eval_metrics.get_run_dirs(summary_path_1, 'train',
                                                 p.runs)
          run_dirs_2 = eval_metrics.get_run_dirs(summary_path_2, 'train',
                                                 p.runs)

          # Evaluate the metrics.
          outfile_prefix = os.path.join(p.metric_values_dir_permuted, '%s_%s' %
                                        (algo1, algo2), task) + '/'
          evaluator = eval_metrics.Evaluator(metrics=gin.REQUIRED)
          evaluator.write_metric_params(outfile_prefix)
          evaluator.evaluate_with_permutations(
              run_dirs_1=run_dirs_1,
              run_dirs_2=run_dirs_2,
              outfile_prefix=outfile_prefix,
              n_permutations=n_permutations_per_worker,
              permutation_start_idx=(n_permutations_per_worker * i_worker),
              random_seed=i_worker)
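# The nested algo1/algo2 loops above evaluate every ordered pair of
# algorithms (including each algorithm paired with itself), and write each
# pair's results under a '<algo1>_<algo2>/<task>/' prefix. A standalone
# sketch of that enumeration with hypothetical algorithm names:
def _sketch_algo_pairs(algos=('ppo', 'sac')):
  for algo1 in algos:
    for algo2 in algos:
      # Yields ppo_ppo, ppo_sac, sac_ppo, sac_sac for the hypothetical names.
      print('%s_%s' % (algo1, algo2))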

    def test_evaluate(self):
        evaluator = eval_metrics.Evaluator([
            metrics_online.StddevWithinRuns(),
            metrics_online.StddevWithinRuns()
        ])
        results = evaluator.evaluate(self.run_dirs)
        self.assertEqual(list(results.keys()), ['StddevWithinRuns'])
        self.assertTrue(np.greater(list(results.values()), 0.).all())

    def test_compute_metrics(self):
        curves = [
            np.array([[-1, 0, 1], [1., 1., 1.]]),
            np.array([[-1, 0, 1, 2], [2., 3., 4., 5.]])
        ]
        evaluator = eval_metrics.Evaluator(
            [metrics_online.StddevAcrossRuns(eval_points=[0, 1], baseline=1)])
        results = evaluator.compute_metrics(curves)
        np.testing.assert_allclose(results['StddevAcrossRuns'],
                                   [1.41421356237, 2.12132034356])
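        # The expected values above can be checked by hand, assuming
        # StddevAcrossRuns takes the sample standard deviation (ddof=1) of
        # the run values at each eval point and that baseline=1 leaves them
        # unscaled: at x=0 the curves take values 1. and 3., giving
        # sqrt(2) ~ 1.4142; at x=1 they take values 1. and 4., giving
        # sqrt(4.5) ~ 2.1213.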

    def test_evaluate_using_environment_steps(self):
        gin.bind_parameter('metrics_online.StddevWithinRuns.eval_points',
                           [2001])
        metric_instances = [
            metrics_online.StddevWithinRuns(),
            metrics_online.StddevWithinRuns()
        ]
        evaluator = eval_metrics.Evaluator(
            metric_instances, timepoint_variable='Metrics/EnvironmentSteps')
        results = evaluator.evaluate(self.run_dirs)
        self.assertEqual(list(results.keys()), ['StddevWithinRuns'])
        self.assertTrue(np.greater(list(results.values()), 0.).all())
def evaluate_metrics():
  """Evaluates metrics specified in the gin config."""
  # Parse gin config.
  gin.parse_config_files_and_bindings([p.gin_file], [])

  for algo in p.algos:
    for task in p.tasks:
      # Get the subdirectories corresponding to each run.
      summary_path = os.path.join(p.data_dir, algo, task)
      run_dirs = eval_metrics.get_run_dirs(summary_path, 'train', p.runs)

      # Evaluate metrics.
      outfile_prefix = os.path.join(p.metric_values_dir, algo, task) + '/'
      evaluator = eval_metrics.Evaluator(metrics=gin.REQUIRED)
      evaluator.write_metric_params(outfile_prefix)
      evaluator.evaluate(run_dirs=run_dirs, outfile_prefix=outfile_prefix)
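# evaluate_metrics() takes its metric configuration entirely from the gin
# file. A hypothetical minimal p.gin_file that binds Evaluator.metrics,
# mirroring the inline bindings used by the bootstrap and permutation
# variants above:
#
#   eval_metrics.Evaluator.metrics = [
#       @IqrAcrossRuns/singleton(), @LowerCVaROnAcross/singleton()]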
    def test_write_results(self):
        # Generate some results.
        curves = [
            np.array([[-1, 0, 1], [1., 1., 1.]]),
            np.array([[-1, 0, 1, 2], [2., 3., 4., 5.]])
        ]
        metric = metrics_online.StddevAcrossRuns(eval_points=[0, 1],
                                                 baseline=1)
        evaluator = eval_metrics.Evaluator([metric])
        results = evaluator.compute_metrics(curves)

        outfile_prefix = os.path.join(flags.FLAGS.test_tmpdir, 'results_')
        params_path = evaluator.write_metric_params(outfile_prefix)
        results_path = evaluator.write_results(results, outfile_prefix)

        # Test write_results.
        with open(results_path, 'r') as outfile:
            results_loaded = outfile.readline()
        results_dict = json.loads(results_loaded)
        expected = {'StddevAcrossRuns': [1.41421356237, 2.12132034356]}
        self.assertEqual(results_dict.keys(), expected.keys())
        np.testing.assert_allclose(expected['StddevAcrossRuns'],
                                   results_dict['StddevAcrossRuns'])

        # Test write_metric_params.
        with open(params_path, 'r') as outfile:
            params_loaded = outfile.readline()
        expected = json.dumps({
            'StddevAcrossRuns': {
                'eval_points': [0, 1],
                'lowpass_thresh': None,
                'baseline': 1,
                'window_size': None,
            }
        })
        self.assertJsonEqual(expected, params_loaded)

    def test_window_empty(self):
        curves = [np.array([[0, 2], [2, 3]])]
        evaluator = eval_metrics.Evaluator([metrics_online.StddevAcrossRuns()])
        self.assertRaises(ValueError, evaluator.compute_metrics, curves)

    def test_window_out_of_range(self):
        curves = [np.array([[0, 1], [1, 1]])]
        evaluator = eval_metrics.Evaluator([metrics_online.StddevAcrossRuns()])
        self.assertRaises(ValueError, evaluator.compute_metrics, curves)