Example #1
    def benchmark_pong_v0_at_3M(self):
        """Benchmarks to 3M Env steps.

        This is below the 12.5M train steps (50M frames) the paper runs to
        converge. Running 12.5M at the current throughput would take more than a
        week, and 1-2 days is the maximum duration for a remotely usable test.
        Reaching 3M only confirms we have not regressed at 3M; it does not
        guarantee convergence to 21 at 12.5M.
        """
        self._setup()
        output_dir = self._get_test_output_dir('pongAt3M')
        start_time_sec = time.time()
        dqn_train_eval_atari.train_eval(output_dir,
                                        eval_interval=10000,
                                        num_iterations=750000)
        wall_time_sec = time.time() - start_time_sec
        event_file = utils.find_event_log(os.path.join(output_dir, 'eval'))
        values, _ = utils.extract_event_log_values(
            event_file, 'AverageReturn/EnvironmentSteps')
        print('Values:{}'.format(values))
        # Min/max ranges are very wide so the test only hard-fails when something
        # is clearly broken; the system monitoring the results is responsible for
        # spotting anomalies.
        metric_3m = self.build_metric('average_return_at_env_step3000000',
                                      values[3000000],
                                      min_value=-14,
                                      max_value=21)

        self.report_benchmark(wall_time=wall_time_sec,
                              metrics=[metric_3m],
                              extras={})
Example #2
    def benchmark_halfcheetah_v2(self):
        """Benchmarks MuJoCo HalfCheetah to 3M steps."""
        self.setUp()
        output_dir = self._get_test_output_dir('halfcheetah_v2')
        start_time_sec = time.time()
        # TODO(b/172017027): Use halfcheetah gin config.
        strategy = tf.distribute.get_strategy()
        sac_train_eval.train_eval(output_dir,
                                  strategy,
                                  initial_collect_steps=10000,
                                  env_name='HalfCheetah-v2',
                                  eval_interval=50000,
                                  num_iterations=3000000)
        wall_time_sec = time.time() - start_time_sec
        event_file = utils.find_event_log(os.path.join(output_dir, 'eval'))
        values, _ = utils.extract_event_log_values(event_file,
                                                   'Metrics/AverageReturn')

        # Min/max ranges are very wide so the test only hard-fails when something
        # is clearly broken; the system monitoring the results is responsible for
        # spotting anomalies.
        metric_1m = self.build_metric('average_return_at_env_step1000000',
                                      values[1000000],
                                      min_value=800,
                                      max_value=16000)

        metric_3m = self.build_metric('average_return_at_env_step3000000',
                                      values[3000000],
                                      min_value=12000,
                                      max_value=16500)

        self.report_benchmark(wall_time=wall_time_sec,
                              metrics=[metric_1m, metric_3m],
                              extras={})
Example #3
def get_tag_names(
    root_dir: str,
    experiment_phase: str,
    experiment_dir: str,
) -> Set[str]:
    """
    When running an experiment through the experiment harness, TensorBoard event files are created
    with metrics recorded during the experiment. Each metric gets a unique tag name that has to be
    used to retrieve metric values with `get_metric_values`. TensorFlow records a large number of
    metrics, and this function can be used to get the tag names associated with each metric.

    :param root_dir: The root directory used by the experiment harness.
    :param experiment_phase: The phase of the experiment in which the trace was recorded.
    :param experiment_dir: An experiment id which has been recorded in the root directory.

    :return: A set of tag names.
    """
    summary_dir = os.path.join(root_dir, experiment_dir, experiment_phase)

    if not os.path.isdir(summary_dir):
        warn(f"{summary_dir} does not exist.")

    tags: Set[str] = set()
    event_file = find_event_log(summary_dir)
    for summary in summary_iterator(event_file):
        for value in summary.summary.value:
            tags.add(value.tag)

    return tags
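
The docstring above pairs `get_tag_names` with `get_metric_values`: you first discover the recorded tag names, then query one of them. A minimal usage sketch, assuming a hypothetical harness root of './experiments', an 'eval' phase, and an 'experiment_0' run (all placeholder names, not taken from the examples):

# List every tag recorded in the 'eval' phase of one experiment so a tag name
# can later be passed to get_metric_values.
tags = get_tag_names(
    root_dir="./experiments",       # hypothetical harness root directory
    experiment_phase="eval",        # hypothetical phase name
    experiment_dir="experiment_0",  # hypothetical experiment id
)
print(sorted(tags))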
Example #4
def get_metric_values(
    root_dir: str,
    experiment_phase: str,
    tag_name: str,
    experiment_dirs: Optional[List[str]] = None,
    return_time: bool = False,
) -> Dict[str, Dict[int, np.generic]]:
    """
    When running an experiment through the experiment harness, TensorBoard event files are created
    with metrics recorded during the experiment. Each experiment creates a separate directory
    under the `root_dir`. Metrics gathered during each phase of the experiment (e.g. training and
    policy evaluation) are stored in separate subdirectories of the experiment directory:

        `root_dir` / experiment id / `experiment_phase`

    This function will collect the values for a named metric gathered in a particular phase, from
    all of the listed experiments.

    :param root_dir: The root directory used by the experiment harness.
    :param experiment_phase: The phase of the experiment in which the trace was recorded.
    :param tag_name: The "tag" of the metric (usually defined by the metric object).
    :param experiment_dirs: A list of experiment ids which have been recorded in the root
                            directory. The default behaviour is to collect the specified metric
                            from all of the experiments in the root directory. This argument can be
                            used to specify a subset of these, if desired.
    :param return_time: If set to `True`, a dictionary of wallclock times for `tag_name`
        events is returned instead of metric values.

    :return: A dictionary mapping experiment ids to either metric values or wallclock times.
    """
    if not experiment_dirs:
        experiment_dirs_search_pattern = os.path.join(root_dir, "*/")
        experiment_dirs_full_path = tf.io.gfile.glob(experiment_dirs_search_pattern)
        experiment_dirs = [Path(full_path).name for full_path in experiment_dirs_full_path]

    all_values = {}
    for experiment_dir in experiment_dirs:
        summary_dir = os.path.join(root_dir, experiment_dir, experiment_phase)

        if not os.path.isdir(summary_dir):
            warn(f"{summary_dir} does not exist.")
            continue

        event_file = find_event_log(summary_dir)

        if return_time:
            # Use TF-Agents' extract_event_log_values to extract wallclock times.
            metric_values = extract_event_log_values(event_file, tag_name)[1]
        else:
            # Use the internal, simplified version for metric values.
            metric_values = _extract_event_log_values(event_file, tag_name)
        all_values[experiment_dir] = metric_values

    return all_values
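
Building on the layout described in the docstring (`root_dir` / experiment id / `experiment_phase`), here is a hedged usage sketch that collects 'Metrics/AverageReturn' from every experiment under a hypothetical './experiments' root and prints the value logged at the last recorded step of each run; the paths and phase name are illustrative:

# Returned structure: {experiment_id: {step: value}}.
returns = get_metric_values(
    root_dir="./experiments",          # hypothetical harness root directory
    experiment_phase="eval",           # hypothetical phase name
    tag_name="Metrics/AverageReturn",  # tag used in the benchmark examples above
)
for experiment_id, step_values in returns.items():
    last_step = max(step_values)
    print(f"{experiment_id}: AverageReturn at step {last_step} = {step_values[last_step]}")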
Example #5
def test_serialise_config_operational_config_tensorboard_events_file(
    experiment_setup, dummy_gin_global_config
):
    experiment_harness, _ = experiment_setup
    base_dir = experiment_harness.define_base_experiment_directory()
    experiment_harness.serialise_config(base_dir)

    event_file = find_event_log(base_dir)
    values = extract_event_log_values(event_file, GIN_CONFIG)

    assert "test_arg" in str(values[0][0])
Example #6
def test_serialise_config_empty_operational_config_tensorboard_events_file(experiment_setup):
    experiment_harness, _ = experiment_setup
    base_dir = experiment_harness.define_base_experiment_directory()
    experiment_harness.serialise_config(base_dir)

    assert not gin.operative_config_str()

    event_file = find_event_log(base_dir)
    values = extract_event_log_values(event_file, GIN_CONFIG)

    assert not values[0][0]
Example #7
    def _gather_data(self) -> Tuple[List[Dict[int, np.generic]], List[float]]:
        """Gather data from all of the logs and add to the data_collector list.

        Returns:
          Tuple of lists indexed by log file, e.g. data_collector[0] is all of the
          values found in the event log for the given event and walltimes[0] is the
          total time in minutes it took to reach end_step in that event log.
        """
        data_collector, walltimes = [], []
        for eventlog_dir in self.eventlog_dirs:
            event_file = utils.find_event_log(eventlog_dir)
            logging.info('Processing event file: %s', event_file)
            data, total_time = utils.extract_event_log_values(
                event_file, self.event_tag, self.end_step)
            walltimes.append(total_time)
            data_collector.append(data)
        return data_collector, walltimes
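
Since `_gather_data` returns one value dict per event log plus the matching walltimes, a natural next step is to aggregate across logs. A small sketch of one way to do that, assuming each per-log dict maps an environment step to the event value; `summarize_runs` and `end_step` are illustrative names, not part of the original class:

import numpy as np

def summarize_runs(data_collector, walltimes, end_step):
    """Returns the mean value at end_step across logs and the mean walltime in minutes."""
    final_values = [data[end_step] for data in data_collector if end_step in data]
    return np.mean(final_values), np.mean(walltimes)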
Example #8
    def run_benchmark(self, training_env, expected_min, expected_max):
        """Run benchmark for a given environment.

        In order to execute ~1M environment steps to match the paper, we run 489
        iterations (num_iterations=489), which results in 1,001,472 environment
        steps. Each iteration results in 320 training steps and 2,048 environment
        steps. Thus 489 * 2,048 = 1,001,472 environment steps and
        489 * 320 = 156,480 training steps.

        Args:
          training_env: Name of environment to test.
          expected_min: The min expected return value.
          expected_max: The max expected return value.
        """
        self.setUp()
        output_dir = self._get_test_output_dir(training_env)
        start_time_sec = time.time()
        bindings = [
            'schulman17.train_eval_lib.train_eval.env_name= "{}"'.format(
                training_env),
            'schulman17.train_eval_lib.train_eval.eval_episodes = 100'
        ]
        gin.parse_config(bindings)
        ppo_clip_train_eval.ppo_clip_train_eval(output_dir,
                                                eval_interval=10000,
                                                num_iterations=489)
        wall_time_sec = time.time() - start_time_sec
        event_file = utils.find_event_log(os.path.join(output_dir, 'eval'))
        values, _ = utils.extract_event_log_values(
            event_file, 'Metrics/AverageReturn/EnvironmentSteps')

        metric_1m = self.build_metric('average_return_at_env_step1000000',
                                      values[1001472],
                                      min_value=expected_min,
                                      max_value=expected_max)

        self.report_benchmark(wall_time=wall_time_sec,
                              metrics=[metric_1m],
                              extras={})
        self._tearDown()
Example #9
    def benchmark_halfcheetah_medium_v0(self):
        """Benchmarks MuJoCo HalfCheetah to 1M steps."""
        self.setUp()
        output_dir = self._get_test_output_dir('halfcheetah_medium_v0_02_eval')
        dataset_path = self.root_data_dir
        start_time_sec = time.time()
        gin.parse_config_file(
            'tf_agents/examples/cql_sac/kumar20/configs/mujoco_medium.gin')
        cql_sac_train_eval.train_eval(
            dataset_path=dataset_path,
            root_dir=output_dir,
            env_name='halfcheetah-medium-v0',
            num_gradient_updates=500000,  # Number of iterations.
            learner_iterations_per_call=500,
            data_shuffle_buffer_size=10000,
            data_num_shards=50,
            data_parallel_reads=500,
            data_prefetch=1000000,
            eval_interval=10000)
        wall_time_sec = time.time() - start_time_sec
        event_file = utils.find_event_log(os.path.join(output_dir, 'eval'))
        values, _ = utils.extract_event_log_values(event_file,
                                                   'Metrics/AverageReturn',
                                                   start_step=10000)

        # Min/max ranges are very wide so the test only hard-fails when something
        # is clearly broken; the system monitoring the results is responsible for
        # spotting anomalies. These numbers are based on the results we were
        # getting in MLCompass as of 04-NOV-2021. Results at 500k steps and 1M
        # steps are similar enough that running to 1M is not worth it.
        metric_500k = self.build_metric('average_return_at_env_step500000',
                                        values[500000],
                                        min_value=4400,
                                        max_value=5400)

        self.report_benchmark(wall_time=wall_time_sec,
                              metrics=[metric_500k],
                              extras={})
Example #10
    def test_no_eventlogs_found(self):
        """Tests that an exception is thrown if no log files are found."""
        with self.assertRaises(FileNotFoundError):
            utils.find_event_log(os.path.join(TEST_DATA, 'fake_path'))
Example #11
    def test_more_than_one_eventlog_per_dir(self):
        """Tests that an exception is thrown if more than one log file is found."""
        with self.assertRaises(AssertionError):
            utils.find_event_log(os.path.join(TEST_DATA, 'event_log_too_many'))
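
Examples #10 and #11 pin down the error behaviour of `utils.find_event_log`: it raises FileNotFoundError when no event log exists under the given directory and AssertionError when more than one is found. A minimal sketch consistent with those tests (the real TF-Agents utility may differ in its details):

import glob
import os

def find_event_log(eventlog_dir):
    """Returns the single TensorBoard event file found under eventlog_dir."""
    pattern = os.path.join(eventlog_dir, '**', 'events.out.tfevents.*')
    event_files = glob.glob(pattern, recursive=True)
    if not event_files:
        raise FileNotFoundError(f'No event log found in {eventlog_dir}.')
    assert len(event_files) == 1, (
        f'Expected exactly one event log in {eventlog_dir}, found {len(event_files)}.')
    return event_files[0]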