def test_alf_metrics(self, num_envs, learn_queue_cap, unroll_length,
                     actor_queue_cap, num_actors, num_iterations):
    episode_length = 5
    env_f = lambda: TFPyEnvironment(
        ValueUnittestEnv(batch_size=1, episode_length=episode_length))
    envs = [env_f() for _ in range(num_envs)]
    common.set_global_env(envs[0])
    alg = _create_ac_algorithm()
    driver = AsyncOffPolicyDriver(envs, alg, num_actors, unroll_length,
                                  learn_queue_cap, actor_queue_cap)
    driver.start()
    total_num_steps_ = 0
    for _ in range(num_iterations):
        total_num_steps_ += driver.run_async()
    driver.stop()

    total_num_steps = int(driver.get_metrics()[1].result())
    self.assertGreaterEqual(total_num_steps_, total_num_steps)

    # An exp is only put in the log queue after it's put in the learning
    # queue. So when we stop the driver (which will force all queues to
    # stop), some exps might be missing from the metric. Here we assert an
    # arbitrary lower bound of 2/5. The upper bound is due to the fact that
    # StepType.LAST is not recorded by the metric (episode_length == 5).
    self.assertLessEqual(total_num_steps, int(total_num_steps_ * 4 // 5))
    self.assertGreaterEqual(total_num_steps, int(total_num_steps_ * 2 // 5))

    average_reward = int(driver.get_metrics()[2].result())
    self.assertEqual(average_reward, episode_length - 1)

    metric_episode_length = int(driver.get_metrics()[3].result())
    self.assertEqual(metric_episode_length, episode_length)
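# For context, a minimal sketch of how a test like the one above might be
# wired up with absl's `parameterized` helper. The class name and the
# specific parameter tuples below are hypothetical illustrations, not values
# taken from the original suite.
from absl.testing import parameterized
import tensorflow as tf


class AsyncOffPolicyDriverTest(parameterized.TestCase, tf.test.TestCase):

    # Each tuple is (num_envs, learn_queue_cap, unroll_length,
    # actor_queue_cap, num_actors, num_iterations); values are illustrative.
    @parameterized.parameters((4, 1, 5, 1, 2, 10), (2, 2, 10, 2, 4, 5))
    def test_alf_metrics(self, num_envs, learn_queue_cap, unroll_length,
                         actor_queue_cap, num_actors, num_iterations):
        ...  # body as defined above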
def init_driver(self):
    # One environment already exists; create the remaining num_envs - 1.
    for _ in range(1, self._config.num_envs):
        self._create_environment()
    driver = AsyncOffPolicyDriver(
        envs=self._envs,
        algorithm=self._algorithm,
        use_rollout_state=self._config.use_rollout_state,
        unroll_length=self._unroll_length)
    return driver
def init_driver(self):
    driver = AsyncOffPolicyDriver(
        env_f=create_environment,
        algorithm=self._algorithm,
        unroll_length=self._unroll_length,
        debug_summaries=self._debug_summaries,
        summarize_grads_and_vars=self._summarize_grads_and_vars)
    driver.start()
    return driver
def _init_driver(self):
    assert self._random_seed is not None
    for i in range(1, self._config.num_envs):
        # [self._random_seed, self._random_seed + batch_size) has been used
        # in policy_trainer.py
        self._create_environment(
            random_seed=self._random_seed + i * common._env.batch_size)
    driver = AsyncOffPolicyDriver(
        envs=self._envs,
        algorithm=self._algorithm,
        unroll_length=self._unroll_length)
    return driver
def init_driver(self):
    envs = [self._env]
    for _ in range(1, self._config.num_envs):
        envs.append(create_environment())
    driver = AsyncOffPolicyDriver(
        envs=envs,
        algorithm=self._algorithm,
        use_rollout_state=self._config.use_rollout_state,
        unroll_length=self._unroll_length,
        debug_summaries=self._debug_summaries,
        summarize_grads_and_vars=self._summarize_grads_and_vars)
    return driver
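# Taken together, the init_driver variants above share one calling
# convention: the trainer builds the driver once, starts its actor/learner
# threads (one variant calls start() inside init_driver itself), pumps
# asynchronous unrolls, and finally stops all queues. A minimal sketch under
# that assumption; TrainerSkeleton and its attributes are hypothetical and
# only illustrate the expected call order.
class TrainerSkeleton:

    def train(self, num_iterations):
        driver = self.init_driver()
        driver.start()  # spawn the env/actor/learner threads
        try:
            for _ in range(num_iterations):
                # One round of asynchronous unrolls; returns the number of
                # environment steps taken.
                driver.run_async()
        finally:
            driver.stop()  # force all queues to stop so threads can exit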
def test_alf_metrics(self, num_envs, learn_queue_cap, unroll_length,
                     actor_queue_cap, num_actors, num_iterations):
    episode_length = 5
    env_f = lambda: TFPyEnvironment(
        ValueUnittestEnv(batch_size=1, episode_length=episode_length))
    alg = _create_ac_algorithm(env_f())
    driver = AsyncOffPolicyDriver(env_f, alg, num_envs, num_actors,
                                  unroll_length, learn_queue_cap,
                                  actor_queue_cap)
    driver.start()
    total_num_steps_ = 0
    for _ in range(num_iterations):
        total_num_steps_ += driver.run_async()
    driver.stop()

    total_num_steps = int(driver.get_metrics()[1].result())
    self.assertGreaterEqual(total_num_steps_, total_num_steps)
    self.assertGreaterEqual(
        total_num_steps,
        # multiply by 2/3 because 1/3 of steps are StepType.LAST
        total_num_steps_ * 2 // 3)

    average_reward = int(driver.get_metrics()[2].result())
    self.assertEqual(average_reward, episode_length - 1)

    metric_episode_length = int(driver.get_metrics()[3].result())
    self.assertEqual(metric_episode_length, episode_length)
def test_off_policy_algorithm(self, algorithm_ctor, use_rollout_state,
                              sync_driver):
    logging.info("{} {}".format(algorithm_ctor.__name__, sync_driver))

    batch_size = 128
    if use_rollout_state:
        steps_per_episode = 5
        mini_batch_length = 8
        unroll_length = 8
        env_class = RNNPolicyUnittestEnv
    else:
        steps_per_episode = 12
        mini_batch_length = 2
        unroll_length = 12
        env_class = PolicyUnittestEnv
    env = TFPyEnvironment(
        env_class(
            batch_size, steps_per_episode,
            action_type=ActionType.Continuous))
    eval_env = TFPyEnvironment(
        env_class(
            batch_size, steps_per_episode,
            action_type=ActionType.Continuous))

    common.set_global_env(env)
    algorithm = algorithm_ctor()
    algorithm.set_summary_settings(summarize_grads_and_vars=True)
    algorithm.use_rollout_state = use_rollout_state

    if sync_driver:
        driver = SyncOffPolicyDriver(env, algorithm)
    else:
        driver = AsyncOffPolicyDriver([env],
                                      algorithm,
                                      num_actor_queues=1,
                                      unroll_length=unroll_length,
                                      learn_queue_cap=1,
                                      actor_queue_cap=1)
    eval_driver = OnPolicyDriver(eval_env, algorithm, training=False)

    eval_env.reset()
    driver.start()
    if sync_driver:
        time_step = driver.get_initial_time_step()
        policy_state = driver.get_initial_policy_state()
        for i in range(5):
            time_step, policy_state = driver.run(
                max_num_steps=batch_size * steps_per_episode,
                time_step=time_step,
                policy_state=policy_state)

    for i in range(500):
        if sync_driver:
            time_step, policy_state = driver.run(
                max_num_steps=batch_size * mini_batch_length * 2,
                time_step=time_step,
                policy_state=policy_state)
            whole_replay_buffer_training = False
            clear_replay_buffer = False
        else:
            driver.run_async()
            whole_replay_buffer_training = True
            clear_replay_buffer = True
        driver.algorithm.train(
            mini_batch_size=128,
            mini_batch_length=mini_batch_length,
            whole_replay_buffer_training=whole_replay_buffer_training,
            clear_replay_buffer=clear_replay_buffer)
        eval_env.reset()
        eval_time_step, _ = eval_driver.run(
            max_num_steps=(steps_per_episode - 1) * batch_size)
        logging.log_every_n_seconds(
            logging.INFO,
            "%d reward=%f" % (i, float(tf.reduce_mean(eval_time_step.reward))),
            n_seconds=1)
    driver.stop()

    self.assertAlmostEqual(
        1.0, float(tf.reduce_mean(eval_time_step.reward)), delta=2e-1)
def test_off_policy_algorithm(self, algorithm_ctor, use_rollout_state,
                              sync_driver):
    logging.info("{} {}".format(algorithm_ctor.__name__, sync_driver))

    batch_size = 128
    if use_rollout_state:
        steps_per_episode = 5
        mini_batch_length = 8
        unroll_length = 8
        env_class = RNNPolicyUnittestEnv
    else:
        steps_per_episode = 12
        mini_batch_length = 2
        unroll_length = 12
        env_class = PolicyUnittestEnv
    env = TFPyEnvironment(
        env_class(
            batch_size, steps_per_episode,
            action_type=ActionType.Continuous))
    eval_env = TFPyEnvironment(
        env_class(
            batch_size, steps_per_episode,
            action_type=ActionType.Continuous))

    algorithm = algorithm_ctor(env)
    algorithm.use_rollout_state = use_rollout_state

    if sync_driver:
        driver = SyncOffPolicyDriver(
            env,
            algorithm,
            use_rollout_state=use_rollout_state,
            debug_summaries=True,
            summarize_grads_and_vars=True)
    else:
        driver = AsyncOffPolicyDriver(
            [env],
            algorithm,
            use_rollout_state=algorithm.use_rollout_state,
            num_actor_queues=1,
            unroll_length=unroll_length,
            learn_queue_cap=1,
            actor_queue_cap=1,
            debug_summaries=True,
            summarize_grads_and_vars=True)
    replayer = driver.exp_replayer
    eval_driver = OnPolicyDriver(
        eval_env, algorithm, training=False, greedy_predict=True)

    eval_env.reset()
    driver.start()
    if sync_driver:
        time_step = driver.get_initial_time_step()
        policy_state = driver.get_initial_policy_state()
        for i in range(5):
            time_step, policy_state = driver.run(
                max_num_steps=batch_size * steps_per_episode,
                time_step=time_step,
                policy_state=policy_state)

    for i in range(500):
        if sync_driver:
            time_step, policy_state = driver.run(
                max_num_steps=batch_size * mini_batch_length * 2,
                time_step=time_step,
                policy_state=policy_state)
            experience, _ = replayer.replay(
                sample_batch_size=128,
                mini_batch_length=mini_batch_length)
        else:
            driver.run_async()
            experience = replayer.replay_all()
        driver.train(
            experience,
            mini_batch_size=128,
            mini_batch_length=mini_batch_length)
        eval_env.reset()
        eval_time_step, _ = eval_driver.run(
            max_num_steps=(steps_per_episode - 1) * batch_size)
        logging.info("%d reward=%f", i,
                     float(tf.reduce_mean(eval_time_step.reward)))
    driver.stop()

    self.assertAlmostEqual(
        1.0, float(tf.reduce_mean(eval_time_step.reward)), delta=2e-1)
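# A test module of this kind typically ends with a main guard so the cases
# can be run directly; a minimal sketch, assuming absl logging and
# TensorFlow's test runner (neither is shown in the excerpts above).
from absl import logging
import tensorflow as tf

if __name__ == '__main__':
    logging.set_verbosity(logging.INFO)
    tf.test.main()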