def run_exp(env,
            policy,
            hp,
            steps,
            dir_name,
            evaluate,
            seed,
            eval_interval,
            log_interval,
            save_interval,
            initial_exploration_steps):
    """Run a single training procedure.

    Parameters
    ----------
    env : str or gym.Env
        the training/testing environment
    policy : type [ hbaselines.base_policies.Policy ]
        the policy class to use
    hp : dict
        additional algorithm hyper-parameters
    steps : int
        total number of training steps
    dir_name : str
        the location the results files are meant to be stored
    evaluate : bool
        whether to include an evaluation environment
    seed : int
        the random seed for numpy, tensorflow, and random
    eval_interval : int
        number of simulation steps in the training environment before an
        evaluation is performed
    log_interval : int
        the number of training steps before logging training results
    save_interval : int
        number of simulation steps in the training environment before the
        model is saved
    initial_exploration_steps : int
        number of timesteps that the policy is run before training to
        initialize the replay buffer with samples
    """
    eval_env = env if evaluate else None

    alg = RLAlgorithm(
        policy=policy,
        env=env,
        eval_env=eval_env,
        **hp
    )

    # Perform training.
    alg.learn(
        total_steps=steps,
        log_dir=dir_name,
        log_interval=log_interval,
        eval_interval=eval_interval,
        save_interval=save_interval,
        initial_exploration_steps=initial_exploration_steps,
        seed=seed,
    )
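# The sketch below illustrates one way run_exp might be invoked. It is a
# minimal example, not taken from the source: the environment name, the empty
# hyper-parameter dict, and every numeric setting are assumptions, and
# FeedForwardPolicy is presumed to be importable in this scope.
def example_run_exp():
    """Hypothetical driver showing the expected argument types of run_exp."""
    run_exp(
        env='MountainCarContinuous-v0',  # str or gym.Env
        policy=FeedForwardPolicy,        # policy class, not an instance
        hp={},                           # fall back to default hyper-parameters
        steps=10000,                     # total training steps
        dir_name='results/example',      # where result files are stored
        evaluate=False,                  # skip the evaluation environment
        seed=1,
        eval_interval=5000,              # steps between evaluations (moot with evaluate=False)
        log_interval=2000,
        save_interval=5000,
        initial_exploration_steps=1000,  # warm-up samples for the replay buffer
    )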
def test_setup_model_feedforward(self):
    """Validate the model setup process for FeedForwardPolicy."""
    # Create the algorithm object.
    policy_params = self.init_parameters.copy()
    policy_params['policy'] = FeedForwardPolicy
    policy_params['_init_setup_model'] = True
    alg = RLAlgorithm(**policy_params)

    # Check the policy_kwargs term.
    policy_kwargs = FEEDFORWARD_PARAMS.copy()
    policy_kwargs.update(TD3_PARAMS)
    policy_kwargs['verbose'] = self.init_parameters['verbose']
    policy_kwargs['num_envs'] = self.init_parameters['num_envs']
    self.assertDictEqual(alg.policy_kwargs, policy_kwargs)

    with alg.graph.as_default():
        expected_vars = sorted([var.name for var in get_trainable_vars()])

    # Check that all trainable variables have been created in the
    # TensorFlow graph.
    self.assertListEqual(
        expected_vars,
        ['model/pi/fc0/bias:0',
         'model/pi/fc0/kernel:0',
         'model/pi/fc1/bias:0',
         'model/pi/fc1/kernel:0',
         'model/pi/output/bias:0',
         'model/pi/output/kernel:0',
         'model/qf_0/fc0/bias:0',
         'model/qf_0/fc0/kernel:0',
         'model/qf_0/fc1/bias:0',
         'model/qf_0/fc1/kernel:0',
         'model/qf_0/qf_output/bias:0',
         'model/qf_0/qf_output/kernel:0',
         'model/qf_1/fc0/bias:0',
         'model/qf_1/fc0/kernel:0',
         'model/qf_1/fc1/bias:0',
         'model/qf_1/fc1/kernel:0',
         'model/qf_1/qf_output/bias:0',
         'model/qf_1/qf_output/kernel:0',
         'target/pi/fc0/bias:0',
         'target/pi/fc0/kernel:0',
         'target/pi/fc1/bias:0',
         'target/pi/fc1/kernel:0',
         'target/pi/output/bias:0',
         'target/pi/output/kernel:0',
         'target/qf_0/fc0/bias:0',
         'target/qf_0/fc0/kernel:0',
         'target/qf_0/fc1/bias:0',
         'target/qf_0/fc1/kernel:0',
         'target/qf_0/qf_output/bias:0',
         'target/qf_0/qf_output/kernel:0',
         'target/qf_1/fc0/bias:0',
         'target/qf_1/fc0/kernel:0',
         'target/qf_1/fc1/bias:0',
         'target/qf_1/fc1/kernel:0',
         'target/qf_1/qf_output/bias:0',
         'target/qf_1/qf_output/kernel:0']
    )
def test_evaluate(self):
    """Validate the functionality of the _evaluate method.

    This is done for the following cases:

    1. policy = FeedForwardPolicy
    2. policy = GoalConditionedPolicy
    """
    # Set the random seeds.
    random.seed(0)
    np.random.seed(0)
    tf.compat.v1.set_random_seed(0)

    # =================================================================== #
    #                             test case 1                             #
    # =================================================================== #

    # Create the algorithm object.
    policy_params = self.init_parameters.copy()
    policy_params['policy'] = FeedForwardPolicy
    policy_params['eval_env'] = 'MountainCarContinuous-v0'
    policy_params['nb_eval_episodes'] = 1
    policy_params['verbose'] = 2
    policy_params['_init_setup_model'] = True
    alg = RLAlgorithm(**policy_params)

    # Run the _evaluate operation.
    ep_rewards, ep_successes, info = alg._evaluate(alg.eval_env)

    # Test the output from the operation.
    self.assertEqual(len(ep_rewards), 1)
    self.assertEqual(len(ep_successes), 0)
    self.assertEqual(list(info.keys()), ['initial', 'final', 'average'])

    # Clear memory.
    del alg

    # =================================================================== #
    #                             test case 2                             #
    # =================================================================== #

    # Create the algorithm object.
    policy_params = self.init_parameters.copy()
    policy_params['policy'] = GoalConditionedPolicy
    policy_params['eval_env'] = 'MountainCarContinuous-v0'
    policy_params['nb_eval_episodes'] = 1
    policy_params['verbose'] = 2
    policy_params['_init_setup_model'] = True
    alg = RLAlgorithm(**policy_params)

    # Run the _evaluate operation.
    ep_rewards, ep_successes, info = alg._evaluate(alg.eval_env)

    # Test the output from the operation.
    self.assertEqual(len(ep_rewards), 1)
    self.assertEqual(len(ep_successes), 0)
    self.assertEqual(list(info.keys()), ['initial', 'final', 'average'])

    # Clear memory.
    del alg
def test_log_eval(self):
    """Validate the functionality of the _log_eval method."""
    # Create the algorithm object.
    policy_params = self.init_parameters.copy()
    policy_params['policy'] = GoalConditionedPolicy
    policy_params['_init_setup_model'] = False
    alg = RLAlgorithm(**policy_params)

    # Test for one evaluation environment.
    rewards = [0, 1, 2]
    successes = [True, False, False]
    info = {"test": 5}
    alg._log_eval(
        file_path="test_eval.csv",
        start_time=0,
        rewards=rewards,
        successes=successes,
        info=info,
    )

    # Check that the file was generated.
    self.assertTrue(os.path.exists('test_eval_0.csv'))

    # Import the stored data. The file is opened in a context manager so
    # that the handle is closed once the rows have been read.
    with open('test_eval_0.csv', 'r') as f:
        reader = csv.DictReader(f)
        results = {"successes": [], "rewards": [], "test": []}
        for line in reader:
            results["successes"].append(float(line["success_rate"]))
            results["rewards"].append(float(line["average_return"]))
            results["test"].append(float(line["test"]))

    # Test that the data matches expected values.
    self.assertListEqual(results["rewards"], [1])
    self.assertListEqual(results["successes"], [1 / 3])
    self.assertListEqual(results["test"], [5])

    # Delete generated files.
    os.remove('test_eval_0.csv')

    # Test for one evaluation environment with no successes.
    successes = []
    alg._log_eval(
        file_path="test_eval.csv",
        start_time=0,
        rewards=rewards,
        successes=successes,
        info=info,
    )

    # Check that the file was generated.
    self.assertTrue(os.path.exists('test_eval_0.csv'))

    # Import the stored data.
    with open('test_eval_0.csv', 'r') as f:
        reader = csv.DictReader(f)
        results = {"successes": []}
        for line in reader:
            results["successes"].append(float(line["success_rate"]))

    # Test that the successes are all zero.
    self.assertListEqual(results["successes"], [0])

    # Delete generated files.
    os.remove('test_eval_0.csv')
def test_learn_initial_exploration_steps(self):
    """Test the initial_exploration_steps parameter in the learn method.

    This is done for the following cases:

    1. initial_exploration_steps = 0
    2. initial_exploration_steps = 100
    """
    # =================================================================== #
    #                             test case 1                             #
    # =================================================================== #

    # Create the algorithm object.
    policy_params = self.init_parameters.copy()
    policy_params['policy'] = FeedForwardPolicy
    policy_params['_init_setup_model'] = True
    alg = RLAlgorithm(**policy_params)

    # Run the learn operation for zero exploration steps.
    alg.learn(0, log_dir='results', initial_exploration_steps=0)

    # Check the size of the replay buffer.
    self.assertEqual(len(alg.policy_tf.replay_buffer), 0)

    # Clear memory.
    del alg
    shutil.rmtree('results')

    # =================================================================== #
    #                             test case 2                             #
    # =================================================================== #

    # Create the algorithm object.
    policy_params = self.init_parameters.copy()
    policy_params['policy'] = FeedForwardPolicy
    policy_params['_init_setup_model'] = True
    alg = RLAlgorithm(**policy_params)

    # Run the learn operation for 100 exploration steps.
    alg.learn(0, log_dir='results', initial_exploration_steps=100)

    # Check the size of the replay buffer.
    self.assertEqual(len(alg.policy_tf.replay_buffer), 100)

    # Clear memory.
    del alg
    shutil.rmtree('results')
def test_init(self):
    """Ensure that the parameters at init are as expected."""
    # Create the algorithm object.
    policy_params = self.init_parameters.copy()
    policy_params['_init_setup_model'] = False
    alg = RLAlgorithm(**policy_params)

    # Test the attribute values.
    self.assertEqual(alg.policy, self.init_parameters['policy'])
    self.assertEqual(alg.eval_env, self.init_parameters['eval_env'])
    self.assertEqual(alg.nb_train_steps,
                     self.init_parameters['nb_train_steps'])
    self.assertEqual(alg.nb_rollout_steps,
                     self.init_parameters['nb_rollout_steps'])
    self.assertEqual(alg.nb_eval_episodes,
                     self.init_parameters['nb_eval_episodes'])
    self.assertEqual(alg.reward_scale,
                     self.init_parameters['reward_scale'])
    self.assertEqual(alg.render, self.init_parameters['render'])
    self.assertEqual(alg.render_eval, self.init_parameters['render_eval'])
    self.assertEqual(alg.verbose, self.init_parameters['verbose'])
def test_learn_init(self):
    """Test the non-loop components of the `learn` method."""
    # Create the algorithm object.
    policy_params = self.init_parameters.copy()
    policy_params['policy'] = GoalConditionedPolicy
    policy_params['_init_setup_model'] = True
    alg = RLAlgorithm(**policy_params)

    # Run the learn operation for zero steps.
    alg.learn(0, log_dir='results', initial_exploration_steps=0)
    self.assertEqual(alg.episodes, 0)
    self.assertEqual(alg.total_steps, 0)
    self.assertEqual(alg.epoch, 0)
    self.assertEqual(len(alg.episode_rew_history), 0)
    self.assertEqual(alg.epoch_episodes, 0)
    self.assertEqual(len(alg.epoch_episode_rewards), 0)
    self.assertEqual(len(alg.epoch_episode_steps), 0)
    shutil.rmtree('results')

    # Test the seeds.
    alg.learn(0, log_dir='results', seed=1, initial_exploration_steps=0)
    self.assertEqual(np.random.sample(), 0.417022004702574)
    self.assertEqual(random.uniform(0, 1), 0.13436424411240122)
    shutil.rmtree('results')
def main(args):
    """Execute multiple rollouts of a pre-trained policy."""
    flags = parse_options(args)

    # Run assertions.
    assert not (flags.no_render and flags.save_video), \
        "If saving the rendering, no_render cannot be set to True."

    # Get the hyperparameters.
    env_name, policy, hp, seed = get_hyperparameters_from_dir(flags.dir_name)
    hp['num_envs'] = 1
    hp['render_eval'] = not flags.no_render  # to visualize the policy
    multiagent = env_name.startswith("multiagent")

    # Create the algorithm object. We will be using the eval environment in
    # this object to perform the rollout.
    alg = RLAlgorithm(policy=policy, env=env_name, eval_env=env_name, **hp)

    # Set up the seed value.
    if not flags.random_seed:
        random.seed(seed)
        np.random.seed(seed)
        tf.compat.v1.set_random_seed(seed)

    # Get the checkpoint number.
    if flags.ckpt_num is None:
        filenames = os.listdir(os.path.join(flags.dir_name, "checkpoints"))
        metafiles = [f[:-5] for f in filenames if f[-5:] == ".meta"]
        metanum = [int(f.split("-")[-1]) for f in metafiles]
        ckpt_num = max(metanum)
    else:
        ckpt_num = flags.ckpt_num

    # Location of the checkpoint.
    ckpt = os.path.join(flags.dir_name, "checkpoints/itr-{}".format(ckpt_num))

    # Restore the previous checkpoint.
    alg.saver = tf.compat.v1.train.Saver(alg.trainable_vars)
    alg.load(ckpt)

    # Some variables that will be needed when replaying the rollout.
    policy = alg.policy_tf
    env = alg.eval_env

    # Perform the evaluation procedure.
    episode_rewards = []

    # Add an emission path to Flow environments.
    if env_name in FLOW_ENV_NAMES:
        sim_params = deepcopy(env.wrapped_env.sim_params)
        sim_params.emission_path = "./flow_results"
        env.wrapped_env.restart_simulation(
            sim_params, render=not flags.no_render)

    if not isinstance(env, list):
        env_list = [env]
    else:
        env_list = env

    for env_num, env in enumerate(env_list):
        for episode_num in range(flags.num_rollouts):
            if not flags.no_render and env_name not in FLOW_ENV_NAMES:
                out = FFmpegWriter("{}_{}_{}.mp4".format(
                    flags.video, env_num, episode_num))
            else:
                out = None

            obs, total_reward = env.reset(), 0

            while True:
                context = [env.current_context] \
                    if hasattr(env, "current_context") else None

                if multiagent:
                    processed_obs = {
                        key: np.array([obs[key]]) for key in obs.keys()
                    }
                else:
                    processed_obs = np.asarray([obs])

                action = policy.get_action(
                    obs=processed_obs,
                    context=context,
                    apply_noise=False,
                    random_actions=False,
                )

                # Flatten the actions to pass to step.
                if multiagent:
                    action = {key: action[key][0] for key in action.keys()}
                else:
                    action = action[0]

                # Visualize the sub-goals of the hierarchical policy.
                if hasattr(policy, "_meta_action") \
                        and policy._meta_action is not None \
                        and hasattr(env, "set_goal"):
                    goal = policy._meta_action[0][0] + (
                        obs[policy.goal_indices]
                        if policy.relative_goals else 0)
                    env.set_goal(goal)

                new_obs, reward, done, _ = env.step(action)

                if not flags.no_render:
                    if flags.save_video:
                        if alg.env_name == "AntGather":
                            out.writeFrame(env.render(mode='rgb_array'))
                        else:
                            out.writeFrame(env.render(
                                mode='rgb_array', height=1024, width=1024))
                    else:
                        env.render()

                if multiagent:
                    if (isinstance(done, dict) and done["__all__"]) or done:
                        break
                    obs0_transition = {
                        key: np.array(obs[key]) for key in obs.keys()
                    }
                    obs1_transition = {
                        key: np.array(new_obs[key]) for key in new_obs.keys()
                    }
                    total_reward += sum(reward[key] for key in reward.keys())
                else:
                    if done:
                        break
                    obs0_transition = obs
                    obs1_transition = new_obs
                    total_reward += reward

                policy.store_transition(
                    obs0=obs0_transition,
                    context0=context[0] if context is not None else None,
                    action=action,
                    reward=reward,
                    obs1=obs1_transition,
                    context1=context[0] if context is not None else None,
                    done=done,
                    is_final_step=done,
                    evaluate=True,
                )

                obs = new_obs

            # Print total returns from a given episode.
            episode_rewards.append(total_reward)
            print("Round {}, return: {}".format(episode_num, total_reward))

            # Save the video.
            if not flags.no_render and env_name not in FLOW_ENV_NAMES \
                    and flags.save_video:
                out.close()

    # Print total statistics.
    print("Average, std return: {}, {}".format(
        np.mean(episode_rewards), np.std(episode_rewards)))
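# A hedged usage sketch for main. The flag names below mirror the attributes
# accessed on `flags` inside the function (dir_name, num_rollouts, ckpt_num,
# no_render, save_video, video, random_seed), but the script name and the
# exact command-line syntax depend on how parse_options defines them, so
# treat this invocation as an assumption rather than the documented CLI:
#
#     python run_eval.py results/MountainCarContinuous-v0/1 \
#         --num_rollouts 5 --ckpt_num 2000000 --no_render
#
# Equivalently, main can be driven directly with a raw argument list:
#
#     main(["results/MountainCarContinuous-v0/1", "--num_rollouts", "5"])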
def test_setup_model_goal_conditioned(self):
    """Validate the model setup process for GoalConditionedPolicy."""
    # Create the algorithm object.
    policy_params = self.init_parameters.copy()
    policy_params['policy'] = GoalConditionedPolicy
    policy_params['_init_setup_model'] = True
    alg = RLAlgorithm(**policy_params)

    # Check the policy_kwargs term.
    policy_kwargs = GOAL_CONDITIONED_PARAMS.copy()
    policy_kwargs.update(TD3_PARAMS)
    policy_kwargs['verbose'] = self.init_parameters['verbose']
    policy_kwargs['env_name'] = self.init_parameters['env']
    policy_kwargs['num_envs'] = self.init_parameters['num_envs']
    policy_kwargs['total_steps'] = self.init_parameters['total_steps']
    self.assertDictEqual(alg.policy_kwargs, policy_kwargs)

    with alg.graph.as_default():
        expected_vars = sorted([var.name for var in get_trainable_vars()])

    # Check that all trainable variables have been created in the
    # TensorFlow graph.
    self.assertListEqual(
        expected_vars,
        ['level_0/model/pi/fc0/bias:0',
         'level_0/model/pi/fc0/kernel:0',
         'level_0/model/pi/fc1/bias:0',
         'level_0/model/pi/fc1/kernel:0',
         'level_0/model/pi/output/bias:0',
         'level_0/model/pi/output/kernel:0',
         'level_0/model/qf_0/fc0/bias:0',
         'level_0/model/qf_0/fc0/kernel:0',
         'level_0/model/qf_0/fc1/bias:0',
         'level_0/model/qf_0/fc1/kernel:0',
         'level_0/model/qf_0/qf_output/bias:0',
         'level_0/model/qf_0/qf_output/kernel:0',
         'level_0/model/qf_1/fc0/bias:0',
         'level_0/model/qf_1/fc0/kernel:0',
         'level_0/model/qf_1/fc1/bias:0',
         'level_0/model/qf_1/fc1/kernel:0',
         'level_0/model/qf_1/qf_output/bias:0',
         'level_0/model/qf_1/qf_output/kernel:0',
         'level_0/target/pi/fc0/bias:0',
         'level_0/target/pi/fc0/kernel:0',
         'level_0/target/pi/fc1/bias:0',
         'level_0/target/pi/fc1/kernel:0',
         'level_0/target/pi/output/bias:0',
         'level_0/target/pi/output/kernel:0',
         'level_0/target/qf_0/fc0/bias:0',
         'level_0/target/qf_0/fc0/kernel:0',
         'level_0/target/qf_0/fc1/bias:0',
         'level_0/target/qf_0/fc1/kernel:0',
         'level_0/target/qf_0/qf_output/bias:0',
         'level_0/target/qf_0/qf_output/kernel:0',
         'level_0/target/qf_1/fc0/bias:0',
         'level_0/target/qf_1/fc0/kernel:0',
         'level_0/target/qf_1/fc1/bias:0',
         'level_0/target/qf_1/fc1/kernel:0',
         'level_0/target/qf_1/qf_output/bias:0',
         'level_0/target/qf_1/qf_output/kernel:0',
         'level_1/model/pi/fc0/bias:0',
         'level_1/model/pi/fc0/kernel:0',
         'level_1/model/pi/fc1/bias:0',
         'level_1/model/pi/fc1/kernel:0',
         'level_1/model/pi/output/bias:0',
         'level_1/model/pi/output/kernel:0',
         'level_1/model/qf_0/fc0/bias:0',
         'level_1/model/qf_0/fc0/kernel:0',
         'level_1/model/qf_0/fc1/bias:0',
         'level_1/model/qf_0/fc1/kernel:0',
         'level_1/model/qf_0/qf_output/bias:0',
         'level_1/model/qf_0/qf_output/kernel:0',
         'level_1/model/qf_1/fc0/bias:0',
         'level_1/model/qf_1/fc0/kernel:0',
         'level_1/model/qf_1/fc1/bias:0',
         'level_1/model/qf_1/fc1/kernel:0',
         'level_1/model/qf_1/qf_output/bias:0',
         'level_1/model/qf_1/qf_output/kernel:0',
         'level_1/target/pi/fc0/bias:0',
         'level_1/target/pi/fc0/kernel:0',
         'level_1/target/pi/fc1/bias:0',
         'level_1/target/pi/fc1/kernel:0',
         'level_1/target/pi/output/bias:0',
         'level_1/target/pi/output/kernel:0',
         'level_1/target/qf_0/fc0/bias:0',
         'level_1/target/qf_0/fc0/kernel:0',
         'level_1/target/qf_0/fc1/bias:0',
         'level_1/target/qf_0/fc1/kernel:0',
         'level_1/target/qf_0/qf_output/bias:0',
         'level_1/target/qf_0/qf_output/kernel:0',
         'level_1/target/qf_1/fc0/bias:0',
         'level_1/target/qf_1/fc0/kernel:0',
         'level_1/target/qf_1/fc1/bias:0',
         'level_1/target/qf_1/fc1/kernel:0',
         'level_1/target/qf_1/qf_output/bias:0',
         'level_1/target/qf_1/qf_output/kernel:0']
    )