def run_exp(env,
            policy,
            hp,
            steps,
            dir_name,
            evaluate,
            seed,
            eval_interval,
            log_interval,
            save_interval,
            initial_exploration_steps):
    """Run a single training procedure.

    Parameters
    ----------
    env : str or gym.Env
        the training/testing environment
    policy : type [ hbaselines.base_policies.ActorCriticPolicy ]
        the policy class to use
    hp : dict
        additional algorithm hyper-parameters
    steps : int
        total number of training steps
    dir_name : str
        the location the results files are meant to be stored
    evaluate : bool
        whether to include an evaluation environment
    seed : int
        specifies the random seed for numpy, tensorflow, and random
    eval_interval : int
        number of simulation steps in the training environment before an
        evaluation is performed
    log_interval : int
        the number of training steps before logging training results
    save_interval : int
        number of simulation steps in the training environment before the
        model is saved
    initial_exploration_steps : int
        number of timesteps that the policy is run before training to
        initialize the replay buffer with samples
    """
    eval_env = env if evaluate else None

    alg = OffPolicyRLAlgorithm(
        policy=policy,
        env=env,
        eval_env=eval_env,
        **hp
    )

    # perform training
    alg.learn(
        total_timesteps=steps,
        log_dir=dir_name,
        log_interval=log_interval,
        eval_interval=eval_interval,
        save_interval=save_interval,
        initial_exploration_steps=initial_exploration_steps,
        seed=seed,
    )
def run_exp(env,
            hp,
            steps,
            dir_name,
            evaluate,
            seed,
            eval_interval,
            log_interval,
            save_interval):
    """Run a single training procedure.

    Parameters
    ----------
    env : str or gym.Env
        the training/testing environment
    hp : dict
        additional algorithm hyper-parameters
    steps : int
        total number of training steps
    dir_name : str
        the location the results files are meant to be stored
    evaluate : bool
        whether to include an evaluation environment
    seed : int
        specifies the random seed for numpy, tensorflow, and random
    eval_interval : int
        number of simulation steps in the training environment before an
        evaluation is performed
    log_interval : int
        the number of training steps before logging training results
    save_interval : int
        number of simulation steps in the training environment before the
        model is saved
    """
    eval_env = env if evaluate else None

    alg = OffPolicyRLAlgorithm(
        policy=FeedForwardPolicy,
        env=env,
        eval_env=eval_env,
        **hp
    )

    # perform training
    alg.learn(
        total_timesteps=steps,
        log_dir=dir_name,
        log_interval=log_interval,
        eval_interval=eval_interval,
        save_interval=save_interval,
        seed=seed,
    )
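# Example usage of `run_exp` above (a minimal sketch; the environment name,
# hyper-parameter values, and directory below are illustrative assumptions,
# not library defaults):
#
#     run_exp(
#         env="MountainCarContinuous-v0",
#         hp={"nb_train_steps": 1, "nb_rollout_steps": 1},
#         steps=1000,
#         dir_name="results/example",
#         evaluate=False,
#         seed=1,
#         eval_interval=500,
#         log_interval=100,
#         save_interval=500,
#     )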
def test_setup_model_feedforward(self):
    # Create the algorithm object.
    policy_params = self.init_parameters.copy()
    policy_params['policy'] = FeedForwardPolicy
    policy_params['_init_setup_model'] = True
    alg = OffPolicyRLAlgorithm(**policy_params)

    # check the policy_kwargs term
    policy_kwargs = FEEDFORWARD_PARAMS.copy()
    policy_kwargs.update(TD3_PARAMS)
    policy_kwargs['verbose'] = self.init_parameters['verbose']
    self.assertDictEqual(alg.policy_kwargs, policy_kwargs)

    with alg.graph.as_default():
        expected_vars = sorted([var.name for var in get_trainable_vars()])

    # Check that all trainable variables have been created in the
    # TensorFlow graph.
    self.assertListEqual(
        expected_vars,
        ['model/pi/fc0/bias:0',
         'model/pi/fc0/kernel:0',
         'model/pi/fc1/bias:0',
         'model/pi/fc1/kernel:0',
         'model/pi/output/bias:0',
         'model/pi/output/kernel:0',
         'model/qf_0/fc0/bias:0',
         'model/qf_0/fc0/kernel:0',
         'model/qf_0/fc1/bias:0',
         'model/qf_0/fc1/kernel:0',
         'model/qf_0/qf_output/bias:0',
         'model/qf_0/qf_output/kernel:0',
         'model/qf_1/fc0/bias:0',
         'model/qf_1/fc0/kernel:0',
         'model/qf_1/fc1/bias:0',
         'model/qf_1/fc1/kernel:0',
         'model/qf_1/qf_output/bias:0',
         'model/qf_1/qf_output/kernel:0',
         'target/pi/fc0/bias:0',
         'target/pi/fc0/kernel:0',
         'target/pi/fc1/bias:0',
         'target/pi/fc1/kernel:0',
         'target/pi/output/bias:0',
         'target/pi/output/kernel:0',
         'target/qf_0/fc0/bias:0',
         'target/qf_0/fc0/kernel:0',
         'target/qf_0/fc1/bias:0',
         'target/qf_0/fc1/kernel:0',
         'target/qf_0/qf_output/bias:0',
         'target/qf_0/qf_output/kernel:0',
         'target/qf_1/fc0/bias:0',
         'target/qf_1/fc0/kernel:0',
         'target/qf_1/fc1/bias:0',
         'target/qf_1/fc1/kernel:0',
         'target/qf_1/qf_output/bias:0',
         'target/qf_1/qf_output/kernel:0']
    )
def test_learn_init(self):
    """Test the non-loop components of the `learn` method."""
    # Create the algorithm object.
    policy_params = self.init_parameters.copy()
    policy_params['policy'] = GoalConditionedPolicy
    policy_params['_init_setup_model'] = True
    alg = OffPolicyRLAlgorithm(**policy_params)

    # Run the learn operation for zero timesteps.
    alg.learn(0, log_dir='results', initial_exploration_steps=0)
    self.assertEqual(alg.episodes, 0)
    self.assertEqual(alg.total_steps, 0)
    self.assertEqual(alg.epoch, 0)
    self.assertEqual(len(alg.episode_rewards_history), 0)
    self.assertEqual(alg.epoch_episodes, 0)
    self.assertEqual(len(alg.epoch_actions), 0)
    self.assertEqual(len(alg.epoch_q1s), 0)
    self.assertEqual(len(alg.epoch_q2s), 0)
    self.assertEqual(len(alg.epoch_actor_losses), 0)
    self.assertEqual(len(alg.epoch_q1_losses), 0)
    self.assertEqual(len(alg.epoch_q2_losses), 0)
    self.assertEqual(len(alg.epoch_episode_rewards), 0)
    self.assertEqual(len(alg.epoch_episode_steps), 0)
    shutil.rmtree('results')

    # Test the seeds.
    alg.learn(0, log_dir='results', seed=1, initial_exploration_steps=0)
    self.assertEqual(np.random.sample(), 0.417022004702574)
    self.assertEqual(random.uniform(0, 1), 0.13436424411240122)
    shutil.rmtree('results')
def test_evaluate(self):
    """Validate the functionality of the _evaluate method.

    This is done for the following cases:

    1. policy = FeedForwardPolicy
    2. policy = GoalConditionedPolicy
    """
    # Set the random seeds.
    random.seed(0)
    np.random.seed(0)
    tf.compat.v1.set_random_seed(0)

    # =================================================================== #
    #                             test case 1                             #
    # =================================================================== #

    # Create the algorithm object.
    policy_params = self.init_parameters.copy()
    policy_params['policy'] = FeedForwardPolicy
    policy_params['eval_env'] = 'MountainCarContinuous-v0'
    policy_params['nb_eval_episodes'] = 1
    policy_params['verbose'] = 2
    policy_params['_init_setup_model'] = True
    alg = OffPolicyRLAlgorithm(**policy_params)

    # Run the _evaluate operation.
    ep_rewards, ep_successes, info = alg._evaluate(alg.eval_env)

    # Test the output from the operation.
    self.assertEqual(len(ep_rewards), 1)
    self.assertEqual(len(ep_successes), 0)
    self.assertEqual(list(info.keys()), ['initial', 'final', 'average'])

    # Clear memory.
    del alg

    # =================================================================== #
    #                             test case 2                             #
    # =================================================================== #

    # Create the algorithm object.
    policy_params = self.init_parameters.copy()
    policy_params['policy'] = GoalConditionedPolicy
    policy_params['eval_env'] = 'MountainCarContinuous-v0'
    policy_params['nb_eval_episodes'] = 1
    policy_params['verbose'] = 2
    policy_params['_init_setup_model'] = True
    alg = OffPolicyRLAlgorithm(**policy_params)

    # Run the _evaluate operation.
    ep_rewards, ep_successes, info = alg._evaluate(alg.eval_env)

    # Test the output from the operation.
    self.assertEqual(len(ep_rewards), 1)
    self.assertEqual(len(ep_successes), 0)
    self.assertEqual(list(info.keys()), ['initial', 'final', 'average'])

    # Clear memory.
    del alg
def test_fingerprints(self):
    """Validate the functionality of the fingerprints.

    When the fingerprint functionality is turned on, the observation within
    the algorithm (stored under self.obs) should always include the
    fingerprint element.

    Policy-specific features of the fingerprint implementation are also
    tested here. This feature should add a fingerprint dimension to the
    observation spaces, but NOT the context space of the lower-level or the
    action space of the higher-level. The intrinsic reward function should
    also be ignoring the fingerprint elements during its computation. The
    fingerprint elements are passed by the algorithm, and tested under
    test_algorithm.py
    """
    # Create the algorithm.
    policy_params = self.init_parameters.copy()
    policy_params['policy'] = GoalConditionedPolicy
    policy_params['nb_rollout_steps'] = 1
    policy_params['policy_kwargs'] = {'use_fingerprints': True}
    alg = OffPolicyRLAlgorithm(**policy_params)

    # Test the observation spaces of the policies, as well as the context
    # space of the lower-level policy and action space of the higher-level
    # policy.
    self.assertTupleEqual(alg.policy_tf.policy[0].ob_space.shape, (4,))
    self.assertTupleEqual(alg.policy_tf.policy[0].ac_space.shape, (2,))
    self.assertTupleEqual(alg.policy_tf.policy[-1].ob_space.shape, (4,))
    self.assertTupleEqual(alg.policy_tf.policy[-1].co_space.shape, (2,))

    # Test intrinsic_reward method within the policy.
    self.assertAlmostEqual(
        alg.policy_tf.intrinsic_reward_fn(
            states=np.array([1, 2, 3]),
            goals=np.array([0, 0]),
            next_states=np.array([1, 2, 3])),
        -np.sqrt(1**2 + 2**2)
    )

    # Validate that observations include the fingerprints elements upon
    # initializing the `learn` procedure and during a step in the
    # `_collect_samples` method.
    alg.learn(1, log_dir='results', log_interval=1,
              initial_exploration_steps=0)
    self.assertEqual(len(alg.obs[0]), alg.ob_space.shape[0])
    np.testing.assert_almost_equal(
        alg.obs[0][-alg.policy_tf.fingerprint_dim[0]:],
        np.array([0, 5]))

    # Validate that observations include the fingerprints elements during
    # a reset in the `_collect_samples` method.
    alg.learn(500, log_dir='results', log_interval=500,
              initial_exploration_steps=0)
    self.assertEqual(len(alg.obs[0]), alg.ob_space.shape[0])
    np.testing.assert_almost_equal(
        alg.obs[0][-alg.policy_tf.fingerprint_dim[0]:],
        np.array([4.99, 0.01]))

    # Delete generated files.
    shutil.rmtree('results')
def test_log_eval(self):
    # Create the algorithm object.
    policy_params = self.init_parameters.copy()
    policy_params['policy'] = GoalConditionedPolicy
    policy_params['_init_setup_model'] = False
    alg = OffPolicyRLAlgorithm(**policy_params)

    # test for one evaluation environment
    rewards = [0, 1, 2]
    successes = [True, False, False]
    info = {"test": 5}
    alg._log_eval(file_path="test_eval.csv",
                  start_time=0,
                  rewards=rewards,
                  successes=successes,
                  info=info)

    # check that the file was generated
    self.assertTrue(os.path.exists('test_eval_0.csv'))

    # import the stored data
    reader = csv.DictReader(open('test_eval_0.csv', 'r'))
    results = {"successes": [], "rewards": [], "test": []}
    for line in reader:
        results["successes"].append(float(line["success_rate"]))
        results["rewards"].append(float(line["average_return"]))
        results["test"].append(float(line["test"]))

    # test that the data matches expected values
    self.assertListEqual(results["rewards"], [1])
    self.assertListEqual(results["successes"], [1 / 3])
    self.assertListEqual(results["test"], [5])

    # Delete generated files.
    os.remove('test_eval_0.csv')

    # test for one evaluation environment with no successes
    successes = []
    alg._log_eval(file_path="test_eval.csv",
                  start_time=0,
                  rewards=rewards,
                  successes=successes,
                  info=info)

    # check that the file was generated
    self.assertTrue(os.path.exists('test_eval_0.csv'))

    # import the stored data
    reader = csv.DictReader(open('test_eval_0.csv', 'r'))
    results = {"successes": []}
    for line in reader:
        results["successes"].append(float(line["success_rate"]))

    # test that the successes are all zero
    self.assertListEqual(results["successes"], [0])

    # Delete generated files.
    os.remove('test_eval_0.csv')
def test_learn_initial_exploration_steps(self):
    """Test the initial_exploration_steps parameter in the learn method.

    This is done for the following cases:

    1. initial_exploration_steps = 0
    2. initial_exploration_steps = 100
    """
    # =================================================================== #
    #                             test case 1                             #
    # =================================================================== #

    # Create the algorithm object.
    policy_params = self.init_parameters.copy()
    policy_params['policy'] = FeedForwardPolicy
    policy_params['_init_setup_model'] = True
    alg = OffPolicyRLAlgorithm(**policy_params)

    # Run the learn operation for zero exploration steps.
    alg.learn(0, log_dir='results', initial_exploration_steps=0)

    # Check the size of the replay buffer.
    self.assertEqual(len(alg.policy_tf.replay_buffer), 1)

    # Clear memory.
    del alg
    shutil.rmtree('results')

    # =================================================================== #
    #                             test case 2                             #
    # =================================================================== #

    # Create the algorithm object.
    policy_params = self.init_parameters.copy()
    policy_params['policy'] = FeedForwardPolicy
    policy_params['_init_setup_model'] = True
    alg = OffPolicyRLAlgorithm(**policy_params)

    # Run the learn operation for 100 exploration steps.
    alg.learn(0, log_dir='results', initial_exploration_steps=100)

    # Check the size of the replay buffer.
    self.assertEqual(len(alg.policy_tf.replay_buffer), 100)

    # Clear memory.
    del alg
    shutil.rmtree('results')
def test_init(self):
    """Ensure that the parameters at init are as expected."""
    # Create the algorithm object.
    policy_params = self.init_parameters.copy()
    policy_params['_init_setup_model'] = False
    alg = OffPolicyRLAlgorithm(**policy_params)

    # Test the attribute values.
    self.assertEqual(alg.policy, self.init_parameters['policy'])
    self.assertEqual(alg.eval_env, self.init_parameters['eval_env'])
    self.assertEqual(alg.nb_train_steps,
                     self.init_parameters['nb_train_steps'])
    self.assertEqual(alg.nb_rollout_steps,
                     self.init_parameters['nb_rollout_steps'])
    self.assertEqual(alg.nb_eval_episodes,
                     self.init_parameters['nb_eval_episodes'])
    self.assertEqual(alg.reward_scale,
                     self.init_parameters['reward_scale'])
    self.assertEqual(alg.render, self.init_parameters['render'])
    self.assertEqual(alg.render_eval, self.init_parameters['render_eval'])
    self.assertEqual(alg.verbose, self.init_parameters['verbose'])
def test_setup_model_goal_conditioned(self):
    # Create the algorithm object.
    policy_params = self.init_parameters.copy()
    policy_params['policy'] = GoalConditionedPolicy
    policy_params['_init_setup_model'] = True
    alg = OffPolicyRLAlgorithm(**policy_params)

    # check the policy_kwargs term
    policy_kwargs = GOAL_CONDITIONED_PARAMS.copy()
    policy_kwargs.update(TD3_PARAMS)
    policy_kwargs['verbose'] = self.init_parameters['verbose']
    policy_kwargs['env_name'] = self.init_parameters['env']
    policy_kwargs['num_envs'] = self.init_parameters['num_envs']
    self.assertDictEqual(alg.policy_kwargs, policy_kwargs)

    with alg.graph.as_default():
        expected_vars = sorted([var.name for var in get_trainable_vars()])

    # Check that all trainable variables have been created in the
    # TensorFlow graph.
    self.assertListEqual(
        expected_vars,
        ['level_0/model/pi/fc0/bias:0',
         'level_0/model/pi/fc0/kernel:0',
         'level_0/model/pi/fc1/bias:0',
         'level_0/model/pi/fc1/kernel:0',
         'level_0/model/pi/output/bias:0',
         'level_0/model/pi/output/kernel:0',
         'level_0/model/qf_0/fc0/bias:0',
         'level_0/model/qf_0/fc0/kernel:0',
         'level_0/model/qf_0/fc1/bias:0',
         'level_0/model/qf_0/fc1/kernel:0',
         'level_0/model/qf_0/qf_output/bias:0',
         'level_0/model/qf_0/qf_output/kernel:0',
         'level_0/model/qf_1/fc0/bias:0',
         'level_0/model/qf_1/fc0/kernel:0',
         'level_0/model/qf_1/fc1/bias:0',
         'level_0/model/qf_1/fc1/kernel:0',
         'level_0/model/qf_1/qf_output/bias:0',
         'level_0/model/qf_1/qf_output/kernel:0',
         'level_0/target/pi/fc0/bias:0',
         'level_0/target/pi/fc0/kernel:0',
         'level_0/target/pi/fc1/bias:0',
         'level_0/target/pi/fc1/kernel:0',
         'level_0/target/pi/output/bias:0',
         'level_0/target/pi/output/kernel:0',
         'level_0/target/qf_0/fc0/bias:0',
         'level_0/target/qf_0/fc0/kernel:0',
         'level_0/target/qf_0/fc1/bias:0',
         'level_0/target/qf_0/fc1/kernel:0',
         'level_0/target/qf_0/qf_output/bias:0',
         'level_0/target/qf_0/qf_output/kernel:0',
         'level_0/target/qf_1/fc0/bias:0',
         'level_0/target/qf_1/fc0/kernel:0',
         'level_0/target/qf_1/fc1/bias:0',
         'level_0/target/qf_1/fc1/kernel:0',
         'level_0/target/qf_1/qf_output/bias:0',
         'level_0/target/qf_1/qf_output/kernel:0',
         'level_1/model/pi/fc0/bias:0',
         'level_1/model/pi/fc0/kernel:0',
         'level_1/model/pi/fc1/bias:0',
         'level_1/model/pi/fc1/kernel:0',
         'level_1/model/pi/output/bias:0',
         'level_1/model/pi/output/kernel:0',
         'level_1/model/qf_0/fc0/bias:0',
         'level_1/model/qf_0/fc0/kernel:0',
         'level_1/model/qf_0/fc1/bias:0',
         'level_1/model/qf_0/fc1/kernel:0',
         'level_1/model/qf_0/qf_output/bias:0',
         'level_1/model/qf_0/qf_output/kernel:0',
         'level_1/model/qf_1/fc0/bias:0',
         'level_1/model/qf_1/fc0/kernel:0',
         'level_1/model/qf_1/fc1/bias:0',
         'level_1/model/qf_1/fc1/kernel:0',
         'level_1/model/qf_1/qf_output/bias:0',
         'level_1/model/qf_1/qf_output/kernel:0',
         'level_1/target/pi/fc0/bias:0',
         'level_1/target/pi/fc0/kernel:0',
         'level_1/target/pi/fc1/bias:0',
         'level_1/target/pi/fc1/kernel:0',
         'level_1/target/pi/output/bias:0',
         'level_1/target/pi/output/kernel:0',
         'level_1/target/qf_0/fc0/bias:0',
         'level_1/target/qf_0/fc0/kernel:0',
         'level_1/target/qf_0/fc1/bias:0',
         'level_1/target/qf_0/fc1/kernel:0',
         'level_1/target/qf_0/qf_output/bias:0',
         'level_1/target/qf_0/qf_output/kernel:0',
         'level_1/target/qf_1/fc0/bias:0',
         'level_1/target/qf_1/fc0/kernel:0',
         'level_1/target/qf_1/fc1/bias:0',
         'level_1/target/qf_1/fc1/kernel:0',
         'level_1/target/qf_1/qf_output/bias:0',
         'level_1/target/qf_1/qf_output/kernel:0']
    )
def train_h_baselines(env_name, args, multiagent):
    """Train policies using SAC and TD3 with h-baselines."""
    from hbaselines.algorithms import OffPolicyRLAlgorithm
    from hbaselines.utils.train import parse_options, get_hyperparameters

    # Get the command-line arguments that are relevant here
    args = parse_options(description="", example_usage="", args=args)

    # the base directory that the logged data will be stored in
    base_dir = "training_data"

    for i in range(args.n_training):
        # value of the next seed
        seed = args.seed + i

        # The time when the current experiment started.
        now = strftime("%Y-%m-%d-%H:%M:%S")

        # Create a save directory folder (if it doesn't exist).
        dir_name = os.path.join(base_dir, '{}/{}'.format(args.env_name, now))
        ensure_dir(dir_name)

        # Get the policy class.
        if args.alg == "TD3":
            if multiagent:
                from hbaselines.multi_fcnet.td3 import MultiFeedForwardPolicy
                policy = MultiFeedForwardPolicy
            else:
                from hbaselines.fcnet.td3 import FeedForwardPolicy
                policy = FeedForwardPolicy
        elif args.alg == "SAC":
            if multiagent:
                from hbaselines.multi_fcnet.sac import MultiFeedForwardPolicy
                policy = MultiFeedForwardPolicy
            else:
                from hbaselines.fcnet.sac import FeedForwardPolicy
                policy = FeedForwardPolicy
        else:
            raise ValueError("Unknown algorithm: {}".format(args.alg))

        # Get the hyperparameters.
        hp = get_hyperparameters(args, policy)

        # Add the seed for logging purposes.
        params_with_extra = hp.copy()
        params_with_extra['seed'] = seed
        params_with_extra['env_name'] = args.env_name
        params_with_extra['policy_name'] = policy.__name__
        params_with_extra['algorithm'] = args.alg
        params_with_extra['date/time'] = now

        # Add the hyperparameters to the folder.
        with open(os.path.join(dir_name, 'hyperparameters.json'), 'w') as f:
            json.dump(params_with_extra, f, sort_keys=True, indent=4)

        # Create the algorithm object.
        alg = OffPolicyRLAlgorithm(
            policy=policy,
            env="flow:{}".format(env_name),
            eval_env="flow:{}".format(env_name) if args.evaluate else None,
            **hp)

        # Perform training.
        alg.learn(
            total_steps=args.total_steps,
            log_dir=dir_name,
            log_interval=args.log_interval,
            eval_interval=args.eval_interval,
            save_interval=args.save_interval,
            initial_exploration_steps=args.initial_exploration_steps,
            seed=seed,
        )
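# Example invocation of `train_h_baselines` above (a sketch; the environment
# name "singleagent_ring" and the command-line flags are assumptions that
# would need to match whatever `parse_options` / `get_hyperparameters`
# actually accept in the installed version of the library):
#
#     train_h_baselines(
#         env_name="singleagent_ring",
#         args=["singleagent_ring", "--alg", "TD3"],
#         multiagent=False,
#     )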
def main(args):
    """Replay a trained policy and report the evaluation returns."""
    flags = parse_options(args)

    # get the hyperparameters
    env_name, policy, hp, seed = get_hyperparameters_from_dir(flags.dir_name)
    hp['render'] = not flags.no_render  # to visualize the policy

    # create the algorithm object. We will be using the eval environment in
    # this object to perform the rollout.
    alg = OffPolicyRLAlgorithm(
        policy=policy, env=env_name, eval_env=env_name, **hp)

    # setup the seed value
    random.seed(seed)
    np.random.seed(seed)
    tf.compat.v1.set_random_seed(seed)

    # get the checkpoint number
    if flags.ckpt_num is None:
        filenames = os.listdir(os.path.join(flags.dir_name, "checkpoints"))
        metafiles = [f[:-5] for f in filenames if f[-5:] == ".meta"]
        metanum = [int(f.split("-")[-1]) for f in metafiles]
        ckpt_num = max(metanum)
    else:
        ckpt_num = flags.ckpt_num

    # location to the checkpoint
    ckpt = os.path.join(flags.dir_name, "checkpoints/itr-{}".format(ckpt_num))

    # restore the previous checkpoint
    alg.saver = tf.compat.v1.train.Saver(alg.trainable_vars)
    alg.load(ckpt)

    # some variables that will be needed when replaying the rollout
    policy = alg.policy_tf
    env = alg.eval_env

    # Perform the evaluation procedure.
    episode_rewards = []

    for episode_num in range(flags.num_rollouts):
        # Run a rollout.
        obs = env.reset()
        total_reward = 0
        while True:
            context = [env.current_context] \
                if hasattr(env, "current_context") else None
            action = policy.get_action(
                np.asarray([obs]),
                context=context,
                apply_noise=False,
                random_actions=False,
            )
            obs, reward, done, _ = env.step(action)
            if not flags.no_render:
                env.render()
            total_reward += reward
            if done:
                break

        # Print total returns from a given episode.
        episode_rewards.append(total_reward)
        print("Round {}, return: {}".format(episode_num, total_reward))

    # Print total statistics.
    print("Average, std return: {}, {}".format(
        np.mean(episode_rewards), np.std(episode_rewards)))
def main(args):
    """Replay a trained policy and report the evaluation returns."""
    flags = parse_options(args)

    # get the hyperparameters
    env_name, policy, hp, seed = get_hyperparameters_from_dir(flags.dir_name)
    hp['render'] = not flags.no_render  # to visualize the policy

    # create the algorithm object. We will be using the environment in this
    # object to perform the rollout.
    alg = OffPolicyRLAlgorithm(policy=policy, env=env_name, **hp)

    # setup the seed value
    if not flags.random_seed:
        random.seed(seed)
        np.random.seed(seed)
        tf.compat.v1.set_random_seed(seed)

    # get the checkpoint number
    if flags.ckpt_num is None:
        filenames = os.listdir(os.path.join(flags.dir_name, "checkpoints"))
        metafiles = [f[:-5] for f in filenames if f[-5:] == ".meta"]
        metanum = [int(f.split("-")[-1]) for f in metafiles]
        ckpt_num = max(metanum)
    else:
        ckpt_num = flags.ckpt_num

    # location to the checkpoint
    ckpt = os.path.join(flags.dir_name, "checkpoints/itr-{}".format(ckpt_num))

    # restore the previous checkpoint
    alg.saver = tf.compat.v1.train.Saver(alg.trainable_vars)
    alg.load(ckpt)

    # some variables that will be needed when replaying the rollout
    policy = alg.policy_tf
    env = alg.sampler.env

    # Perform the evaluation procedure.
    episode_rewards = []

    # Add an emission path to Flow environments.
    if env_name in FLOW_ENV_NAMES:
        sim_params = deepcopy(env.wrapped_env.sim_params)
        sim_params.emission_path = "./flow_results"
        env.wrapped_env.restart_simulation(
            sim_params, render=not flags.no_render)

    for episode_num in range(flags.num_rollouts):
        # Run a rollout.
        obs = env.reset()
        total_reward = 0
        while True:
            context = [env.current_context] \
                if hasattr(env, "current_context") else None
            action = policy.get_action(
                np.asarray([obs]),
                context=context,
                apply_noise=False,
                random_actions=False,
            )
            obs, reward, done, _ = env.step(action[0])
            if not flags.no_render:
                env.render()
            total_reward += reward
            if done:
                break

        # Print total returns from a given episode.
        episode_rewards.append(total_reward)
        print("Round {}, return: {}".format(episode_num, total_reward))

    # Print total statistics.
    print("Average, std return: {}, {}".format(
        np.mean(episode_rewards), np.std(episode_rewards)))

    if env_name in FLOW_ENV_NAMES:
        # wait a short period of time to ensure the xml file is readable
        time.sleep(0.1)

        # collect the location of the emission file
        dir_path = env.wrapped_env.sim_params.emission_path
        emission_filename = "{0}-emission.xml".format(
            env.wrapped_env.network.name)
        emission_path = os.path.join(dir_path, emission_filename)

        # convert the emission file into a csv
        emission_to_csv(emission_path)

        # Delete the .xml version of the emission file.
        os.remove(emission_path)
def main(args):
    """Replay a trained policy and report the evaluation returns."""
    flags = parse_options(args)

    # Run assertions.
    assert not (flags.no_render and flags.save_video), \
        "If saving the rendering, no_render cannot be set to True."

    # get the hyperparameters
    env_name, policy, hp, seed = get_hyperparameters_from_dir(flags.dir_name)
    hp['num_envs'] = 1
    hp['render_eval'] = not flags.no_render  # to visualize the policy

    # create the algorithm object. We will be using the eval environment in
    # this object to perform the rollout.
    alg = OffPolicyRLAlgorithm(
        policy=policy, env=env_name, eval_env=env_name, **hp)

    # setup the seed value
    if not flags.random_seed:
        random.seed(seed)
        np.random.seed(seed)
        tf.compat.v1.set_random_seed(seed)

    # get the checkpoint number
    if flags.ckpt_num is None:
        filenames = os.listdir(os.path.join(flags.dir_name, "checkpoints"))
        metafiles = [f[:-5] for f in filenames if f[-5:] == ".meta"]
        metanum = [int(f.split("-")[-1]) for f in metafiles]
        ckpt_num = max(metanum)
    else:
        ckpt_num = flags.ckpt_num

    # location to the checkpoint
    ckpt = os.path.join(flags.dir_name, "checkpoints/itr-{}".format(ckpt_num))

    # restore the previous checkpoint
    alg.saver = tf.compat.v1.train.Saver(alg.trainable_vars)
    alg.load(ckpt)

    # some variables that will be needed when replaying the rollout
    policy = alg.policy_tf
    env = alg.eval_env

    # Perform the evaluation procedure.
    episode_rewards = []

    # Add an emission path to Flow environments.
    if env_name in FLOW_ENV_NAMES:
        sim_params = deepcopy(env.wrapped_env.sim_params)
        sim_params.emission_path = "./flow_results"
        env.wrapped_env.restart_simulation(
            sim_params, render=not flags.no_render)

    if not isinstance(env, list):
        env_list = [env]
    else:
        env_list = env

    for env_num, env in enumerate(env_list):
        for episode_num in range(flags.num_rollouts):
            if not flags.no_render and env_name not in FLOW_ENV_NAMES:
                out = FFmpegWriter("{}_{}_{}.mp4".format(
                    flags.video, env_num, episode_num))
            else:
                out = None

            obs, total_reward = env.reset(), 0
            while True:
                context = [env.current_context] \
                    if hasattr(env, "current_context") else None
                action = policy.get_action(
                    obs=np.asarray([obs]),
                    context=context,
                    apply_noise=False,
                    random_actions=False,
                )

                # Visualize the sub-goals of the hierarchical policy.
                if hasattr(policy, "_meta_action") \
                        and policy._meta_action is not None \
                        and hasattr(env, "set_goal"):
                    goal = policy._meta_action[0][0] + (obs[
                        policy.goal_indices] if policy.relative_goals else 0)
                    env.set_goal(goal)

                new_obs, reward, done, _ = env.step(action[0])

                if not flags.no_render:
                    if flags.save_video:
                        if alg.env_name == "AntGather":
                            out.writeFrame(env.render(mode='rgb_array'))
                        else:
                            out.writeFrame(env.render(
                                mode='rgb_array', height=1024, width=1024))
                    else:
                        env.render()

                total_reward += reward
                if done:
                    break

                policy.store_transition(
                    obs0=obs,
                    context0=context[0] if context is not None else None,
                    action=action[0],
                    reward=reward,
                    obs1=new_obs,
                    context1=context[0] if context is not None else None,
                    done=done,
                    is_final_step=done,
                    evaluate=True,
                )

                obs = new_obs

            # Print total returns from a given episode.
            episode_rewards.append(total_reward)
            print("Round {}, return: {}".format(episode_num, total_reward))

            # Save the video.
            if not flags.no_render and env_name not in FLOW_ENV_NAMES \
                    and flags.save_video:
                out.close()

    # Print total statistics.
    print("Average, std return: {}, {}".format(
        np.mean(episode_rewards), np.std(episode_rewards)))
def main(args):
    """Compute and plot the distance between policies across checkpoints."""
    flags = parse_options(args)

    data = {
        'name': [],
        'step': [],
        'distance': [],
    }

    for dir_name, name in zip(flags.dir_name, flags.name):
        # get the hyperparameters
        env_name, policy, hp, seed = get_hyperparameters_from_dir(dir_name)
        print(hp.keys())
        del hp['algorithm']
        del hp['date/time']

        # create the algorithm object. We will be using the eval environment
        # in this object to perform the rollout.
        alg = OffPolicyRLAlgorithm(
            policy=policy, env=env_name, eval_env=env_name, **hp)

        # setup the seed value
        random.seed(seed)
        np.random.seed(seed)
        tf.compat.v1.set_random_seed(seed)

        filenames = os.listdir(os.path.join(dir_name, "checkpoints"))
        metafiles = [f[:-5] for f in filenames if f[-5:] == ".meta"]
        metanum = list(
            sorted([int(f.split("-")[-1]) for f in metafiles]))[:-1]

        # get the checkpoint number
        ckpt_num = max(metanum)

        # location to the checkpoint
        ckpt = os.path.join(dir_name, "checkpoints/itr-{}".format(ckpt_num))

        # restore the previous checkpoint
        alg.saver = tf.compat.v1.train.Saver(alg.trainable_vars)
        alg.load(ckpt)

        # some variables that will be needed when replaying the rollout
        policy = alg.policy_tf

        batches = []
        for b in range(flags.num_batches):
            worker_obs0 = policy.replay_buffer.sample(
                with_additional=False)[5]
            batches.append(worker_obs0)

        for ckpt_num_one, ckpt_num_two in zip(metanum[1:], metanum[:-1]):
            # get the checkpoint number
            ckpt_num = ckpt_num_one

            # location to the checkpoint
            ckpt = os.path.join(
                dir_name, "checkpoints/itr-{}".format(ckpt_num))

            # restore the previous checkpoint
            alg.load(ckpt)

            # some variables that will be needed when replaying the rollout
            policy = alg.policy_tf

            mean_one = []
            for b in batches:
                a = policy.policy[-1].get_action(b, None, False, False)
                mean_one.append(a)

            # get the checkpoint number
            ckpt_num = ckpt_num_two

            # location to the checkpoint
            ckpt = os.path.join(
                dir_name, "checkpoints/itr-{}".format(ckpt_num))

            # restore the previous checkpoint
            alg.load(ckpt)

            # some variables that will be needed when replaying the rollout
            policy = alg.policy_tf

            mean_two = []
            for b in batches:
                a = policy.policy[-1].get_action(b, None, False, False)
                mean_two.append(a)

            # compute a distance metric between the policies
            mean_one = np.concatenate(mean_one, axis=0)
            mean_two = np.concatenate(mean_two, axis=0)
            kl = np.sum((mean_one - mean_two) ** 2, axis=1).mean()

            print("{},{},{},{}".format(name, ckpt_num_one, ckpt_num_two, kl))
            data['name'].append(name)
            data['step'].append(ckpt_num_one)
            data['distance'].append(kl)

    df = pd.DataFrame(data, columns=['name', 'step', 'distance'])

    plt.title("Manager MDP Non-Stationarity")
    ax = sns.lineplot(x='step', y='distance', hue='name', data=df)
    plt.savefig('ns.png')