def test_baseline(self):
    """Test the baseline initialization."""
    box_env = TfEnv(DummyBoxEnv())
    deterministic_mlp_baseline = ContinuousMLPBaseline(env_spec=box_env)
    gaussian_mlp_baseline = GaussianMLPBaseline(env_spec=box_env)
    self.sess.run(tf.compat.v1.global_variables_initializer())
    deterministic_mlp_baseline.get_param_values()
    gaussian_mlp_baseline.get_param_values()

    box_env.close()
def test_get_params_internal(self, obs_dim):
    box_env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim))
    with mock.patch(('metarl.tf.baselines.'
                     'gaussian_mlp_baseline.'
                     'GaussianMLPRegressor'),
                    new=SimpleGaussianMLPRegressor):
        gmb = GaussianMLPBaseline(env_spec=box_env.spec,
                                  regressor_args=dict())
    params_internal = gmb.get_params_internal()
    trainable_params = tf.compat.v1.trainable_variables(
        scope='GaussianMLPBaseline')
    assert np.array_equal(params_internal, trainable_params)
def trpo_mt50(ctxt=None, seed=1):
    """Run task."""
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = MultiEnvWrapper(MT50_envs,
                              env_ids,
                              sample_strategy=round_robin_strategy)

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))

        # baseline = LinearFeatureBaseline(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
            ),
        )

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=150,
                    discount=0.99,
                    gae_lambda=0.97,
                    max_kl_step=0.01)

        runner.setup(algo, env)
        runner.train(n_epochs=1500, batch_size=len(MT50_envs) * 10 * 150)
def test_process_samples_continuous_recurrent(self):
    env = TfEnv(DummyBoxEnv())
    policy = GaussianLSTMPolicy(env_spec=env.spec)
    baseline = GaussianMLPBaseline(env_spec=env.spec)
    max_path_length = 100
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        algo = BatchPolopt2(env_spec=env.spec,
                            policy=policy,
                            baseline=baseline,
                            max_path_length=max_path_length,
                            flatten_input=True)
        runner.setup(algo, env, sampler_args=dict(n_envs=1))
        runner.train(n_epochs=1, batch_size=max_path_length)
        paths = runner.obtain_samples(0)
        samples = algo.process_samples(0, paths)
        # Since there is only 1 vec_env in the sampler and DummyBoxEnv
        # never terminates until it reaches max_path_length, the batch
        # size must be max_path_length, i.e. 100.
        assert samples['observations'].shape == (
            max_path_length, env.observation_space.flat_dim)
        assert samples['actions'].shape == (max_path_length,
                                            env.action_space.flat_dim)
        assert samples['rewards'].shape == (max_path_length, )
        assert samples['baselines'].shape == (max_path_length, )
        assert samples['returns'].shape == (max_path_length, )
        # There is only 1 path.
        assert samples['lengths'].shape == (1, )
        for key, shape in policy.state_info_specs:
            assert samples['agent_infos'][key].shape == (max_path_length,
                                                         np.prod(shape))
        # DummyBoxEnv has the env_info 'dummy'.
        assert samples['env_infos']['dummy'].shape == (max_path_length, )
        assert isinstance(samples['average_return'], float)
def test_ppo_pendulum_gru(self):
    """Test PPO with Pendulum environment and recurrent policy."""
    with LocalTFRunner(snapshot_config) as runner:
        env = MetaRLEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        gru_policy = GaussianGRUPolicy(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=gru_policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 80
def trpo_ml1(ctxt=None, seed=1):
    """Run task."""
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        Ml1_reach_envs = get_ML1_envs_test(env_id)
        env = MTMetaWorldWrapper(Ml1_reach_envs)

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(64, 64),
                                use_trust_region=False),
        )

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=150,
                    discount=0.99,
                    gae_lambda=0.97,
                    max_kl_step=0.01)

        timesteps = 6000000
        batch_size = 150 * env.num_tasks
        epochs = timesteps // batch_size

        print(f'epochs: {epochs}, batch_size: {batch_size}')
        runner.setup(algo, env, sampler_args={'n_envs': 1})
        runner.train(n_epochs=epochs, batch_size=batch_size, plot=False)
def test_fit(self, obs_dim):
    box_env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim))
    with mock.patch(('metarl.tf.baselines.'
                     'gaussian_mlp_baseline.'
                     'GaussianMLPRegressor'),
                    new=SimpleGaussianMLPRegressor):
        gmb = GaussianMLPBaseline(env_spec=box_env.spec)
    paths = [{
        'observations': [np.full(obs_dim, 1)],
        'returns': [1]
    }, {
        'observations': [np.full(obs_dim, 2)],
        'returns': [2]
    }]
    gmb.fit(paths)

    obs = {'observations': [np.full(obs_dim, 1), np.full(obs_dim, 2)]}
    prediction = gmb.predict(obs)
    assert np.array_equal(prediction, [1, 2])
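The same fit/predict cycle can be exercised without the mocked regressor. The sketch below is not part of the test suite: the import paths and the location of the DummyBoxEnv fixture are assumptions, and the session handling mirrors test_baseline above.

# Minimal sketch of GaussianMLPBaseline.fit()/predict() with the real
# regressor. Import paths are assumed; DummyBoxEnv is the test fixture
# used in the tests above and may live elsewhere in your tree.
import numpy as np
import tensorflow as tf
from metarl.envs import MetaRLEnv
from metarl.tf.baselines import GaussianMLPBaseline
from tests.fixtures.envs.dummy import DummyBoxEnv

with tf.compat.v1.Session() as sess:
    box_env = MetaRLEnv(DummyBoxEnv(obs_dim=(1, )))
    baseline = GaussianMLPBaseline(env_spec=box_env.spec)
    sess.run(tf.compat.v1.global_variables_initializer())
    # Each path supplies flattened observations and the empirical returns
    # the regressor is fit against.
    paths = [{'observations': [np.full(1, 1)], 'returns': [1]},
             {'observations': [np.full(1, 2)], 'returns': [2]}]
    baseline.fit(paths)
    pred = baseline.predict({'observations': [np.full(1, 1),
                                              np.full(1, 2)]})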
def gaussian_gru_policy(ctxt, env_id, seed):
    """Create Gaussian GRU Policy on TF-PPO.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = MetaRLEnv(normalize(gym.make(env_id)))

        policy = GaussianGRUPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
                optimizer=FirstOrderOptimizer,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                    learning_rate=1e-3,
                ),
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                learning_rate=1e-3,
            ),
        )

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=5, batch_size=2048)
def tf_ppo_pendulum(ctxt=None, seed=1):
    """Train PPO with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=True,
            ),
        )

        # NOTE: make sure that when setting entropy_method to 'max', you
        # also set center_adv to False and turn off policy gradient. See
        # tf.algos.NPO for detailed documentation.
        algo = RL2PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                learning_rate=1e-3,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.002,
            center_adv=False,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=120, batch_size=4096, plot=False)
def setup_method(self):
    super().setup_method()
    self.env = MetaRLEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
    self.policy = GaussianMLPPolicy(
        env_spec=self.env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    self.baseline = GaussianMLPBaseline(
        env_spec=self.env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )
def test_is_pickleable(self):
    box_env = MetaRLEnv(DummyBoxEnv(obs_dim=(1, )))
    with mock.patch(('metarl.tf.baselines.'
                     'gaussian_mlp_baseline.'
                     'GaussianMLPRegressor'),
                    new=SimpleGaussianMLPRegressor):
        gmb = GaussianMLPBaseline(env_spec=box_env.spec)
    obs = {'observations': [np.full(1, 1), np.full(1, 1)]}

    with tf.compat.v1.variable_scope('GaussianMLPBaseline', reuse=True):
        return_var = tf.compat.v1.get_variable(
            'SimpleGaussianMLPModel/return_var')
    return_var.load(1.0)

    prediction = gmb.predict(obs)
    h = pickle.dumps(gmb)

    with tf.compat.v1.Session(graph=tf.Graph()):
        gmb_pickled = pickle.loads(h)
        prediction2 = gmb_pickled.predict(obs)
        assert np.array_equal(prediction, prediction2)
def ppo_ml1(ctxt=None, seed=1):
    """Run task."""
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        Ml1_reach_envs = get_ML1_envs_test(env_id)
        env = MTMetaWorldWrapper(Ml1_reach_envs)

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
            hidden_w_init=tf.constant_initializer(np.sqrt(2)),
            hidden_b_init=tf.constant_initializer(np.sqrt(2)),
        )

        # baseline = LinearFeatureBaseline(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
                hidden_w_init=tf.constant_initializer(np.sqrt(2)),
                hidden_b_init=tf.constant_initializer(np.sqrt(2)),
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=150,
            discount=0.99,
            gae_lambda=0.97,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=30,
                max_epochs=4,
                tf_optimizer_args=dict(learning_rate=3e-4),
            ),
        )

        timesteps = 6000000
        batch_size = 150 * env.num_tasks
        epochs = timesteps // batch_size

        print(f'epochs: {epochs}, batch_size: {batch_size}')
        runner.setup(algo, env, sampler_args={'n_envs': 1})
        runner.train(n_epochs=epochs, batch_size=batch_size, plot=False)
def run_task(self, snapshot_config, *_):
    config = tf.ConfigProto(device_count={'GPU': 0},
                            allow_soft_placement=True,
                            intra_op_parallelism_threads=12,
                            inter_op_parallelism_threads=12)
    sess = tf.Session(config=config)
    with LocalTFRunner(snapshot_config=snapshot_config,
                       sess=sess) as runner:
        env = gym.make(self._env)
        env = TfEnv(normalize(env))
        env.reset()

        policy = GaussianGRUPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
                optimizer=FirstOrderOptimizer,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                    tf_optimizer_args=dict(learning_rate=1e-3),
                ),
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
        )

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=5, batch_size=2048)
def test_param_values(self, obs_dim):
    box_env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim))
    with mock.patch(('metarl.tf.baselines.'
                     'gaussian_mlp_baseline.'
                     'GaussianMLPRegressor'),
                    new=SimpleGaussianMLPRegressor):
        gmb = GaussianMLPBaseline(env_spec=box_env.spec)
        new_gmb = GaussianMLPBaseline(env_spec=box_env.spec,
                                      name='GaussianMLPBaseline2')

    # Manually change the parameters of the GaussianMLPBaseline.
    with tf.compat.v1.variable_scope('GaussianMLPBaseline', reuse=True):
        return_var = tf.compat.v1.get_variable(
            'SimpleGaussianMLPModel/return_var')
    return_var.load(1.0)

    old_param_values = gmb.get_param_values()
    new_param_values = new_gmb.get_param_values()
    assert not np.array_equal(old_param_values, new_param_values)
    new_gmb.set_param_values(old_param_values)
    new_param_values = new_gmb.get_param_values()
    assert np.array_equal(old_param_values, new_param_values)
def ppo_mt10_sampling(ctxt=None, seed=1):
    """Run task."""
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = MultiEnvSamplingWrapper(MT10_envs,
                                      env_ids,
                                      len(env_ids) - skip_size,
                                      sample_strategy=round_robin_strategy)

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=150,
            discount=0.99,
            gae_lambda=0.97,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=3e-4),
            ),
        )

        batch_size = (len(env_ids) - skip_size) * 10 * 150
        epochs = (total_steps // batch_size) + 10

        print('epochs:', epochs, 'batch_size:', batch_size)
        runner.setup(algo, env)
        runner.train(n_epochs=epochs, batch_size=batch_size, plot=False)
def run_metarl_tf(env, seed, log_dir):
    """Create metarl TensorFlow PPO model and training.

    Args:
        env (dict): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(env))

        policy = TF_GMP(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['hidden_sizes'],
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        # baseline = LinearFeatureBaseline(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=hyper_parameters['hidden_sizes'],
                use_trust_region=False,
                optimizer=FirstOrderOptimizer,
                optimizer_args=dict(
                    batch_size=hyper_parameters['training_batch_size'],
                    max_epochs=hyper_parameters['training_epochs'],
                    tf_optimizer_args=dict(
                        learning_rate=hyper_parameters['learning_rate']),
                ),
            ),
        )

        # yapf: disable
        algo = TF_PPO(env_spec=env.spec,
                      policy=policy,
                      baseline=baseline,
                      max_path_length=hyper_parameters['max_path_length'],
                      discount=hyper_parameters['discount'],
                      gae_lambda=hyper_parameters['gae_lambda'],
                      center_adv=hyper_parameters['center_adv'],
                      policy_ent_coeff=hyper_parameters['policy_ent_coeff'],
                      lr_clip_range=hyper_parameters['lr_clip_range'],
                      optimizer_args=dict(
                          batch_size=hyper_parameters['training_batch_size'],
                          max_epochs=hyper_parameters['training_epochs'],
                          tf_optimizer_args=dict(
                              learning_rate=hyper_parameters['learning_rate'])))

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['batch_size'])

        dowel_logger.remove_all()

        return tabular_log_file
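The benchmark above reads every setting from a module-level hyper_parameters dict that is not shown in the snippet. The keys can be inferred from the call sites; the values below are hypothetical placeholders for illustration only, not the benchmark's actual configuration.

# Hypothetical hyper_parameters dict matching the keys referenced above.
# Values are illustrative placeholders, not the benchmark's real settings.
hyper_parameters = {
    'hidden_sizes': (64, 64),
    'learning_rate': 3e-4,
    'training_batch_size': 32,
    'training_epochs': 10,
    'max_path_length': 100,
    'discount': 0.99,
    'gae_lambda': 0.95,
    'center_adv': True,
    'policy_ent_coeff': 0.0,
    'lr_clip_range': 0.2,
    'n_epochs': 100,
    'batch_size': 2048,
}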