def test_dist_info_sym_include_action(self, obs_dim, action_dim, hidden_dim):
    """Check ``dist_info_sym`` when the previous action is part of the state.

    A batch holding the same observation twice is run through the symbolic
    distribution graph, and both ``mean`` and ``log_std`` are compared with
    arrays filled with 0.5.

    Args:
        obs_dim (tuple): Observation shape handed to ``DummyBoxEnv``.
        action_dim (tuple): Action shape handed to ``DummyBoxEnv``; it is
            also appended to ``(2, 1)`` to form the expected output shape.
        hidden_dim (int): Accepted as part of the signature but not read
            anywhere in the body.

    """
    box_env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    obs_width = box_env.observation_space.flat_dim
    obs_ph = tf.compat.v1.placeholder(tf.float32,
                                      shape=(None, None, obs_width))

    # Swap in the simple model only while the policy is being constructed.
    model_target = ('metarl.tf.policies.'
                    'gaussian_lstm_policy.GaussianLSTMModel')
    with mock.patch(model_target, new=SimpleGaussianLSTMModel):
        policy = GaussianLSTMPolicy(env_spec=box_env.spec,
                                    state_include_action=True)

    policy.reset()
    obs = box_env.reset()

    # Two batch entries, one time step each.
    batched_shape = (2, 1) + action_dim
    prev_action = np.zeros(batched_shape)
    dist_sym = policy.dist_info_sym(
        obs_var=obs_ph,
        state_info_vars={'prev_action': prev_action},
        name='p2_sym')

    feed = {obs_ph: [[obs.flatten()], [obs.flatten()]]}
    dist = self.sess.run(dist_sym, feed_dict=feed)

    expected = np.full(batched_shape, 0.5)
    assert np.array_equal(dist['mean'], expected)
    assert np.array_equal(dist['log_std'], expected)
def test_gaussian_lstm_policy(self):
    """Smoke-test that a one-unit Gaussian LSTM policy returns an action.

    The policy is built from ``self.env``, the global TF variables are
    initialised in ``self.sess``, and ``get_action`` is called on the upper
    bound of the observation space. The test only asserts that the returned
    value is truthy.
    """
    policy = GaussianLSTMPolicy(env_spec=self.env, hidden_dim=1)

    init_op = tf.compat.v1.global_variables_initializer()
    self.sess.run(init_op)

    policy.reset()
    upper_bound_obs = self.env.observation_space.high
    outcome = policy.get_action(upper_bound_obs)
    assert outcome
def test_ppo_pendulum_lstm(self):
    """Train PPO with an LSTM policy on InvertedDoublePendulum-v2.

    Runs ten epochs with a batch size of 2048 and requires the value
    returned by ``runner.train`` to exceed 80.
    """
    with LocalTFRunner(snapshot_config) as runner:
        pendulum = gym.make('InvertedDoublePendulum-v2')
        env = MetaRLEnv(normalize(pendulum))
        spec = env.spec

        recurrent_policy = GaussianLSTMPolicy(env_spec=spec)
        value_baseline = GaussianMLPBaseline(
            env_spec=spec,
            regressor_args={'hidden_sizes': (32, 32)},
        )

        optimizer_settings = {'batch_size': 32, 'max_epochs': 10}
        algo = PPO(
            env_spec=spec,
            policy=recurrent_policy,
            baseline=value_baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=optimizer_settings,
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )

        runner.setup(algo, env)
        final_return = runner.train(n_epochs=10, batch_size=2048)
        assert final_return > 80
def test_is_pickleable(self):
    """Verify the policy's mean output is unchanged by a pickle round trip.

    The ``return_var`` variable of the (patched) model is overwritten with
    ones, the mean is evaluated, and the policy is pickled. The unpickled
    copy is then evaluated in a fresh graph and session, and both results
    must be equal.
    """
    box_env = TfEnv(DummyBoxEnv(obs_dim=(1, ), action_dim=(1, )))

    # Swap in the simple model only while the policy is being constructed.
    model_target = ('metarl.tf.policies.'
                    'gaussian_lstm_policy.GaussianLSTMModel')
    with mock.patch(model_target, new=SimpleGaussianLSTMModel):
        policy = GaussianLSTMPolicy(env_spec=box_env.spec,
                                    state_include_action=False)

    # The environment is reset twice; only the second observation is used.
    box_env.reset()
    obs = box_env.reset()

    scope_name = 'GaussianLSTMPolicy/GaussianLSTMModel'
    with tf.compat.v1.variable_scope(scope_name, reuse=True):
        return_var = tf.compat.v1.get_variable('return_var')
    # Overwrite the variable with ones before taking the reference output.
    all_ones = tf.ones_like(return_var).eval()
    return_var.load(all_ones)

    mean_before = self.sess.run(
        policy.model.networks['default'].mean,
        feed_dict={policy.model.input: [[obs.flatten()], [obs.flatten()]]})

    serialized = pickle.dumps(policy)
    with tf.compat.v1.Session(graph=tf.Graph()) as fresh_sess:
        restored = pickle.loads(serialized)
        restored_feed = {
            restored.model.input: [[obs.flatten()], [obs.flatten()]]
        }
        mean_after = fresh_sess.run(restored.model.networks['default'].mean,
                                    feed_dict=restored_feed)
        assert np.array_equal(mean_before, mean_after)
def ppo_cmb(env, seed, log_dir):
    """Run PPO with a ContinuousMLPBaseline and a Gaussian LSTM policy.

    Args:
        env (gym_env): Environment of the task.
        seed (int): Random seed for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path of the ``progress.csv`` file written inside ``log_dir``.

    """
    deterministic.set_seed(seed)

    # Use the tf.compat.v1 entry points so the session setup matches the
    # tf.compat.v1 API used elsewhere and also works where the bare
    # tf.ConfigProto / tf.Session aliases are not available.
    config = tf.compat.v1.ConfigProto(
        allow_soft_placement=True,
        intra_op_parallelism_threads=num_proc,
        inter_op_parallelism_threads=num_proc)
    sess = tf.compat.v1.Session(config=config)

    with LocalTFRunner(snapshot_config, sess=sess,
                       max_cpus=num_proc) as runner:
        env = TfEnv(normalize(env))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=policy_params['policy_hidden_sizes'],
            hidden_nonlinearity=policy_params['hidden_nonlinearity'],
        )

        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            regressor_args=baseline_params['regressor_args'],
        )

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=algo_params['max_path_length'],
                   discount=algo_params['discount'],
                   gae_lambda=algo_params['gae_lambda'],
                   lr_clip_range=algo_params['lr_clip_range'],
                   entropy_method=algo_params['entropy_method'],
                   policy_ent_coeff=algo_params['policy_ent_coeff'],
                   optimizer_args=algo_params['optimizer_args'],
                   center_adv=algo_params['center_adv'],
                   stop_entropy_gradient=True)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        try:
            dowel_logger.add_output(dowel.StdOutput())
            dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
            dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

            runner.setup(algo,
                         env,
                         sampler_args=dict(n_envs=algo_params['n_envs']))
            runner.train(n_epochs=algo_params['n_epochs'],
                         batch_size=algo_params['n_rollout_steps'])
        finally:
            # Always detach the outputs, even when setup or training fails,
            # so that a failed trial does not leave its outputs registered
            # on the shared logger for the next trial.
            dowel_logger.remove_all()

        return tabular_log_file
def test_process_samples_continuous_recurrent(self):
    """Check the shapes of processed samples for a recurrent policy.

    One epoch is trained with a single environment, then a fresh batch of
    paths is sampled and passed through ``algo.process_samples``. Every
    per-step entry of the result must have ``max_path_length`` rows.
    """
    env = TfEnv(DummyBoxEnv())
    spec = env.spec
    policy = GaussianLSTMPolicy(env_spec=spec)
    baseline = GaussianMLPBaseline(env_spec=spec)
    max_path_length = 100

    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        algo = BatchPolopt2(env_spec=spec,
                            policy=policy,
                            baseline=baseline,
                            max_path_length=max_path_length,
                            flatten_input=True)
        runner.setup(algo, env, sampler_args={'n_envs': 1})
        runner.train(n_epochs=1, batch_size=max_path_length)
        samples = algo.process_samples(0, runner.obtain_samples(0))

        # Since there is only 1 vec_env in the sampler and DummyBoxEnv
        # never terminate until it reaches max_path_length, batch size
        # must be max_path_length, i.e. 100
        per_step = (max_path_length, )
        obs_shape = (max_path_length, env.observation_space.flat_dim)
        act_shape = (max_path_length, env.action_space.flat_dim)
        assert samples['observations'].shape == obs_shape
        assert samples['actions'].shape == act_shape
        assert samples['rewards'].shape == per_step
        assert samples['baselines'].shape == per_step
        assert samples['returns'].shape == per_step

        # there is only 1 path
        assert samples['lengths'].shape == (1, )

        for key, shape in policy.state_info_specs:
            info_shape = (max_path_length, np.prod(shape))
            assert samples['agent_infos'][key].shape == info_shape

        # DummyBoxEnv has env_info dummy
        assert samples['env_infos']['dummy'].shape == per_step
        assert isinstance(samples['average_return'], float)
def test_is_pickleable(self):
    """Verify the built policy's distribution is unchanged by pickling.

    The ``log_std`` parameter is overwritten with ones, the distribution's
    ``loc`` and ``stddev()`` are evaluated, and the policy is pickled. The
    unpickled copy is rebuilt on a new placeholder in a fresh graph and
    session, and its outputs must equal the reference outputs.
    """
    env = MetaRLEnv(DummyBoxEnv(obs_dim=(1, ), action_dim=(1, )))
    obs_shape = [None, None, env.observation_space.flat_dim]
    obs_var = tf.compat.v1.placeholder(tf.float32,
                                       shape=obs_shape,
                                       name='obs')
    policy = GaussianLSTMPolicy(env_spec=env.spec,
                                state_include_action=False)
    policy.build(obs_var)

    # The environment is reset twice; only the second observation is used.
    env.reset()
    obs = env.reset()

    scope_name = 'GaussianLSTMPolicy/GaussianLSTMModel'
    with tf.compat.v1.variable_scope(scope_name, reuse=True):
        param = tf.compat.v1.get_variable(
            'dist_params/log_std_param/parameter')
    # Overwrite the parameter with ones before taking the reference output.
    all_ones = tf.ones_like(param).eval()
    param.load(all_ones)

    reference_fetches = [policy.distribution.loc,
                         policy.distribution.stddev()]
    output_before = self.sess.run(
        reference_fetches,
        feed_dict={policy.model.input: [[obs.flatten()], [obs.flatten()]]})

    serialized = pickle.dumps(policy)
    with tf.compat.v1.Session(graph=tf.Graph()) as fresh_sess:
        restored = pickle.loads(serialized)
        # The restored policy needs its own placeholder in the new graph.
        obs_var = tf.compat.v1.placeholder(
            tf.float32,
            shape=[None, None, env.observation_space.flat_dim],
            name='obs')
        restored.build(obs_var)
        restored_fetches = [restored.distribution.loc,
                            restored.distribution.stddev()]
        restored_feed = {
            restored.model.input: [[obs.flatten()], [obs.flatten()]]
        }
        output_after = fresh_sess.run(restored_fetches,
                                      feed_dict=restored_feed)
        assert np.array_equal(output_before, output_after)
def gaussian_lstm_policy(ctxt, env_id, seed):
    """Train TF-PPO with a Gaussian LSTM policy on the given environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = MetaRLEnv(normalize(gym.make(env_id)))
        spec = env.spec

        policy = GaussianLSTMPolicy(
            env_spec=spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        # The baseline regressor is fitted with a first-order optimizer.
        regressor_settings = {
            'hidden_sizes': (64, 64),
            'use_trust_region': False,
            'optimizer': FirstOrderOptimizer,
            'optimizer_args': {
                'batch_size': 32,
                'max_epochs': 10,
                'learning_rate': 1e-3,
            },
        }
        baseline = GaussianMLPBaseline(
            env_spec=spec,
            regressor_args=regressor_settings,
        )

        policy_optimizer_settings = {
            'batch_size': 32,
            'max_epochs': 10,
            'learning_rate': 1e-3,
        }
        algo = PPO(
            env_spec=spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=policy_optimizer_settings,
        )

        runner.setup(algo, env, sampler_args={'n_envs': 12})
        runner.train(n_epochs=5, batch_size=2048)
def test_get_action(self, mock_normal, obs_dim, action_dim, hidden_dim):
    """Check ``get_action`` and ``get_actions`` against fixed expectations.

    ``mock_normal`` is made to return 0.5, so the expected action is
    ``0.5 * exp(0.5) + 0.5`` in every component, with ``mean`` and
    ``log_std`` both expected to be 0.5.

    Args:
        mock_normal (mock.Mock): Mock whose return value is pinned to 0.5.
            Presumably a patch of the normal sampler used by the policy;
            the patching itself happens outside this method.
        obs_dim (tuple): Observation shape handed to ``DummyBoxEnv``.
        action_dim (tuple): Action shape handed to ``DummyBoxEnv``.
        hidden_dim (int): Accepted as part of the signature but not read
            anywhere in the body.

    """
    mock_normal.return_value = 0.5
    box_env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))

    # Swap in the simple model only while the policy is being constructed.
    model_target = ('metarl.tf.policies.'
                    'gaussian_lstm_policy.GaussianLSTMModel')
    with mock.patch(model_target, new=SimpleGaussianLSTMModel):
        policy = GaussianLSTMPolicy(env_spec=box_env.spec,
                                    state_include_action=False)

    expected_action = np.full(action_dim, 0.5 * np.exp(0.5) + 0.5)
    expected_mean = np.full(action_dim, 0.5)
    expected_log_std = np.full(action_dim, 0.5)

    policy.reset()
    obs = box_env.reset()

    # Single-observation API.
    action, agent_info = policy.get_action(obs)
    assert box_env.action_space.contains(action)
    assert np.allclose(action,
                       np.full(action_dim, expected_action),
                       atol=1e-6)
    assert np.array_equal(agent_info['mean'], expected_mean)
    assert np.array_equal(agent_info['log_std'], expected_log_std)

    # Batched API with a batch of one observation.
    actions, agent_infos = policy.get_actions([obs])
    batched = zip(actions, agent_infos['mean'], agent_infos['log_std'])
    for action, mean, log_std in batched:
        assert box_env.action_space.contains(action)
        assert np.allclose(action,
                           np.full(action_dim, expected_action),
                           atol=1e-6)
        assert np.array_equal(mean, expected_mean)
        assert np.array_equal(log_std, expected_log_std)
def continuous_mlp_baseline(ctxt, env_id, seed):
    """Train TF-PPO with a ContinuousMLPBaseline and an LSTM policy.

    Most settings are read from the ``hyper_params`` mapping that this
    function references from its enclosing scope.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt, max_cpus=hyper_params['num_proc']) as runner:
        env = MetaRLEnv(normalize(gym.make(env_id)))
        spec = env.spec

        policy = GaussianLSTMPolicy(
            env_spec=spec,
            hidden_dim=hyper_params['policy_hidden_sizes'],
            hidden_nonlinearity=hyper_params['hidden_nonlinearity'],
        )

        baseline = ContinuousMLPBaseline(
            env_spec=spec,
            regressor_args={'hidden_sizes': (64, 64)},
        )

        optimizer_settings = {
            'batch_size': 32,
            'max_epochs': 10,
            'learning_rate': 1e-3,
        }
        algo = PPO(env_spec=spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=hyper_params['max_path_length'],
                   discount=hyper_params['discount'],
                   gae_lambda=hyper_params['gae_lambda'],
                   lr_clip_range=hyper_params['lr_clip_range'],
                   entropy_method=hyper_params['entropy_method'],
                   policy_ent_coeff=hyper_params['policy_ent_coeff'],
                   optimizer_args=optimizer_settings,
                   center_adv=hyper_params['center_adv'],
                   stop_entropy_gradient=True)

        sampler_settings = {'n_envs': hyper_params['n_envs']}
        runner.setup(algo, env, sampler_args=sampler_settings)
        runner.train(n_epochs=hyper_params['n_epochs'],
                     batch_size=hyper_params['n_rollout_steps'])
def run_task(self, snapshot_config, *_):
    """Train PPO with a Gaussian LSTM policy on the ``self._env`` task.

    A CPU-only TensorFlow session (GPU device count set to 0) with 12
    intra-op and 12 inter-op threads is created and handed to the runner.

    Args:
        snapshot_config: Snapshot configuration forwarded to
            ``LocalTFRunner``.
        *_: Extra positional arguments; accepted and ignored.

    """
    # Use the tf.compat.v1 entry points so the session setup matches the
    # tf.compat.v1 API used elsewhere and also works where the bare
    # tf.ConfigProto / tf.Session aliases are not available.
    config = tf.compat.v1.ConfigProto(device_count={'GPU': 0},
                                      allow_soft_placement=True,
                                      intra_op_parallelism_threads=12,
                                      inter_op_parallelism_threads=12)
    sess = tf.compat.v1.Session(config=config)

    with LocalTFRunner(snapshot_config=snapshot_config, sess=sess) as runner:
        env = gym.make(self._env)
        env = TfEnv(normalize(env))
        env.reset()

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        # The baseline regressor is fitted with a first-order optimizer.
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
                optimizer=FirstOrderOptimizer,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                    tf_optimizer_args=dict(learning_rate=1e-3),
                ),
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
        )

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=5, batch_size=2048)
def test_get_action_state_include_action(self, obs_dim, action_dim,
                                         hidden_dim):
    """Check sampled actions stay in the action space with action input.

    The policy is built on a placeholder whose last dimension is the flat
    observation size plus the number of action components, then queried
    through both ``get_action`` and ``get_actions``.

    Args:
        obs_dim (tuple): Observation shape handed to ``DummyBoxEnv``.
        action_dim (tuple): Action shape handed to ``DummyBoxEnv``.
        hidden_dim (int): Hidden dimension passed to the policy.

    """
    env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))

    # The input is the observation concatenated with the previous action.
    input_width = env.observation_space.flat_dim + np.prod(action_dim)
    obs_var = tf.compat.v1.placeholder(tf.float32,
                                       shape=[None, None, input_width],
                                       name='obs')

    policy = GaussianLSTMPolicy(env_spec=env.spec,
                                hidden_dim=hidden_dim,
                                state_include_action=True)
    policy.build(obs_var)
    policy.reset()
    obs = env.reset()

    single_action, _ = policy.get_action(obs.flatten())
    assert env.action_space.contains(single_action)

    policy.reset()
    batch_actions, _ = policy.get_actions([obs.flatten()])
    for sampled in batch_actions:
        assert env.action_space.contains(sampled)
def setup_method(self):
    """Create the environment, policies and baseline shared by the tests.

    After the parent setup runs, this stores a normalized
    InvertedDoublePendulum-v2 environment, an MLP policy, LSTM and GRU
    policies, and a Gaussian MLP baseline on the test instance.
    """
    super().setup_method()

    pendulum = gym.make('InvertedDoublePendulum-v2')
    self.env = MetaRLEnv(normalize(pendulum))

    self.policy = GaussianMLPPolicy(
        env_spec=self.env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    self.lstm_policy = GaussianLSTMPolicy(env_spec=self.env.spec)
    self.gru_policy = GaussianGRUPolicy(env_spec=self.env.spec)

    self.baseline = GaussianMLPBaseline(
        env_spec=self.env.spec,
        regressor_args={'hidden_sizes': (32, 32)},
    )
def test_dist_info_sym_wrong_input(self):
    """Expect a TF error when the batch sizes of the two inputs disagree.

    The symbolic graph is built with a ``prev_action`` batch of three,
    while only two observations are fed at run time.
    """
    box_env = TfEnv(DummyBoxEnv(obs_dim=(1, ), action_dim=(1, )))
    obs_width = box_env.observation_space.flat_dim
    obs_ph = tf.compat.v1.placeholder(tf.float32,
                                      shape=(None, None, obs_width))

    # Swap in the simple model only while the policy is being constructed.
    model_target = ('metarl.tf.policies.'
                    'gaussian_lstm_policy.GaussianLSTMModel')
    with mock.patch(model_target, new=SimpleGaussianLSTMModel):
        policy = GaussianLSTMPolicy(env_spec=box_env.spec,
                                    state_include_action=True)

    policy.reset()
    obs = box_env.reset()

    mismatched_prev_action = np.zeros((3, 1, 1))
    policy.dist_info_sym(
        obs_var=obs_ph,
        state_info_vars={'prev_action': mismatched_prev_action},
        name='p2_sym')

    # observation batch size = 2 but prev_action batch size = 3
    two_observations = {obs_ph: [[obs.flatten()], [obs.flatten()]]}
    with pytest.raises(tf.errors.InvalidArgumentError):
        self.sess.run(policy.model.networks['p2_sym'].input,
                      feed_dict=two_observations)
def test_state_info_specs(self):
    """Expect no state info specs when the action is not part of the state."""
    box_env = MetaRLEnv(DummyBoxEnv(obs_dim=(4, ), action_dim=(4, )))
    policy = GaussianLSTMPolicy(env_spec=box_env.spec,
                                state_include_action=False)
    specs = policy.state_info_specs
    assert specs == []
def test_clone(self):
    """Check that a cloned policy carries the same ``env_spec``."""
    box_env = MetaRLEnv(DummyBoxEnv(obs_dim=(4, ), action_dim=(4, )))
    original = GaussianLSTMPolicy(env_spec=box_env.spec)
    duplicate = original.clone('GaussianLSTMPolicyClone')
    assert duplicate.env_spec == original.env_spec
def test_state_info_specs_with_state_include_action(self):
    """Expect a ``prev_action`` spec when the action is part of the state."""
    box_env = MetaRLEnv(DummyBoxEnv(obs_dim=(4, ), action_dim=(4, )))
    policy = GaussianLSTMPolicy(env_spec=box_env.spec,
                                state_include_action=True)
    expected_specs = [('prev_action', (4, ))]
    assert policy.state_info_specs == expected_specs
def test_invalid_env(self):
    """Expect ``ValueError`` when the policy is built on a discrete env."""
    discrete_env = TfEnv(DummyDiscreteEnv())
    spec = discrete_env.spec
    with pytest.raises(ValueError):
        GaussianLSTMPolicy(env_spec=spec)