def test_rl2_sampler_less_envs_than_meta_batch(self):
    """Test RL2Sampler with fewer environments than meta_batch_size."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        policy = GaussianMLPPolicy(env_spec=self.env.spec,
                                   hidden_sizes=[32, 32])
        baseline = LinearFeatureBaseline(env_spec=self.env.spec)
        algo = PPO(env_spec=self.env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=self.max_path_length,
                   discount=0.99)
        runner.setup(algo,
                     env=self.env,
                     sampler_cls=RL2Sampler,
                     sampler_args=dict(
                         meta_batch_size=self.meta_batch_size,
                         n_envs=self.meta_batch_size // 2))
        runner._start_worker()
        assert isinstance(runner._sampler, RL2Sampler)
        assert runner._sampler._envs_per_worker == 1
        all_indices = np.arange(self.meta_batch_size)
        for i in range(self.meta_batch_size // 2):
            assert all(runner._sampler._vec_envs_indices[i] ==
                       all_indices[i * 2:i * 2 + 2])
        paths = runner._sampler.obtain_samples(0)
        assert len(paths) == self.meta_batch_size
        assert len(paths[0]['observations']) == self.max_path_length
        paths = runner._sampler.obtain_samples(
            0, self.meta_batch_size * 10 * self.max_path_length)
        assert len(paths) == self.meta_batch_size * 10
        assert len(paths[0]['observations']) == self.max_path_length
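The test above pins down RL2Sampler's splitting rule: with n_envs set to meta_batch_size // 2, each vectorized environment is assigned two consecutive task indices. A minimal standalone sketch of that arithmetic, assuming the even-division behavior the assertions check (split_task_indices is a hypothetical helper, not part of metarl):

import numpy as np

def split_task_indices(meta_batch_size, n_envs):
    """Group consecutive task indices, meta_batch_size // n_envs per vec env."""
    per_env = meta_batch_size // n_envs
    indices = np.arange(meta_batch_size)
    return [indices[i * per_env:(i + 1) * per_env] for i in range(n_envs)]

# With meta_batch_size=10 and n_envs=5, each vec env gets two tasks:
# [0 1], [2 3], [4 5], [6 7], [8 9] -- matching the assertions above.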
def test_ppo_pendulum_gru(self):
    """Test PPO with Pendulum environment and recurrent policy."""
    with LocalTFRunner(snapshot_config) as runner:
        env = MetaRLEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        gru_policy = GaussianGRUPolicy(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=gru_policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 80
def ppo_cmb(env, seed, log_dir):
    """Benchmark PPO with a continuous MLP baseline.

    Args:
        env (gym_env): Environment of the task.
        seed (int): Random seed for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to the training results in csv format.

    """
    deterministic.set_seed(seed)
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=num_proc,
                            inter_op_parallelism_threads=num_proc)
    sess = tf.Session(config=config)
    with LocalTFRunner(snapshot_config, sess=sess,
                       max_cpus=num_proc) as runner:
        env = TfEnv(normalize(env))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=policy_params['policy_hidden_sizes'],
            hidden_nonlinearity=policy_params['hidden_nonlinearity'],
        )

        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            regressor_args=baseline_params['regressor_args'],
        )

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=algo_params['max_path_length'],
                   discount=algo_params['discount'],
                   gae_lambda=algo_params['gae_lambda'],
                   lr_clip_range=algo_params['lr_clip_range'],
                   entropy_method=algo_params['entropy_method'],
                   policy_ent_coeff=algo_params['policy_ent_coeff'],
                   optimizer_args=algo_params['optimizer_args'],
                   center_adv=algo_params['center_adv'],
                   stop_entropy_gradient=True)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo,
                     env,
                     sampler_args=dict(n_envs=algo_params['n_envs']))
        runner.train(n_epochs=algo_params['n_epochs'],
                     batch_size=algo_params['n_rollout_steps'])

        dowel_logger.remove_all()

        return tabular_log_file
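ppo_cmb returns the path to the dowel progress.csv it wrote. A minimal sketch of consuming that file for benchmark comparison, assuming pandas is available; the 'Evaluation/AverageReturn' column name is an assumption about what the algorithm logs, not something shown above:

import pandas as pd

def load_average_return(csv_path):
    """Read one trial's return curve from a dowel progress.csv."""
    df = pd.read_csv(csv_path)
    # Hypothetical column name; substitute the tabular key PPO actually records.
    return df['Evaluation/AverageReturn'].to_numpy()

# Example: compare two seeds of the same benchmark.
# curve_a = load_average_return(ppo_cmb(env, seed=1, log_dir='/tmp/trial1'))
# curve_b = load_average_return(ppo_cmb(env, seed=2, log_dir='/tmp/trial2'))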
def test_ppo_pendulum_flatten_input(self):
    """Test PPO with CartPole, exercising observation flattening."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = MetaRLEnv(
            normalize(ReshapeObservation(gym.make('CartPole-v1'), (2, 2))))
        policy = CategoricalMLPPolicy(
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.tanh,
        )
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_epochs=10,
                       learning_rate=1e-3,
                   ))
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 80
def gaussian_gru_policy(ctxt, env_id, seed):
    """Create Gaussian GRU Policy on TF-PPO.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = MetaRLEnv(normalize(gym.make(env_id)))

        policy = GaussianGRUPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
                optimizer=FirstOrderOptimizer,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                    learning_rate=1e-3,
                ),
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                learning_rate=1e-3,
            ),
        )

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=5, batch_size=2048)
def ppo_memorize_digits(ctxt=None, seed=1, batch_size=4000):
    """Train PPO on MemorizeDigits-v0 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        batch_size (int): Number of timesteps to use in each training step.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env = MetaRLEnv(normalize(gym.make('MemorizeDigits-v0')),
                        is_image=True)
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=(
                                          (32, (5, 5)),
                                          (64, (3, 3)),
                                          (64, (2, 2)),
                                      ),
                                      strides=(4, 2, 1),
                                      padding='VALID',
                                      hidden_sizes=(256, ))  # yapf: disable

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            regressor_args=dict(filters=(
                (32, (5, 5)),
                (64, (3, 3)),
                (64, (2, 2)),
            ),
                                strides=(4, 2, 1),
                                padding='VALID',
                                hidden_sizes=(256, ),
                                use_trust_region=True))  # yapf: disable

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_epochs=10,
                       learning_rate=1e-3,
                   ),
                   flatten_input=False)

        runner.setup(algo, env)
        runner.train(n_epochs=1000, batch_size=batch_size)
def ppo_pendulum(ctxt=None, seed=1):
    """Train PPO with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = MetaRLEnv(normalize(gym.make('InvertedDoublePendulum-v2')))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(32, 32),
                use_trust_region=True,
            ),
        )

        # NOTE: make sure when setting entropy_method to 'max', set
        # center_adv to False and turn off policy gradient. See
        # tf.algos.NPO for detailed documentation.
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=120, batch_size=2048, plot=False)
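Launchers like ppo_pendulum take an ExperimentContext as ctxt, so they are meant to be invoked through the experiment wrapper rather than called directly. A minimal launch sketch, assuming metarl mirrors garage's wrap_experiment decorator (the import path and wiring here are assumptions, not shown in these snippets):

from metarl import wrap_experiment  # assumed to exist, mirroring garage

# wrap_experiment builds the ExperimentContext (snapshot dir, etc.) and
# passes it in as ctxt when the wrapped function is called.
ppo_pendulum_exp = wrap_experiment(ppo_pendulum)
ppo_pendulum_exp(seed=1)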
def run_metarl(env, seed, log_dir):
    '''Create the metarl model and start training.

    Replace PPO with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path to the tabular log file (progress.csv).
    '''
    deterministic.set_seed(seed)
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=12,
                            inter_op_parallelism_threads=12)
    sess = tf.Session(config=config)
    with LocalTFRunner(snapshot_config, sess=sess, max_cpus=12) as runner:
        env = TfEnv(normalize(env))

        policy = CategoricalLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=488, batch_size=2048)

        dowel_logger.remove_all()

        return tabular_log_file
def categorical_cnn_policy(ctxt, env_id, seed):
    """Create Categorical CNN Policy on TF-PPO.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt, max_cpus=12) as runner:
        env = MetaRLEnv(normalize(gym.make(env_id)))

        policy = CategoricalCNNPolicy(
            env_spec=env.spec,
            conv_filters=hyper_params['conv_filters'],
            conv_strides=hyper_params['conv_strides'],
            conv_pad=hyper_params['conv_pad'],
            hidden_sizes=hyper_params['hidden_sizes'])

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                filters=hyper_params['conv_filters'],
                strides=hyper_params['conv_strides'],
                padding=hyper_params['conv_pad'],
                hidden_sizes=hyper_params['hidden_sizes'],
                use_trust_region=hyper_params['use_trust_region']))

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                learning_rate=1e-3,
            ),
            flatten_input=False,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_params['n_epochs'],
                     batch_size=hyper_params['batch_size'])
def test_ppo_pendulum(self):
    """Test PPO with Pendulum environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        algo = PPO(env_spec=self.env.spec,
                   policy=self.policy,
                   baseline=self.baseline,
                   max_path_length=100,
                   discount=0.99,
                   lr_clip_range=0.01,
                   optimizer_args=dict(batch_size=32, max_epochs=10))
        runner.setup(algo, self.env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 35
def ppo_ml1(ctxt=None, seed=1):
    """Run task."""
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        Ml1_reach_envs = get_ML1_envs_test(env_id)
        env = MTMetaWorldWrapper(Ml1_reach_envs)

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
            hidden_w_init=tf.constant_initializer(np.sqrt(2)),
            hidden_b_init=tf.constant_initializer(np.sqrt(2)),
        )

        # baseline = LinearFeatureBaseline(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
                hidden_w_init=tf.constant_initializer(np.sqrt(2)),
                hidden_b_init=tf.constant_initializer(np.sqrt(2)),
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=150,
            discount=0.99,
            gae_lambda=0.97,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=30,
                max_epochs=4,
                tf_optimizer_args=dict(learning_rate=3e-4),
            ),
        )

        timesteps = 6000000
        batch_size = 150 * env.num_tasks
        epochs = timesteps // batch_size

        print(f'epochs: {epochs}, batch_size: {batch_size}')

        runner.setup(algo, env, sampler_args={'n_envs': 1})
        runner.train(n_epochs=epochs, batch_size=batch_size, plot=False)
def run_task(self, snapshot_config, *_):
    """Set up and train PPO with a Gaussian GRU policy on self._env."""
    config = tf.ConfigProto(device_count={'GPU': 0},
                            allow_soft_placement=True,
                            intra_op_parallelism_threads=12,
                            inter_op_parallelism_threads=12)
    sess = tf.Session(config=config)
    with LocalTFRunner(snapshot_config=snapshot_config,
                       sess=sess) as runner:
        env = gym.make(self._env)
        env = TfEnv(normalize(env))
        env.reset()

        policy = GaussianGRUPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
                optimizer=FirstOrderOptimizer,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                    tf_optimizer_args=dict(learning_rate=1e-3),
                ),
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
        )

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=5, batch_size=2048)
def continuous_mlp_baseline(ctxt, env_id, seed):
    """Create Continuous MLP Baseline on TF-PPO.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt, max_cpus=hyper_params['num_proc']) as runner:
        env = MetaRLEnv(normalize(gym.make(env_id)))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=hyper_params['policy_hidden_sizes'],
            hidden_nonlinearity=hyper_params['hidden_nonlinearity'],
        )

        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(64, 64)),
        )

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=hyper_params['max_path_length'],
                   discount=hyper_params['discount'],
                   gae_lambda=hyper_params['gae_lambda'],
                   lr_clip_range=hyper_params['lr_clip_range'],
                   entropy_method=hyper_params['entropy_method'],
                   policy_ent_coeff=hyper_params['policy_ent_coeff'],
                   optimizer_args=dict(
                       batch_size=32,
                       max_epochs=10,
                       learning_rate=1e-3,
                   ),
                   center_adv=hyper_params['center_adv'],
                   stop_entropy_gradient=True)

        runner.setup(algo,
                     env,
                     sampler_args=dict(n_envs=hyper_params['n_envs']))
        runner.train(n_epochs=hyper_params['n_epochs'],
                     batch_size=hyper_params['n_rollout_steps'])
def test_ppo_with_maximum_entropy(self):
    """Test PPO with maximum entropy method."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        algo = PPO(env_spec=self.env.spec,
                   policy=self.policy,
                   baseline=self.baseline,
                   max_path_length=100,
                   discount=0.99,
                   lr_clip_range=0.01,
                   optimizer_args=dict(batch_size=32, max_epochs=10),
                   stop_entropy_gradient=True,
                   entropy_method='max',
                   policy_ent_coeff=0.02,
                   center_adv=False)
        runner.setup(algo, self.env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 35
def ppo_mt10_sampling(ctxt=None, seed=1):
    """Run task."""
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = MultiEnvSamplingWrapper(MT10_envs,
                                      env_ids,
                                      len(env_ids) - skip_size,
                                      sample_strategy=round_robin_strategy)

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=150,
            discount=0.99,
            gae_lambda=0.97,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=3e-4),
            ),
        )

        batch_size = (len(env_ids) - skip_size) * 10 * 150
        epochs = (total_steps // batch_size) + 10

        print('epochs:', epochs, 'batch_size:', batch_size)

        runner.setup(algo, env)
        runner.train(n_epochs=epochs, batch_size=batch_size, plot=False)
def categorical_gru_policy(ctxt, env_id, seed):
    """Create Categorical GRU Policy on TF-PPO.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt, max_cpus=12) as runner:
        env = MetaRLEnv(normalize(gym.make(env_id)))

        policy = CategoricalGRUPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                learning_rate=1e-3,
            ),
        )

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=488, batch_size=2048)
def test_ppo_with_neg_log_likeli_entropy_estimation_and_regularized(self):
    """Test PPO with negative log likelihood entropy estimation and
    regularized entropy method.
    """
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        algo = PPO(env_spec=self.env.spec,
                   policy=self.policy,
                   baseline=self.baseline,
                   max_path_length=100,
                   discount=0.99,
                   lr_clip_range=0.01,
                   optimizer_args=dict(batch_size=32, max_epochs=10),
                   stop_entropy_gradient=True,
                   use_neg_logli_entropy=True,
                   entropy_method='regularized',
                   policy_ent_coeff=0.0,
                   center_adv=True)
        runner.setup(algo, self.env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 35
def run_task(snapshot_config, variant_data, *_):
    """Run task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        variant_data (dict): Custom arguments for the task.
        *_ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(normalize(gym.make('MemorizeDigits-v0')))

        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      conv_filters=(32, 64, 64),
                                      conv_filter_sizes=(5, 3, 2),
                                      conv_strides=(4, 2, 1),
                                      conv_pad='VALID',
                                      hidden_sizes=(256, ))

        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       regressor_args=dict(
                                           num_filters=(32, 64, 64),
                                           filter_dims=(5, 3, 2),
                                           strides=(4, 2, 1),
                                           padding='VALID',
                                           hidden_sizes=(256, ),
                                           use_trust_region=True))

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   max_kl_step=0.01,
                   flatten_input=False)

        runner.setup(algo, env)
        runner.train(n_epochs=1000, batch_size=variant_data['batch_size'])
def multi_env_ppo(ctxt=None, seed=1):
    """Train PPO on two Atari environments simultaneously.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env1 = MetaRLEnv(normalize(gym.make('Adventure-ram-v4')))
        env2 = MetaRLEnv(normalize(gym.make('Alien-ram-v4')))
        env = MultiEnvWrapper([env1, env2])

        policy = CategoricalMLPPolicy(
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_epochs=10,
                       learning_rate=1e-3,
                   ))

        runner.setup(algo, env)
        runner.train(n_epochs=120, batch_size=2048, plot=False)
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        *_ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env1 = TfEnv(normalize(gym.make('Adventure-ram-v4')))
        env2 = TfEnv(normalize(gym.make('Alien-ram-v4')))
        env = MultiEnvWrapper([env1, env2])

        policy = CategoricalMLPPolicy(
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_epochs=10,
                       tf_optimizer_args=dict(learning_rate=1e-3),
                   ))

        runner.setup(algo, env)
        runner.train(n_epochs=120, batch_size=2048, plot=False)
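The run_task-style entry points above follow the older callback convention, in which a driver constructs the SnapshotConfig and invokes the callback. A minimal launch sketch, assuming metarl keeps garage's legacy run_experiment helper with this signature (an assumption; the snippets themselves do not show the driver):

from metarl.experiment import run_experiment  # assumed legacy entry point

run_experiment(
    run_task,              # callback receiving (snapshot_config, *args)
    snapshot_mode='last',  # keep only the most recent snapshot
    seed=1,
)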
def test_rl2_sampler_invalid_num_of_env_again(self):
    """Test that RL2Sampler rejects an n_envs incompatible with
    meta_batch_size.
    """
    with pytest.raises(
            ValueError,
            match='n_envs must be a multiple of meta_batch_size'):
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            policy = GaussianMLPPolicy(env_spec=self.env.spec,
                                       hidden_sizes=[32, 32])
            baseline = LinearFeatureBaseline(env_spec=self.env.spec)
            algo = PPO(env_spec=self.env.spec,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=self.max_path_length,
                       discount=0.99)
            runner.setup(algo,
                         env=self.env,
                         sampler_cls=RL2Sampler,
                         sampler_args=dict(
                             meta_batch_size=self.meta_batch_size,
                             n_envs=self.meta_batch_size + 1))
            runner._start_worker()
            runner._sampler.obtain_samples(0)
def test_ppo_pendulum_continuous_baseline(self):
    """Test PPO with Pendulum environment and ContinuousMLPBaseline."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = MetaRLEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 100
        env.close()
def run_metarl(env, seed, log_dir):
    '''Create the metarl model and start training.

    Replace PPO with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path to the tabular log file (progress.csv).
    '''
    deterministic.set_seed(seed)
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=12,
                            inter_op_parallelism_threads=12)
    sess = tf.Session(config=config)
    with LocalTFRunner(snapshot_config, sess=sess, max_cpus=12) as runner:
        env = TfEnv(normalize(env))

        policy = CategoricalCNNPolicy(
            env_spec=env.spec,
            conv_filters=params['conv_filters'],
            conv_filter_sizes=params['conv_filter_sizes'],
            conv_strides=params['conv_strides'],
            conv_pad=params['conv_pad'],
            hidden_sizes=params['hidden_sizes'])

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                num_filters=params['conv_filters'],
                filter_dims=params['conv_filter_sizes'],
                strides=params['conv_strides'],
                padding=params['conv_pad'],
                hidden_sizes=params['hidden_sizes'],
                use_trust_region=params['use_trust_region']))

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
            flatten_input=False,
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=params['n_epochs'],
                     batch_size=params['batch_size'])

        dowel_logger.remove_all()

        return tabular_log_file