def test_session(self):
    """Check that LocalTFRunner installs and honors a default TF session."""
    with LocalTFRunner(snapshot_config):
        assert tf.compat.v1.get_default_session() is not None, (
            'LocalTFRunner() should provide a default tf session.')

    # When an explicit session is supplied, it must become the default.
    explicit_session = tf.compat.v1.Session()
    with LocalTFRunner(snapshot_config, sess=explicit_session):
        assert tf.compat.v1.get_default_session() is explicit_session, (
            'LocalTFRunner(sess) should use sess as default session.')
def test_ppo_pendulum_gru(self):
    """Test PPO with InvertedDoublePendulum-v2 and a recurrent GRU policy."""
    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        gru_policy = GaussianGRUPolicy(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=gru_policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            # Max-entropy objective: entropy gradient stopped and
            # advantages left uncentered.
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        # Smoke-level performance bar for this short training run.
        assert last_avg_ret > 80
def test_ppo_pendulum_flatten_input(self):
    """Test PPO with CartPole to test observation flattening."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        # ReshapeObservation turns CartPole's flat observation into a
        # (2, 2) matrix so PPO's input-flattening path is exercised.
        env = TfEnv(
            normalize(ReshapeObservation(gym.make('CartPole-v1'), (2, 2))))
        policy = CategoricalMLPPolicy(
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.tanh,
        )
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_epochs=10,
                       tf_optimizer_args=dict(learning_rate=1e-3),
                   ))
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        # Smoke-level performance bar for this short training run.
        assert last_avg_ret > 80
def run_task(snapshot_config, *_):
    """Run the job.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): Configuration
            values for snapshotting.
        *_ (object): Hyperparameters (unused).

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        pendulum = TfEnv(normalize(gym.make('InvertedPendulum-v2')))
        feature_baseline = LinearFeatureBaseline(env_spec=pendulum.spec)
        mlp_policy = GaussianMLPPolicy(env_spec=pendulum.spec,
                                       hidden_sizes=(32, 32))
        trpo = TRPO(env_spec=pendulum.spec,
                    policy=mlp_policy,
                    baseline=feature_baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01)
        runner.setup(trpo,
                     pendulum,
                     sampler_cls=ISSampler,
                     sampler_args=dict(n_backtrack=1))
        runner.train(n_epochs=200, batch_size=4000)
def test_cem_cartpole(self):
    """Test CEM with Cartpole-v1 environment."""
    with LocalTFRunner(snapshot_config) as runner:
        cartpole = TfEnv(env_name='CartPole-v1')
        baseline = LinearFeatureBaseline(env_spec=cartpole.spec)
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=cartpole.spec,
                                      hidden_sizes=(32, 32))
        cem = CEM(env_spec=cartpole.spec,
                  policy=policy,
                  baseline=baseline,
                  best_frac=0.1,
                  max_path_length=100,
                  n_samples=10)
        runner.setup(cem, cartpole, sampler_cls=OnPolicyVectorizedSampler)
        average_return = runner.train(n_epochs=10, batch_size=2048)
        # Smoke-level performance bar for this short training run.
        assert average_return > 40
        cartpole.close()
def test_rl2_sampler_less_envs_than_meta_batch(self):
    """RL2Sampler behavior when n_envs < meta_batch_size.

    With n_envs = meta_batch_size // 2, each vectorized env slot is
    expected to hold one env and be assigned two consecutive task
    indices.
    """
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        policy = GaussianMLPPolicy(env_spec=self.env.spec,
                                   hidden_sizes=[32, 32])
        baseline = LinearFeatureBaseline(env_spec=self.env.spec)
        algo = PPO(env_spec=self.env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=self.max_path_length,
                   discount=0.99)
        runner.setup(algo,
                     env=self.env,
                     sampler_cls=RL2Sampler,
                     sampler_args=dict(
                         meta_batch_size=self.meta_batch_size,
                         n_envs=self.meta_batch_size // 2))
        runner._start_worker()
        assert isinstance(runner._sampler, RL2Sampler)
        assert runner._sampler._envs_per_worker == 1
        all_indices = np.arange(self.meta_batch_size)
        # Each vec env slot covers two consecutive task indices.
        for i in range(self.meta_batch_size // 2):
            assert all(runner._sampler._vec_envs_indices[i] ==
                       all_indices[i * 2:i * 2 + 2])
        # Default sample size: one path per meta-batch task.
        paths = runner._sampler.obtain_samples(0)
        assert len(paths) == self.meta_batch_size
        assert len(paths[0]['observations']) == self.max_path_length
        # Explicit batch size: ten full-length paths per meta-batch task.
        paths = runner._sampler.obtain_samples(
            0, self.meta_batch_size * 10 * self.max_path_length)
        assert len(paths) == self.meta_batch_size * 10
        assert len(paths[0]['observations']) == self.max_path_length
def run_task(snapshot_config, *_):
    """Train CMA_ES with Cartpole-v1 environment.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        *_ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        n_samples = 20
        algo = CMAES(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     max_path_length=100,
                     n_samples=n_samples)
        runner.setup(algo, env, sampler_cls=OnPolicyVectorizedSampler)
        runner.train(n_epochs=100, batch_size=1000)
def trpo_ml1(ctxt=None, seed=1):
    """Run TRPO on the MetaWorld ML1 test environments.

    Args:
        ctxt (metarl.experiment.SnapshotConfig): Snapshot configuration
            passed to LocalTFRunner.
        seed (int): Seed for the random number generators.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        # NOTE(review): `env_id` is read from enclosing scope — confirm it
        # is defined at module level.
        Ml1_reach_envs = get_ML1_envs_test(env_id)
        env = MTMetaWorldWrapper(Ml1_reach_envs)
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(64, 64),
                                use_trust_region=False),
        )
        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=150,
                    discount=0.99,
                    gae_lambda=0.97,
                    max_kl_step=0.01)
        # One path (150 steps) per task per batch; epoch count chosen to
        # hit the total timestep budget.
        timesteps = 6000000
        batch_size = 150 * env.num_tasks
        epochs = timesteps // batch_size
        print(f'epochs: {epochs}, batch_size: {batch_size}')
        runner.setup(algo, env, sampler_args={'n_envs': 1})
        runner.train(n_epochs=epochs, batch_size=batch_size, plot=False)
def trpo_mt50(ctxt=None, seed=1):
    """Run TRPO on the MT50 multi-task environment set.

    Args:
        ctxt (metarl.experiment.SnapshotConfig): Snapshot configuration
            passed to LocalTFRunner.
        seed (int): Seed for the random number generators.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        # NOTE(review): `MT50_envs` and `env_ids` are read from enclosing
        # scope — confirm they are defined at module level.
        env = MultiEnvWrapper(MT50_envs,
                              env_ids,
                              sample_strategy=round_robin_strategy)
        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))
        # baseline = LinearFeatureBaseline(env_spec=env.spec)
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
            ),
        )
        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=150,
                    discount=0.99,
                    gae_lambda=0.97,
                    max_kl_step=0.01)
        runner.setup(algo, env)
        # Batch: 10 paths of length 150 per environment.
        runner.train(n_epochs=1500, batch_size=len(MT50_envs)*10*150)
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        *_ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        # Two point-mass tasks with opposite goals, wrapped as one
        # multi-task environment.
        env1 = TfEnv(normalize(PointEnv(goal=(-1., 0.))))
        env2 = TfEnv(normalize(PointEnv(goal=(1., 0.))))
        env = MultiEnvWrapper([env1, env2])
        policy = GaussianMLPPolicy(env_spec=env.spec)
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)
        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=2048, plot=False)
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): Configuration
            values for snapshotting.
        *_ (object): Hyperparameters (unused).

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('Swimmer-v2'))
        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=500,
                    discount=0.99,
                    max_kl_step=0.01)
        # NOTE(review): `seed` is read from enclosing scope — confirm it is
        # defined at module level.
        runner.setup(algo,
                     env,
                     sampler_cls=RaySampler,
                     sampler_args={'seed': seed})
        runner.train(n_epochs=40, batch_size=4000)
def erwr_cartpole(ctxt=None, seed=1):
    """Train with ERWR on CartPole-v1 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        cartpole = TfEnv(env_name='CartPole-v1')
        linear_baseline = LinearFeatureBaseline(env_spec=cartpole.spec)
        categorical_policy = CategoricalMLPPolicy(name='policy',
                                                  env_spec=cartpole.spec,
                                                  hidden_sizes=(32, 32))
        erwr = ERWR(env_spec=cartpole.spec,
                    policy=categorical_policy,
                    baseline=linear_baseline,
                    max_path_length=100,
                    discount=0.99)
        runner.setup(algo=erwr, env=cartpole)
        runner.train(n_epochs=100, batch_size=10000, plot=False)
def test_process_samples_discrete_non_recurrent(self):
    """process_samples output shapes for a discrete, non-recurrent policy."""
    env = TfEnv(DummyDiscreteEnv())
    policy = CategoricalMLPPolicy(env_spec=env.spec)
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    max_path_length = 100
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        algo = BatchPolopt2(env_spec=env.spec,
                            policy=policy,
                            baseline=baseline,
                            max_path_length=max_path_length,
                            flatten_input=True)
        runner.setup(algo, env, sampler_args=dict(n_envs=1))
        runner.train(n_epochs=1, batch_size=max_path_length)
        paths = runner.obtain_samples(0)
        samples = algo.process_samples(0, paths)
        # Since there is only 1 vec_env in the sampler and DummyDiscreteEnv
        # always terminate, number of paths must be max_path_length, and
        # batch size must be max_path_length as well, i.e. 100
        assert samples['observations'].shape == (
            max_path_length, env.observation_space.flat_dim)
        assert samples['actions'].shape == (max_path_length,
                                            env.action_space.n)
        assert samples['rewards'].shape == (max_path_length, )
        assert samples['baselines'].shape == (max_path_length, )
        assert samples['returns'].shape == (max_path_length, )
        # there is 100 path
        assert samples['lengths'].shape == (max_path_length, )
        # non-recurrent policy has empty agent info
        assert samples['agent_infos'] == {}
        # non-recurrent policy has empty env info
        assert samples['env_infos'] == {}
        assert isinstance(samples['average_return'], float)
def run_metarl(env, seed, log_dir):
    """Create metarl model and training.

    Replace the ddpg with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path of the tabular (CSV) log file.
    """
    deterministic.set_seed(seed)

    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(env))
        # Set up params for ddpg
        action_noise = OUStrategy(env.spec, sigma=params['sigma'])
        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=params['qf_hidden_sizes'],
                                    hidden_nonlinearity=tf.nn.relu)
        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'])
        # NOTE(review): `params` is read from enclosing scope — confirm it
        # is defined at module level.
        ddpg = DDPG(env_spec=env.spec,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    steps_per_epoch=params['steps_per_epoch'],
                    policy_lr=params['policy_lr'],
                    qf_lr=params['qf_lr'],
                    target_update_tau=params['tau'],
                    n_train_steps=params['n_train_steps'],
                    discount=params['discount'],
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    policy_optimizer=tf.train.AdamOptimizer,
                    qf_optimizer=tf.train.AdamOptimizer)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        tensorboard_log_dir = osp.join(log_dir)
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(tensorboard_log_dir))

        runner.setup(ddpg, env)
        runner.train(n_epochs=params['n_epochs'],
                     batch_size=params['n_rollout_steps'])

        dowel_logger.remove_all()

        return tabular_log_file
def test_ddpg_double_pendulum(self):
    """Test DDPG with InvertedDoublePendulum-v2 environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))
        action_noise = OUStrategy(env.spec, sigma=0.2)
        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e5),
                                           time_horizon=100)
        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-4,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            steps_per_epoch=20,
            target_update_tau=1e-2,
            n_train_steps=50,
            discount=0.9,
            min_buffer_size=int(5e3),
            exploration_strategy=action_noise,
        )
        runner.setup(algo, env)
        # Smoke-level performance bar for this short training run.
        last_avg_ret = runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 60
        env.close()
def test_process_samples_continuous_recurrent(self):
    """process_samples output shapes for a continuous, recurrent policy."""
    env = TfEnv(DummyBoxEnv())
    policy = GaussianLSTMPolicy(env_spec=env.spec)
    baseline = GaussianMLPBaseline(env_spec=env.spec)
    max_path_length = 100
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        algo = BatchPolopt2(env_spec=env.spec,
                            policy=policy,
                            baseline=baseline,
                            max_path_length=max_path_length,
                            flatten_input=True)
        runner.setup(algo, env, sampler_args=dict(n_envs=1))
        runner.train(n_epochs=1, batch_size=max_path_length)
        paths = runner.obtain_samples(0)
        samples = algo.process_samples(0, paths)
        # Since there is only 1 vec_env in the sampler and DummyBoxEnv
        # never terminate until it reaches max_path_length, batch size
        # must be max_path_length, i.e. 100
        assert samples['observations'].shape == (
            max_path_length, env.observation_space.flat_dim)
        assert samples['actions'].shape == (max_path_length,
                                            env.action_space.flat_dim)
        assert samples['rewards'].shape == (max_path_length, )
        assert samples['baselines'].shape == (max_path_length, )
        assert samples['returns'].shape == (max_path_length, )
        # there is only 1 path
        assert samples['lengths'].shape == (1, )
        # agent_infos carries the recurrent policy's per-step state info
        for key, shape in policy.state_info_specs:
            assert samples['agent_infos'][key].shape == (max_path_length,
                                                         np.prod(shape))
        # DummyBoxEnv has env_info dummy
        assert samples['env_infos']['dummy'].shape == (max_path_length, )
        assert isinstance(samples['average_return'], float)
def test_vpg_cartpole(self):
    """Test VPG with CartPole-v1 environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        cartpole = TfEnv(env_name='CartPole-v1')
        baseline = LinearFeatureBaseline(env_spec=cartpole.spec)
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=cartpole.spec,
                                      hidden_sizes=(32, 32))
        vpg = VPG(env_spec=cartpole.spec,
                  policy=policy,
                  baseline=baseline,
                  max_path_length=100,
                  discount=0.99,
                  optimizer_args=dict(
                      tf_optimizer_args=dict(learning_rate=0.01, )))
        runner.setup(vpg, cartpole)
        final_return = runner.train(n_epochs=10, batch_size=10000)
        # Smoke-level performance bar for this short training run.
        assert final_return > 90
        cartpole.close()
def test_dm_control_tf_policy(self):
    """Smoke-test TRPO on the first dm_control suite task."""
    task = ALL_TASKS[0]

    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(DmControlEnv.from_suite(*task))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=5,
            discount=0.99,
            max_kl_step=0.01,
        )

        runner.setup(algo, env)
        # Single tiny epoch: only checks that the pipeline runs.
        runner.train(n_epochs=1, batch_size=10)

        env.close()
def cem_cartpole(ctxt=None, seed=1):
    """Train CEM with Cartpole-v1 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        cartpole = TfEnv(env_name='CartPole-v1')
        linear_baseline = LinearFeatureBaseline(env_spec=cartpole.spec)
        categorical_policy = CategoricalMLPPolicy(name='policy',
                                                  env_spec=cartpole.spec,
                                                  hidden_sizes=(32, 32))
        cem = CEM(env_spec=cartpole.spec,
                  policy=categorical_policy,
                  baseline=linear_baseline,
                  best_frac=0.05,
                  max_path_length=100,
                  n_samples=20)
        runner.setup(cem, cartpole, sampler_cls=OnPolicyVectorizedSampler)
        runner.train(n_epochs=100, batch_size=1000)
def test_trpo_cnn_cubecrash(self):
    """Test TRPO with a CNN policy on the image-based CubeCrash-v0 env."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(normalize(gym.make('CubeCrash-v0')))
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      conv_filters=(32, 64),
                                      conv_filter_sizes=(8, 4),
                                      conv_strides=(4, 2),
                                      conv_pad='VALID',
                                      hidden_sizes=(32, 32))

        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       regressor_args=dict(
                                           num_filters=(32, 64),
                                           filter_dims=(8, 4),
                                           strides=(4, 2),
                                           padding='VALID',
                                           hidden_sizes=(32, 32),
                                           use_trust_region=True))

        # flatten_input=False: observations are not flattened before
        # being fed to the CNN policy/baseline.
        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.98,
                    max_kl_step=0.01,
                    policy_ent_coeff=0.0,
                    flatten_input=False)

        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        # Smoke-level performance bar for this short training run.
        assert last_avg_ret > -0.9

        env.close()
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): Configuration
            values for snapshotting.
        *_ (object): Hyperparameters (unused).

    """
    # NOTE(review): `n_envs` and `max_path_length` are read from enclosing
    # scope — confirm they are defined at module level.
    with LocalTFRunner(snapshot_config=snapshot_config,
                       max_cpus=n_envs) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=max_path_length,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo=algo,
                     env=env,
                     sampler_cls=BatchSampler,
                     sampler_args={'n_envs': n_envs})

        runner.train(n_epochs=100, batch_size=4000, plot=False)
def run_task(snapshot_config, *_):
    """Defines the main experiment routine.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): Configuration
            values for snapshotting.
        *_ (object): Hyperparameters (unused).

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalLSTMPolicy(name='policy', env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        # Conjugate-gradient optimizer with finite-difference
        # Hessian-vector products.
        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01,
                    optimizer=ConjugateGradientOptimizer,
                    optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                        base_eps=1e-5)))

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000)
def ppo_cmb(env, seed, log_dir):
    """Create test continuous mlp baseline on ppo.

    Args:
        env (gym_env): Environment of the task.
        seed (int): Random seed for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: training results in csv format.

    """
    deterministic.set_seed(seed)
    # NOTE(review): `num_proc`, `policy_params`, `baseline_params` and
    # `algo_params` are read from enclosing scope — confirm they are
    # defined at module level.
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=num_proc,
                            inter_op_parallelism_threads=num_proc)
    sess = tf.Session(config=config)
    with LocalTFRunner(snapshot_config, sess=sess,
                       max_cpus=num_proc) as runner:
        env = TfEnv(normalize(env))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=policy_params['policy_hidden_sizes'],
            hidden_nonlinearity=policy_params['hidden_nonlinearity'],
        )

        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            regressor_args=baseline_params['regressor_args'],
        )

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=algo_params['max_path_length'],
                   discount=algo_params['discount'],
                   gae_lambda=algo_params['gae_lambda'],
                   lr_clip_range=algo_params['lr_clip_range'],
                   entropy_method=algo_params['entropy_method'],
                   policy_ent_coeff=algo_params['policy_ent_coeff'],
                   optimizer_args=algo_params['optimizer_args'],
                   center_adv=algo_params['center_adv'],
                   stop_entropy_gradient=True)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo,
                     env,
                     sampler_args=dict(n_envs=algo_params['n_envs']))
        runner.train(n_epochs=algo_params['n_epochs'],
                     batch_size=algo_params['n_rollout_steps'])

        dowel_logger.remove_all()

        return tabular_log_file
def test_te_ppo(self):
    """Smoke-test task-embedding PPO (TEPPO) for one epoch."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        algo = TEPPO(env_spec=self.env.spec,
                     policy=self.policy,
                     baseline=self.baseline,
                     inference=self.inference,
                     max_path_length=self.max_path_length,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=self.policy_ent_coeff,
                     encoder_ent_coeff=self.encoder_ent_coeff,
                     inference_ce_coeff=self.inference_ce_coeff,
                     entropy_method='max',
                     stop_entropy_gradient=True,
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_epochs=10,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_epochs=10,
                     ),
                     center_adv=True,
                     stop_ce_gradient=True)

        runner.setup(algo,
                     self.env,
                     sampler_cls=LocalSampler,
                     sampler_args=None,
                     worker_class=TaskEmbeddingWorker)
        runner.train(n_epochs=1, batch_size=self.batch_size, plot=False)
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        *_ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('FetchReach-v1'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        # Goal-conditioned networks: the goal is included in the input.
        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='Policy',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name='QFunction',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        # Hindsight experience replay; relabeled transitions are rewarded
        # via the environment's compute_reward.
        replay_buffer = HerReplayBuffer(env_spec=env.spec,
                                        size_in_transitions=int(1e6),
                                        time_horizon=100,
                                        replay_k=0.4,
                                        reward_fun=env.compute_reward)

        ddpg = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-3,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=0.05,
            steps_per_epoch=20,
            max_path_length=100,
            n_train_steps=40,
            discount=0.9,
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        runner.setup(algo=ddpg, env=env)

        runner.train(n_epochs=50, batch_size=100)
def run_task(snapshot_config, *_):
    """Wrap TD3 training task in the run_task function.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): Configuration
            values for snapshotting.
        *_ (object): Hyperparameters (unused).

    """
    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))

        action_noise = GaussianStrategy(env.spec, max_sigma=0.1, min_sigma=0.1)

        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)

        # TD3's twin Q-functions, given distinct variable-scope names.
        qf = ContinuousMLPQFunction(name='ContinuousMLPQFunction',
                                    env_spec=env.spec,
                                    hidden_sizes=[400, 300],
                                    action_merge_layer=0,
                                    hidden_nonlinearity=tf.nn.relu)

        qf2 = ContinuousMLPQFunction(name='ContinuousMLPQFunction2',
                                     env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     action_merge_layer=0,
                                     hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=250)

        td3 = TD3(env_spec=env.spec,
                  policy=policy,
                  policy_lr=1e-4,
                  qf_lr=1e-3,
                  qf=qf,
                  qf2=qf2,
                  replay_buffer=replay_buffer,
                  target_update_tau=1e-2,
                  steps_per_epoch=20,
                  n_train_steps=1,
                  smooth_return=False,
                  discount=0.99,
                  buffer_batch_size=100,
                  min_buffer_size=1e4,
                  exploration_strategy=action_noise,
                  policy_optimizer=tf.train.AdamOptimizer,
                  qf_optimizer=tf.train.AdamOptimizer)

        runner.setup(td3, env)
        runner.train(n_epochs=500, batch_size=250)
def tf_ppo_pendulum(ctxt=None, seed=1):
    """Train RL2PPO with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=True,
            ),
        )

        # NOTE: make sure when setting entropy_method to 'max', set
        # center_adv to False and turn off policy gradient. See
        # tf.algos.NPO for detailed documentation.
        algo = RL2PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                learning_rate=1e-3,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.002,
            center_adv=False,
        )

        runner.setup(algo, env)

        runner.train(n_epochs=120, batch_size=4096, plot=False)
def test_td3_pendulum(self):
    """Test TD3 with InvertedDoublePendulum-v2 environment."""
    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))

        action_noise = GaussianStrategy(env.spec, max_sigma=0.1, min_sigma=0.1)

        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)

        # TD3's twin Q-functions, given distinct variable-scope names.
        qf = ContinuousMLPQFunction(name='ContinuousMLPQFunction',
                                    env_spec=env.spec,
                                    hidden_sizes=[400, 300],
                                    action_merge_layer=0,
                                    hidden_nonlinearity=tf.nn.relu)

        qf2 = ContinuousMLPQFunction(name='ContinuousMLPQFunction2',
                                     env_spec=env.spec,
                                     hidden_sizes=[400, 300],
                                     action_merge_layer=0,
                                     hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=250)

        algo = TD3(env_spec=env.spec,
                   policy=policy,
                   policy_lr=1e-3,
                   qf_lr=1e-3,
                   qf=qf,
                   qf2=qf2,
                   replay_buffer=replay_buffer,
                   steps_per_epoch=20,
                   target_update_tau=0.005,
                   n_train_steps=50,
                   discount=0.99,
                   smooth_return=False,
                   min_buffer_size=int(1e4),
                   buffer_batch_size=100,
                   policy_weight_decay=0.001,
                   qf_weight_decay=0.001,
                   exploration_strategy=action_noise,
                   policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                   qf_optimizer=tf.compat.v1.train.AdamOptimizer)

        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=250)
        # Smoke-level performance bar for this short training run.
        assert last_avg_ret > 400
def test_dqn_cartpole_pickle(self):
    """Test that DQN survives a pickle round trip.

    Sets a known bias in the Q-network, pickles the algorithm, reloads
    it into a fresh graph, and checks the bias value is preserved.
    """
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        n_epochs = 10
        steps_per_epoch = 10
        sampler_batch_size = 500
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
        env = TfEnv(gym.make('CartPole-v0'))
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e4),
                                           time_horizon=1)
        qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
        policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf)
        # Epsilon decays over the full planned timestep budget.
        epilson_greedy_strategy = EpsilonGreedyStrategy(
            env_spec=env.spec,
            total_timesteps=num_timesteps,
            max_epsilon=1.0,
            min_epsilon=0.02,
            decay_ratio=0.1)
        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_strategy=epilson_greedy_strategy,
                   replay_buffer=replay_buffer,
                   qf_lr=1e-4,
                   discount=1.0,
                   min_buffer_size=int(1e3),
                   double_q=False,
                   n_train_steps=500,
                   grad_norm_clipping=5.0,
                   steps_per_epoch=steps_per_epoch,
                   target_network_update_freq=1,
                   buffer_batch_size=32)
        runner.setup(algo, env)
        with tf.compat.v1.variable_scope(
                'DiscreteMLPQFunction/MLPModel/mlp/hidden_0', reuse=True):
            bias = tf.compat.v1.get_variable('bias')
            # assign it to all one
            old_bias = tf.ones_like(bias).eval()
            bias.load(old_bias)
        h = pickle.dumps(algo)
        # Unpickle in a fresh graph/session so variables are rebuilt from
        # the pickled state, then compare against the known bias.
        with tf.compat.v1.Session(graph=tf.Graph()):
            pickle.loads(h)
            with tf.compat.v1.variable_scope(
                    'DiscreteMLPQFunction/MLPModel/mlp/hidden_0',
                    reuse=True):
                new_bias = tf.compat.v1.get_variable('bias')
                new_bias = new_bias.eval()
                assert np.array_equal(old_bias, new_bias)
        env.close()
def run_metarl(env, seed, log_dir):
    """Create metarl model and training.

    Replace the ppo with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path of the tabular (CSV) log file.
    """
    deterministic.set_seed(seed)
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=12,
                            inter_op_parallelism_threads=12)
    sess = tf.Session(config=config)
    with LocalTFRunner(snapshot_config, sess=sess, max_cpus=12) as runner:
        env = TfEnv(normalize(env))

        policy = CategoricalLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=488, batch_size=2048)

        dowel_logger.remove_all()

        return tabular_log_file