def vpgis_inverted_pendulum(ctxt=None, seed=1):
    """Train VPG with importance sampling on the InvertedPendulum-v2 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env = MetaRLEnv(normalize(gym.make('InvertedPendulum-v2')))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            max_kl_step=0.01,
        )

        runner.setup(algo,
                     env,
                     sampler_cls=ISSampler,
                     sampler_args=dict(n_backtrack=1))
        runner.train(n_epochs=40, batch_size=4000)
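# The example functions in this collection presume imports along the
# following lines. This is a sketch that assumes the metarl package mirrors
# garage's module layout; treat the exact paths as assumptions, not
# confirmed by the source.
import gym

from metarl.envs import MetaRLEnv, normalize
from metarl.experiment import LocalTFRunner
from metarl.experiment.deterministic import set_seed
from metarl.np.baselines import LinearFeatureBaseline
from metarl.sampler import ISSampler
from metarl.tf.algos import VPG
from metarl.tf.policies import CategoricalMLPPolicy, GaussianMLPPolicy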
def test_vpg_cartpole(self):
    """Test VPG with CartPole-v1 environment."""
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   optimizer_args=dict(
                       tf_optimizer_args=dict(learning_rate=0.01, )))

        runner.setup(algo, env)

        last_avg_ret = runner.train(n_epochs=10, batch_size=10000)
        assert last_avg_ret > 90

        env.close()
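# The test functions additionally reference these names. Import paths again
# sketch garage's layout and are assumptions, as is the snapshot_config
# fixture, which in garage-style test suites is a module-level
# SnapshotConfig namedtuple roughly like this:
from metarl.experiment import SnapshotConfig
from metarl.plotter import Plotter
from metarl.sampler import RaySampler
from metarl.tf.envs import TfEnv
from metarl.tf.samplers import BatchSampler

snapshot_config = SnapshotConfig(snapshot_dir='/tmp/metarl-test',
                                 snapshot_mode='last',
                                 snapshot_gap=1)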
def vpg_cartpole(ctxt=None, seed=1):
    """Train VPG with CartPole-v1 environment.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = MetaRLEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01, ))

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=10000)
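# A minimal launcher sketch for the ctxt-style functions above. It assumes
# metarl provides the same wrap_experiment decorator as garage, which
# injects an ExperimentContext as ctxt when the wrapped function is called;
# the import path is an assumption.
from metarl import wrap_experiment

vpg_cartpole = wrap_experiment(vpg_cartpole)
vpg_cartpole(seed=1)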
def run_task(snapshot_config, *_):
    """Run the job.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): Configuration
            values for snapshotting.
        *_ (object): Hyperparameters (unused).

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(normalize(gym.make('InvertedPendulum-v2')))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            max_kl_step=0.01,
        )

        runner.setup(algo,
                     env,
                     sampler_cls=ISSampler,
                     sampler_args=dict(n_backtrack=1))
        runner.train(n_epochs=40, batch_size=4000)
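# A hypothetical launcher for the run_task(snapshot_config, *_) style,
# assuming metarl exposes garage's run_experiment helper, which builds a
# SnapshotConfig from its arguments and passes it as the first parameter.
from metarl.experiment import run_experiment

run_experiment(run_task, snapshot_mode='last', seed=1)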
def fixture_exp(snapshot_config, sess):
    """Dummy fixture experiment function.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        sess (tf.Session): An optional TensorFlow session.
            A new session will be created immediately if not provided.

    Returns:
        np.ndarray: Values of the parameters evaluated in the current
            session.

    """
    with LocalTFRunner(snapshot_config=snapshot_config, sess=sess) as runner:
        env = MetaRLEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01, ))

        runner.setup(algo, env)
        runner.train(n_epochs=5, batch_size=100)

        return policy.get_param_values()
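# Example use of fixture_exp, as a sketch: run it in its own graph and
# session so the returned parameter values can later be compared against a
# resumed run. tf.compat.v1 is assumed here to match the TF1-style API used
# above; snapshot_config is the same fixture the tests reference.
import tensorflow as tf

with tf.Graph().as_default():
    with tf.compat.v1.Session() as sess:
        initial_params = fixture_exp(snapshot_config, sess)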
def test_train(self):
    """Test that LocalTFRunner can set up and train VPG on CartPole-v1."""
    with LocalTFRunner(snapshot_config) as runner:
        env = MetaRLEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   optimizer_args=dict(learning_rate=0.01, ))

        runner.setup(algo, env)
        runner.train(n_epochs=1, batch_size=100)
def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            optimizer_args=dict(tf_optimizer_args=dict(learning_rate=0.01, )))

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=10000)
def test_make_sampler_ray_sampler(self):
    """Test that setup(sampler_cls=RaySampler) constructs a RaySampler."""
    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   optimizer_args=dict(
                       tf_optimizer_args=dict(learning_rate=0.01, )))

        runner.setup(algo, env, sampler_cls=RaySampler)
        assert isinstance(runner._sampler, RaySampler)
        runner.train(n_epochs=1, batch_size=10)
def fixture_exp(snapshot_config, sess):
    """Dummy fixture experiment that trains VPG and returns policy parameters."""
    with LocalTFRunner(snapshot_config=snapshot_config, sess=sess) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            optimizer_args=dict(tf_optimizer_args=dict(learning_rate=0.01, )))

        runner.setup(algo, env)
        runner.train(n_epochs=5, batch_size=100)

        return policy.get_param_values()
def test_set_plot(self):
    """Test that train(plot=True) attaches a Plotter to the runner."""
    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(8, 8))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   optimizer_args=dict(
                       tf_optimizer_args=dict(learning_rate=0.01, )))

        runner.setup(algo, env)
        runner.train(n_epochs=1, batch_size=100, plot=True)

        assert isinstance(runner._plotter, Plotter), (
            'self.plotter in LocalTFRunner should be set to Plotter.')
def test_tf_batch_sampler(self):
    """Test that BatchSampler can sample in parallel up to max_cpus."""
    max_cpus = 8
    with LocalTFRunner(snapshot_config, max_cpus=max_cpus) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = VPG(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=1,
                   discount=0.99)

        runner.setup(algo,
                     env,
                     sampler_cls=BatchSampler,
                     sampler_args={'n_envs': max_cpus})

        try:
            runner.initialize_tf_vars()
        except BaseException:
            raise AssertionError(
                'LocalRunner should be able to initialize tf variables.')

        runner._start_worker()

        paths = runner._sampler.obtain_samples(0,
                                               batch_size=8,
                                               whole_paths=True)
        assert len(paths) >= max_cpus, (
            'BatchSampler should sample at least max_cpus={} '
            'trajectories'.format(max_cpus))