Example #1
    def test_trpo_lstm_cartpole(self):
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            env = TfEnv(normalize(gym.make('CartPole-v1')))

            policy = CategoricalLSTMPolicy(name='policy', env_spec=env.spec)

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TRPO(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        max_path_length=100,
                        discount=0.99,
                        max_kl_step=0.01,
                        optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                            base_eps=1e-5)))

            snapshotter.snapshot_dir = './'
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 80

            env.close()
Example #2
    def test_gaussian_policies(self, policy_cls):
        with LocalTFRunner(sess=self.sess) as runner:
            env = TfEnv(normalize(gym.make('Pendulum-v0')))

            policy = policy_cls(name='policy', env_spec=env.spec)

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TRPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=100,
                discount=0.99,
                max_kl_step=0.01,
                optimizer=ConjugateGradientOptimizer,
                optimizer_args=dict(
                    hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
            )

            runner.setup(algo, env)
            runner.train(n_epochs=1, batch_size=4000)
            env.close()
Example #3
    def test_trpo_gru_cartpole(self):
        deterministic.set_seed(2)
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            env = GarageEnv(normalize(gym.make('CartPole-v1')))

            policy = CategoricalGRUPolicy(name='policy', env_spec=env.spec)

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TRPO(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        max_path_length=100,
                        discount=0.99,
                        max_kl_step=0.01,
                        optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                            base_eps=1e-5)))

            runner.setup(algo, env, sampler_cls=LocalSampler)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 64

            env.close()
Example #4
def run_task(snapshot_config, *_):
    """Run task."""
    with LocalRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalLSTMPolicyWithModel(
            name='policy', env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            max_kl_step=0.01,
            optimizer=ConjugateGradientOptimizer,
            optimizer_args=dict(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000)
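# How run_task above is typically launched: in the garage release that provides
# LocalRunner and CategoricalLSTMPolicyWithModel, the snapshot-aware entry point
# is handed to the run_experiment launcher, which builds snapshot_config and
# calls it. A minimal sketch, assuming garage.experiment.run_experiment:
from garage.experiment import run_experiment

run_experiment(
    run_task,              # callable receiving (snapshot_config, *args)
    snapshot_mode='last',  # keep only the most recent snapshot
    seed=1,
)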
Example #5
    def test_categorical_policies(self, policy_cls):
        with TFTrainer(snapshot_config, sess=self.sess) as trainer:
            env = normalize(GymEnv('CartPole-v0', max_episode_length=100))

            policy = policy_cls(name='policy', env_spec=env.spec)

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TRPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                discount=0.99,
                max_kl_step=0.01,
                optimizer=ConjugateGradientOptimizer,
                optimizer_args=dict(hvp_approach=FiniteDifferenceHVP(
                    base_eps=1e-5)),
            )

            trainer.setup(algo, env, sampler_cls=LocalSampler)
            trainer.train(n_epochs=1, batch_size=4000)

            env.close()
Example #6
def trpo_cartpole_bullet(ctxt=None, seed=1):
    """Train TRPO with Pybullet's CartPoleBulletEnv environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = BulletEnv(
            gym.make('CartPoleBulletEnv-v1',
                     renders=False,
                     discrete_actions=True))

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99,
                    max_kl_step=0.01)

        trainer.setup(algo, env)
        trainer.train(n_epochs=100, batch_size=4000)
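# Launching trpo_cartpole_bullet: the (ctxt=None, seed=...) signature matches
# functions meant to be decorated with garage's wrap_experiment, which creates
# the ExperimentContext passed in as ctxt. A minimal sketch, assuming
# garage.wrap_experiment is available in this release:
from garage import wrap_experiment

launcher = wrap_experiment(trpo_cartpole_bullet)
launcher(seed=1)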
Example #7
def trpo_swimmer_ray_sampler(ctxt=None, seed=1):
    """Train TRPO on Swimmer-v2, collecting samples with RaySampler.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.


    """
    # Since this is an example, we are running ray in a reduced state.
    # One can comment this line out in order to run ray at full capacity
    ray.init(memory=52428800,
             object_store_memory=78643200,
             ignore_reinit_error=True,
             log_to_driver=False,
             include_dashboard=False)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        set_seed(seed)
        env = GymEnv('Swimmer-v2')

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    discount=0.99,
                    max_kl_step=0.01)

        trainer.setup(algo,
                      env,
                      sampler_cls=RaySampler,
                      sampler_args={'seed': seed})
        trainer.train(n_epochs=40, batch_size=4000)
Example #8
    def test_snapshot(self):
        with LocalRunner() as runner:
            env = TfEnv(env_name='CartPole-v1')

            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          hidden_sizes=(32, 32))

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TRPO(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        max_path_length=100,
                        discount=0.99,
                        max_kl_step=0.01)

            runner.setup(algo, env)
            runner.train(n_epochs=self.verifyItrs, batch_size=4000)

            env.close()

        # Read snapshot from self.log_dir
        # Test the presence and integrity of policy and env
        for i in range(0, self.verifyItrs):
            self.reset_tf()
            with LocalRunner():
                snapshot = joblib.load(
                    osp.join(self.log_dir.name, 'itr_{}.pkl'.format(i)))

                env = snapshot['env']
                algo = snapshot['algo']
                assert env
                assert algo
                assert algo.policy

                rollout(env, algo.policy, animated=False)
Example #9
def trpo_garage_tf(ctxt, env_id, seed):
    """Create garage Tensorflow TROI model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = GarageEnv(normalize(gym.make(env_id)))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['hidden_sizes'],
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=hyper_parameters['max_path_length'],
                    discount=hyper_parameters['discount'],
                    gae_lambda=hyper_parameters['gae_lambda'],
                    max_kl_step=hyper_parameters['max_kl'])

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['batch_size'])
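# trpo_garage_tf above reads from a module-level hyper_parameters dict that the
# snippet does not show. A hypothetical stand-in whose keys match the lookups
# above; the values are illustrative assumptions, not the benchmark's settings:
hyper_parameters = {
    'hidden_sizes': (32, 32),
    'max_path_length': 100,
    'discount': 0.99,
    'gae_lambda': 0.97,
    'max_kl': 0.01,
    'n_epochs': 999,
    'batch_size': 1024,
}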
Example #10
def run_task(snapshot_config, *_):
    with LocalRunner(snapshot_config=snapshot_config,
                     max_cpus=n_envs) as runner:
        env = TfEnv(env_name='CartPole-v1')

        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=max_path_length,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo=algo,
                     env=env,
                     sampler_cls=BatchSampler,
                     sampler_args={'n_envs': n_envs})

        runner.train(n_epochs=100, batch_size=4000, plot=False)
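# This run_task reads n_envs and max_path_length from module scope; neither is
# shown in the snippet. A hypothetical completion with illustrative values,
# launched the same way as the earlier run_task examples:
from garage.experiment import run_experiment

n_envs = 4
max_path_length = 100

run_experiment(run_task, snapshot_mode='last', seed=1)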
Example #11
def multi_env_trpo(ctxt=None, seed=1):
    """Train TRPO on two different PointEnv instances.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env1 = normalize(PointEnv(goal=(-1., 0.), max_episode_length=100))
        env2 = normalize(PointEnv(goal=(1., 0.), max_episode_length=100))
        env = MultiEnvWrapper([env1, env2])

        policy = GaussianMLPPolicy(env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        trainer.setup(algo, env)
        trainer.train(n_epochs=40, batch_size=2048, plot=False)
Example #12
from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.experiment import LocalRunner
from garage.tf.algos import TRPO
from garage.tf.policies import GaussianMLPPolicy
from garage.tf.envs import TfEnv
from sawyer.mujoco.reacher_env import SimpleReacherEnv

with LocalRunner() as runner:
    env = TfEnv(normalize(SimpleReacherEnv(goal_position=[0.3, 0.3, 0.3])))

    policy = GaussianMLPPolicy(name="policy",
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        max_path_length=100,
        discount=0.99,
        max_kl_step=0.01,
    )

    runner.setup(algo, env)
    runner.train(n_epochs=100, batch_size=4000, plot=True)
Example #13
import gym

from garage.baselines import LinearFeatureBaseline
from garage.contrib.alexbeloi.is_sampler import ISSampler
from garage.envs import normalize
from garage.tf.algos import TRPO
from garage.tf.policies import GaussianMLPPolicy

env = normalize(gym.make('InvertedPendulum-v2'))

policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

optimizer_args = dict(
    # debug_nan=True,
    # reg_coeff=0.1,
    # cg_iters=2
)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=100,
            n_itr=200,
            discount=0.99,
            step_size=0.01,
            sampler_cls=ISSampler,
            sampler_args=dict(n_backtrack=1),
            optimizer_args=optimizer_args)
algo.train()
Example #14
from garage.baselines import LinearFeatureBaseline
from garage.tf.algos import TRPO
import garage.tf.core.layers as L
from garage.tf.envs import TfEnv
from garage.tf.optimizers import ConjugateGradientOptimizer
from garage.tf.optimizers import FiniteDifferenceHvp
from garage.tf.policies import CategoricalLSTMPolicy

env = TfEnv(env_name="CartPole-v1")

policy = CategoricalLSTMPolicy(
    name="policy",
    env_spec=env.spec,
    lstm_layer_cls=L.TfBasicLSTMLayer,
    # gru_layer_cls=L.GRULayer,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=100,
    discount=0.99,
    max_kl_step=0.01,
    optimizer=ConjugateGradientOptimizer,
    optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))
algo.train()
Example #15
    def test_is_sampler(self):
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            env = TfEnv(normalize(gym.make('InvertedPendulum-v2')))
            policy = GaussianMLPPolicy(env_spec=env.spec,
                                       hidden_sizes=(32, 32))
            baseline = LinearFeatureBaseline(env_spec=env.spec)
            algo = TRPO(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        max_path_length=100,
                        discount=0.99,
                        max_kl_step=0.01)

            runner.setup(algo,
                         env,
                         sampler_cls=ISSampler,
                         sampler_args=dict(n_backtrack=1, init_is=1))
            runner._start_worker()

            paths = runner._sampler.obtain_samples(1)
            assert paths == [], 'Should return empty paths if no history'

            # test importance and live sampling get called alternatively
            with unittest.mock.patch.object(ISSampler,
                                            '_obtain_is_samples') as mocked:
                assert runner._sampler.obtain_samples(2, 20)
                mocked.assert_not_called()

                assert runner._sampler.obtain_samples(3)
                mocked.assert_called_once_with(3, None, True)

            # test importance sampling for first n_is_pretrain iterations
            with unittest.mock.patch.object(ISSampler,
                                            '_obtain_is_samples') as mocked:
                runner._sampler.n_is_pretrain = 5
                runner._sampler.n_backtrack = None
                runner._sampler.obtain_samples(4)

                mocked.assert_called_once_with(4, None, True)

            runner._sampler.obtain_samples(5)

            # test random draw important samples
            runner._sampler.randomize_draw = True
            assert runner._sampler.obtain_samples(6, 1)
            runner._sampler.randomize_draw = False

            runner._sampler.obtain_samples(7, 30)

            # test ess_threshold use
            runner._sampler.ess_threshold = 500
            paths = runner._sampler.obtain_samples(8, 30)
            assert paths == [], (
                'Should return empty paths when ess_threshold is large')
            runner._sampler.ess_threshold = 0

            # test random sample selection when len(paths) > batch size
            runner._sampler.n_is_pretrain = 15
            runner._sampler.obtain_samples(9, 10)
            runner._sampler.obtain_samples(10, 1)

            runner._shutdown_worker()
Example #16
from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.envs.mujoco import SwimmerEnv
from garage.tf.algos import TRPO
from garage.tf.envs import TfEnv
from garage.tf.policies import GaussianMLPPolicy

env = TfEnv(normalize(SwimmerEnv()))

policy = GaussianMLPPolicy(name="policy",
                           env_spec=env.spec,
                           hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=5000,
            max_path_length=500,
            n_itr=200,
            discount=0.99,
            step_size=0.01,
            plot=False)
algo.train()
Example #17
from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.misc.instrument import run_experiment

from garage.tf.algos import TRPO
from garage.tf.policies import GaussianMLPPolicy
from garage.tf.envs import TfEnv

from sandbox.embed2learn.envs.mujoco import PR2ArmEnv

env = TfEnv(normalize(PR2ArmEnv()))

policy = GaussianMLPPolicy(
    name="policy",
    env_spec=env.spec,
    hidden_sizes=(32, 32),
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=100,
    discount=0.99,
    step_size=0.01,
    plot=True,
)
algo.train()
Example #18
def run_task(snapshot_config, *_,
             env_params,
             algo='TRPO',
             algo_params={},
             epochs=1000,
             batch_size=4000,
             policy_hidden_sizes=(32, 32),
             embed_state=False,
             model_dir='../models/reacher_limited/train_6',
             augment_embedded_state=False):
    """Run task."""

    embed_config_file = os.path.join(model_dir, 'config.yaml')
    ckpt_path = os.path.join(model_dir, 'model_latest.ckpt')

    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        if embed_state:
            reacher_env = ReacherEmbeddedEnv(
                embed_config_file,
                ckpt_path,
                augment_embedded_state=augment_embedded_state,
                **env_params)
        else:
            reacher_env = ReacherEnv(**env_params)
        env = GarageEnv(reacher_env)

        policy = GaussianMLPPolicy(
            name='policy',
            env_spec=env.spec,
            hidden_sizes=policy_hidden_sizes)
            # hidden_sizes=(32, 32))

        #************** TRPO ***************
        if algo == 'TRPO':
            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TRPO(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        **algo_params)
                        # max_path_length=100,
                        # discount=0.99,
                        # max_kl_step=0.01)

        #**************** PPO *********************
        elif algo == 'PPO':
            baseline = GaussianMLPBaseline(
                env_spec=env.spec,
                regressor_args=dict(
                    hidden_sizes=(32, 32),
                    use_trust_region=True,
                ),
            )

            # NOTE: make sure when setting entropy_method to 'max', set
            # center_adv to False and turn off policy gradient. See
            # tf.algos.NPO for detailed documentation.
            algo = PPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                # max_path_length=100,
                # discount=0.99,
                gae_lambda=0.95,
                lr_clip_range=0.2,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,),
                stop_entropy_gradient=True,
                entropy_method='max',
                policy_ent_coeff=0.02,
                center_adv=False,
                **algo_params)

        #**************** Other? **********************
        else:
            print("ERROR: requested unrecognized algorithm: ", algo)
            raise NotImplementedError

        runner.setup(algo, env)
        runner.train(n_epochs=epochs, batch_size=batch_size)
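# run_task above takes experiment-specific keyword-only arguments (env_params,
# algo, algo_params, ...). One hypothetical way to bind them before handing the
# callable to garage's run_experiment launcher; every value below is an
# illustrative assumption, not the original experiment's configuration:
import functools

from garage.experiment import run_experiment

task = functools.partial(
    run_task,
    env_params={},  # hypothetical: use the ReacherEnv defaults
    algo='TRPO',
    algo_params={'max_path_length': 100,
                 'discount': 0.99,
                 'max_kl_step': 0.01},
    epochs=1000,
    batch_size=4000,
)

run_experiment(task, snapshot_mode='last', seed=1)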
Example #19
import gym

from garage.baselines import LinearFeatureBaseline
from garage.experiment import run_experiment
from garage.tf.algos import TRPO
from garage.tf.envs import TfEnv
from garage.tf.policies import CategoricalMLPPolicy

# Need to wrap in a tf environment and force_reset to true
# see https://github.com/openai/rllab/issues/87#issuecomment-282519288
env = TfEnv(gym.make("CartPole-v0"))

policy = CategoricalMLPPolicy(name="policy",
                              env_spec=env.spec,
                              hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=200,
    n_itr=120,
    discount=0.99,
    max_kl_step=0.01,
)

run_experiment(algo.train(), n_parallel=1, snapshot_mode="last", seed=1)
Example #20
from garage.baselines import LinearFeatureBaseline
from garage.misc.instrument import run_experiment
from garage.envs.mujoco import SwimmerEnv
from garage.envs import normalize
from garage.tf.algos import TRPO
from garage.tf.envs import TfEnv
from garage.tf.policies import GaussianMLPPolicy

# Need to wrap in a tf environment and force_reset to true
# see https://github.com/openai/garage/issues/87#issuecomment-282519288
env = TfEnv(normalize(SwimmerEnv()))

policy = GaussianMLPPolicy(name="policy",
                           env_spec=env.spec,
                           hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=100,
            max_path_length=500,
            n_itr=100,
            discount=0.995,
            step_size=0.001,
            plot=False)

run_experiment(algo.train(), n_parallel=8, snapshot_mode="last", seed=1)