Example No. 1
 def setUp(self):
     self.env = TfEnv(DummyDiscretePixelEnv(random=False))
     self.env_g = TfEnv(Grayscale(DummyDiscretePixelEnv(random=False)))
Example No. 2
def run_garage(env, seed, log_dir):
    '''
    Create garage model and training.

    Replace PPO with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path to the output csv file (progress.csv).
    '''
    deterministic.set_seed(seed)

    with LocalRunner() as runner:
        env = TfEnv(normalize(env))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
                optimizer=FirstOrderOptimizer,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                    tf_optimizer_args=dict(learning_rate=1e-3),
                ),
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=488, batch_size=2048)

        dowel_logger.remove_all()

        return tabular_log_file
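A minimal sketch of how run_garage might be invoked, assuming a continuous-control gym environment; the environment id and log path are illustrative, not taken from the original benchmark.

import gym

env = gym.make('Pendulum-v0')  # hypothetical task with a Box action space
csv_path = run_garage(env, seed=1, log_dir='/tmp/ppo_trial_1')  # illustrative path
print('progress written to', csv_path)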
Example No. 3
import gym

from garage.baselines import LinearFeatureBaseline
from garage.misc.instrument import run_experiment
from garage.tf.algos import TRPO
from garage.tf.envs import TfEnv
from garage.tf.policies import CategoricalMLPPolicy

# Need to wrap in a tf environment and force_reset to true
# see https://github.com/openai/rllab/issues/87#issuecomment-282519288
env = TfEnv(gym.make("CartPole-v0"))

policy = CategoricalMLPPolicy(name="policy",
                              env_spec=env.spec,
                              hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=200,
    n_itr=120,
    discount=0.99,
    step_size=0.01,
)

run_experiment(algo.train(), n_parallel=1, snapshot_mode="last", seed=1)
Example No. 4
 def test_invalid_env(self):
     env = TfEnv(DummyBoxEnv())
     with pytest.raises(ValueError):
         CategoricalGRUPolicy(env_spec=env.spec)
Example No. 5
def run_garage(env, seed, log_dir):
    """
    Create garage model and training.

    Replace the td3 with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trail.
    :param log_dir: Log dir path.
    :return:
    """
    deterministic.set_seed(seed)

    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(env))
        # Set up params for TD3
        exploration_noise = GaussianStrategy(env.spec,
                                             max_sigma=params['sigma'],
                                             min_sigma=params['sigma'])

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(name='ContinuousMLPQFunction',
                                    env_spec=env.spec,
                                    hidden_sizes=params['qf_hidden_sizes'],
                                    action_merge_layer=0,
                                    hidden_nonlinearity=tf.nn.relu)

        qf2 = ContinuousMLPQFunction(name='ContinuousMLPQFunction2',
                                     env_spec=env.spec,
                                     hidden_sizes=params['qf_hidden_sizes'],
                                     action_merge_layer=0,
                                     hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'])

        td3 = TD3(env.spec,
                  policy=policy,
                  qf=qf,
                  qf2=qf2,
                  replay_buffer=replay_buffer,
                  steps_per_epoch=params['steps_per_epoch'],
                  policy_lr=params['policy_lr'],
                  qf_lr=params['qf_lr'],
                  target_update_tau=params['tau'],
                  n_train_steps=params['n_train_steps'],
                  discount=params['discount'],
                  smooth_return=params['smooth_return'],
                  min_buffer_size=params['min_buffer_size'],
                  buffer_batch_size=params['buffer_batch_size'],
                  exploration_strategy=exploration_noise,
                  policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                  qf_optimizer=tf.compat.v1.train.AdamOptimizer)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(td3, env)
        runner.train(n_epochs=params['n_epochs'],
                     batch_size=params['n_rollout_steps'])

        dowel_logger.remove_all()

        return tabular_log_file
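The TD3 snippet reads its hyperparameters from a module-level params dict that is not shown here. A plausible sketch with placeholder values (the real benchmark configuration may differ):

params = {
    'policy_hidden_sizes': (400, 300),  # actor network sizes (illustrative)
    'qf_hidden_sizes': (400, 300),      # critic network sizes (illustrative)
    'sigma': 0.1,                       # exploration noise scale
    'policy_lr': 1e-3,
    'qf_lr': 1e-3,
    'tau': 0.005,                       # target network soft-update rate
    'n_train_steps': 50,
    'discount': 0.99,
    'smooth_return': False,
    'replay_buffer_size': int(1e6),
    'n_rollout_steps': 250,             # also passed as batch_size to runner.train
    'steps_per_epoch': 20,
    'min_buffer_size': int(1e4),
    'buffer_batch_size': 100,
    'n_epochs': 500,
}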
Example No. 6
def run_garage(env, seed, log_dir):
    '''
    Create garage model and training.

    Replace DDPG with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path to the output csv file (progress.csv).
    '''
    deterministic.set_seed(seed)
    env.reset()

    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(env))

        action_noise = OUStrategy(env.spec, sigma=params['sigma'])

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=params['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'],
            replay_k=0.4,
            reward_fun=env.compute_reward,
        )

        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            policy_lr=params['policy_lr'],
            qf_lr=params['qf_lr'],
            target_update_tau=params['tau'],
            n_train_steps=params['n_train_steps'],
            discount=params['discount'],
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        logger.add_output(dowel.StdOutput())
        logger.add_output(dowel.CsvOutput(tabular_log_file))
        logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=params['n_epochs'],
                     n_epoch_cycles=params['n_epoch_cycles'],
                     batch_size=params['n_rollout_steps'])

        logger.remove_all()

        return tabular_log_file
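This DDPG+HER variant assumes a goal-conditioned environment that exposes compute_reward (e.g. the gym robotics tasks) and a params dict similar to the TD3 sketch above, plus an 'n_epoch_cycles' entry. A hedged usage illustration (environment id and path are assumptions):

import gym

env = gym.make('FetchReach-v1')  # hypothetical goal-conditioned task
run_garage(env, seed=1, log_dir='/tmp/her_ddpg_trial')  # illustrative path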
Example No. 7
 def test_state_info_specs_with_state_include_action(self):
     env = TfEnv(DummyDiscreteEnv(obs_dim=(10, ), action_dim=4))
     policy = CategoricalGRUPolicy(env_spec=env.spec,
                                   state_include_action=True)
     assert policy.state_info_specs == [('prev_action', (4, ))]
Example No. 8
 def test_all_gym_envs(self, spec):
     if spec._env_name.startswith('Defender'):
         pytest.skip(
             'Defender-* envs bundled in atari-py 0.2.x don\'t load')
     env = TfEnv(spec.make())
     step_env_with_gym_quirks(env, spec)
Example No. 9
from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.envs.box2d import CartpoleEnv
from garage.envs.mujoco import SwimmerEnv
from garage.tf.algos import VPG
from garage.tf.envs import TfEnv
from garage.tf.policies import GaussianMLPPolicy
from garage.misc.instrument import run_experiment

env = TfEnv(normalize(SwimmerEnv()))

policy = GaussianMLPPolicy(name="policy",
                           env_spec=env.spec,
                           hidden_sizes=(32, 32))

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = VPG(env=env,
           policy=policy,
           baseline=baseline,
           batch_size=5000,
           max_path_length=500,
           n_itr=40,
           discount=0.995,
           optimizer_args=dict(tf_optimizer_args=dict(learning_rate=1e-4, )))

run_experiment(algo.train(),
               n_parallel=1,
               snapshot_mode="last",
               seed=1,
               use_gpu=True)
Example No. 10
 def setUp(self):
     self.env = TfEnv(DummyDiscreteEnv(random=False))
     self.env_r = TfEnv(
         RepeatAction(DummyDiscreteEnv(random=False), n_frame_to_repeat=4))
Example No. 11
 def test_is_pickleable(self):
     env = TfEnv(env_name='CartPole-v1')
     round_trip = pickle.loads(pickle.dumps(env))
     assert round_trip.env.spec == env.env.spec
Example No. 12
    def test_no_reset(self):
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            # This tests whether the off-policy sampler respects batch_size
            # when no_reset is set to True
            env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
            action_noise = OUStrategy(env.spec, sigma=0.2)
            policy = ContinuousMLPPolicy(env_spec=env.spec,
                                         hidden_sizes=[64, 64],
                                         hidden_nonlinearity=tf.nn.relu,
                                         output_nonlinearity=tf.nn.tanh)
            qf = ContinuousMLPQFunction(env_spec=env.spec,
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=tf.nn.relu)
            replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                               size_in_transitions=int(1e6),
                                               time_horizon=100)
            algo = DDPG(
                env_spec=env.spec,
                policy=policy,
                policy_lr=1e-4,
                qf_lr=1e-3,
                qf=qf,
                replay_buffer=replay_buffer,
                target_update_tau=1e-2,
                n_train_steps=50,
                discount=0.9,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
            )

            sampler = OffPolicyVectorizedSampler(algo, env, 1, no_reset=True)
            sampler.start_worker()

            runner.initialize_tf_vars()

            paths1 = sampler.obtain_samples(0, 5)
            paths2 = sampler.obtain_samples(0, 5)

            len1 = sum([len(path['rewards']) for path in paths1])
            len2 = sum([len(path['rewards']) for path in paths2])

            assert len1 == 5 and len2 == 5, 'Sampler should respect batch_size'
            # yapf: disable
            # When done is False in 1st sampling, the next sampling should be
            # stacked with the last batch in 1st sampling
            case1 = (len(paths1[-1]['rewards']) + len(paths2[0]['rewards'])
                     == paths2[0]['running_length'])
            # When done is True in 1st sampling, the next sampling should be
            # separated
            case2 = len(paths2[0]['rewards']) == paths2[0]['running_length']
            done = paths1[-1]['dones'][-1]
            assert (
                (not done and case1) or (done and case2)
            ), 'Running length should be the length of full path'

            # yapf: enable
            case1 = np.isclose(
                paths1[-1]['rewards'].sum() + paths2[0]['rewards'].sum(),
                paths2[0]['undiscounted_return'])
            case2 = np.isclose(paths2[0]['rewards'].sum(),
                               paths2[0]['undiscounted_return'])
            assert (
                (not done and case1) or (done and case2)
            ), 'Undiscounted_return should be the sum of rewards of full path'
Example No. 13
from garage.baselines import LinearFeatureBaseline
from garage.envs import normalize
from garage.envs.box2d import CartpoleEnv
from garage.tf.algos import TRPO
import garage.tf.core.layers as L
from garage.tf.envs import TfEnv
from garage.tf.optimizers import ConjugateGradientOptimizer
from garage.tf.optimizers import FiniteDifferenceHvp
from garage.tf.policies import GaussianLSTMPolicy

env = TfEnv(normalize(CartpoleEnv()))

policy = GaussianLSTMPolicy(
    name="policy",
    env_spec=env.spec,
    lstm_layer_cls=L.TfBasicLSTMLayer,
    # gru_layer_cls=L.GRULayer,
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=10,
    discount=0.99,
    step_size=0.01,
    optimizer=ConjugateGradientOptimizer,
    # assumed completion: FiniteDifferenceHvp is imported above, suggesting it
    # as the intended HVP approach for the conjugate gradient optimizer
    optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
)
Example No. 14
def run_garage_tf(env, seed, log_dir):
    """Create garage TensorFlow PPO model and training.

    Args:
        env (gym.Env): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(env))

        policy = TF_GMP(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = TF_GMB(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(32, 32),
                use_trust_region=False,
                optimizer=FirstOrderOptimizer,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                    tf_optimizer_args=dict(learning_rate=3e-4),
                ),
            ),
        )

        algo = TF_PPO(env_spec=env.spec,
                      policy=policy,
                      baseline=baseline,
                      max_path_length=hyper_parameters['max_path_length'],
                      discount=0.99,
                      gae_lambda=0.95,
                      center_adv=True,
                      lr_clip_range=0.2,
                      optimizer_args=dict(
                          batch_size=32,
                          max_epochs=10,
                          tf_optimizer_args=dict(learning_rate=3e-4),
                          verbose=True))

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['batch_size'])

        dowel_logger.remove_all()

        return tabular_log_file
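run_garage_tf relies on a hyper_parameters dict defined elsewhere in the benchmark script; a sketch of the keys it needs, with placeholder values:

hyper_parameters = {
    'max_path_length': 100,  # illustrative value
    'n_epochs': 500,
    'batch_size': 2048,      # environment steps collected per epoch
}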
Example No. 15
 def setUp(self):
     super().setUp()
     self.data = np.ones((2, 1))
     self.env = TfEnv(DummyDiscreteEnv())
     self.qf = DiscreteMLPQFunction(self.env.spec)
Example No. 16
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.
    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters
    """

    th = 1.8
    g_max = 0.1
    #delta = 1e-7
    if args.env == 'CartPole':
        #CartPole

        env = TfEnv(normalize(CartPoleEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 5000
        max_length = 100
        n_timestep = 5e5
        n_counts = 5
        name = 'CartPole'
        grad_factor = 5
        th = 1.2
        #batchsize: 1
        # lr = 0.1
        # w = 2
        # c = 50

        #batchsize: 50
        lr = 0.75
        c = 3
        w = 2

        discount = 0.995
        path = './init/CartPole_policy.pth'

    if args.env == 'Walker':
        #Walker_2d
        env = TfEnv(normalize(Walker2dEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 50000
        max_length = 500

        n_timestep = 1e7
        n_counts = 5
        lr = 0.75
        w = 2
        c = 12
        grad_factor = 6

        discount = 0.999

        name = 'Walk'
        path = './init/Walk_policy.pth'

    if args.env == 'HalfCheetah':
        env = TfEnv(normalize(HalfCheetahEnv()))
        runner = LocalRunner(snapshot_config)

        batch_size = 50000
        max_length = 500

        n_timestep = 1e7
        n_counts = 5
        lr = 0.6
        w = 1
        c = 4
        grad_factor = 5
        th = 1.2
        g_max = 0.06

        discount = 0.999

        name = 'HalfCheetah'
        path = './init/HalfCheetah_policy.pth'

    if args.env == 'Hopper':
        #Hopper
        env = TfEnv(normalize(HopperEnv()))
        runner = LocalRunner(snapshot_config)

        batch_size = 50000
        max_length = 1000
        th = 1.5
        n_timestep = 1e7
        n_counts = 5
        lr = 0.75
        w = 1
        c = 3
        grad_factor = 6
        g_max = 0.15
        discount = 0.999

        name = 'Hopper'
        path = './init/Hopper_policy.pth'

    for i in range(n_counts):
        # print(env.spec)
        if args.env == 'CartPole':
            policy = CategoricalMLPPolicy(env.spec,
                                          hidden_sizes=[8, 8],
                                          hidden_nonlinearity=torch.tanh,
                                          output_nonlinearity=None)
        else:
            policy = GaussianMLPPolicy(env.spec,
                                       hidden_sizes=[64, 64],
                                       hidden_nonlinearity=torch.tanh,
                                       output_nonlinearity=None)


        policy.load_state_dict(torch.load(path))
        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = MBPG_HA(env_spec=env.spec,
                       env=env,
                       env_name=name,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=max_length,
                       discount=discount,
                       grad_factor=grad_factor,
                       policy_lr=lr,
                       c=c,
                       w=w,
                       th=th,
                       g_max=g_max,
                       n_timestep=n_timestep,
                       batch_size=batch_size,
                       center_adv=True,
                       # delta=delta,
                       # decay_learning_rate=d_lr,
                       )

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=batch_size)
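run_task reads a module-level args object (with an env field) and loads pretrained weights from ./init/. A hedged sketch of how the script might be driven, assuming argparse and garage's run_experiment wrapper (both assumptions, not shown in the snippet):

import argparse

from garage.experiment import run_experiment

parser = argparse.ArgumentParser()
parser.add_argument('--env', type=str, default='CartPole',
                    choices=['CartPole', 'Walker', 'HalfCheetah', 'Hopper'])
args = parser.parse_args()

run_experiment(run_task, snapshot_mode='last', seed=1)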
Example No. 17
 def setup_method(self):
     super().setup_method()
     self.env = TfEnv(DummyDiscretePixelEnv())
     self.obs = self.env.reset()
Example No. 18
def td3_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow TD3 model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = TfEnv(normalize(gym.make(env_id)))

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        exploration_policy = AddGaussianNoise(
            env.spec,
            policy,
            max_sigma=hyper_parameters['sigma'],
            min_sigma=hyper_parameters['sigma'])

        qf = ContinuousMLPQFunction(
            name='ContinuousMLPQFunction',
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['qf_hidden_sizes'],
            action_merge_layer=0,
            hidden_nonlinearity=tf.nn.relu)

        qf2 = ContinuousMLPQFunction(
            name='ContinuousMLPQFunction2',
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['qf_hidden_sizes'],
            action_merge_layer=0,
            hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=hyper_parameters['replay_buffer_size'],
            time_horizon=hyper_parameters['n_rollout_steps'])

        td3 = TD3(env.spec,
                  policy=policy,
                  qf=qf,
                  qf2=qf2,
                  replay_buffer=replay_buffer,
                  steps_per_epoch=hyper_parameters['steps_per_epoch'],
                  policy_lr=hyper_parameters['policy_lr'],
                  qf_lr=hyper_parameters['qf_lr'],
                  target_update_tau=hyper_parameters['tau'],
                  n_train_steps=hyper_parameters['n_train_steps'],
                  discount=hyper_parameters['discount'],
                  smooth_return=hyper_parameters['smooth_return'],
                  min_buffer_size=hyper_parameters['min_buffer_size'],
                  buffer_batch_size=hyper_parameters['buffer_batch_size'],
                  exploration_policy=exploration_policy,
                  policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                  qf_optimizer=tf.compat.v1.train.AdamOptimizer)

        runner.setup(td3, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['n_rollout_steps'])
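td3_garage_tf takes an experiment context as its first argument, so it would typically be launched through garage's wrap_experiment decorator; the wrapper function, task id, and seed below are assumptions for illustration.

from garage import wrap_experiment


@wrap_experiment
def td3_half_cheetah(ctxt=None):
    # delegate to the benchmark function above with an illustrative task and seed
    td3_garage_tf(ctxt, env_id='HalfCheetah-v2', seed=1)


td3_half_cheetah()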
Example No. 19
 def test_state_info_specs(self):
     env = TfEnv(DummyDiscreteEnv(obs_dim=(10, ), action_dim=4))
     policy = CategoricalGRUPolicy(env_spec=env.spec,
                                   state_include_action=False)
     assert policy.state_info_specs == []
Example No. 20
 def test_state_info_specs(self):
     env = TfEnv(DummyBoxEnv(obs_dim=(4, ), action_dim=(4, )))
     policy = GaussianGRUPolicy(env_spec=env.spec,
                                state_include_action=False)
     assert policy.state_info_specs == []
Example No. 21
 def test_clone(self):
     env = TfEnv(DummyDiscreteEnv(obs_dim=(10, ), action_dim=4))
     policy = CategoricalGRUPolicy(env_spec=env.spec)
     policy_clone = policy.clone('CategoricalGRUPolicyClone')
     assert policy.env_spec == policy_clone.env_spec
Example No. 22
 def test_state_info_specs_with_state_include_action(self):
     env = TfEnv(DummyBoxEnv(obs_dim=(4, ), action_dim=(4, )))
     policy = GaussianGRUPolicy(env_spec=env.spec,
                                state_include_action=True)
     assert policy.state_info_specs == [('prev_action', (4, ))]
Example No. 23
 def test_invalid_env(self):
     env = TfEnv(DummyDiscreteEnv())
     with pytest.raises(ValueError):
         GaussianLSTMPolicyWithModel(env_spec=env.spec)
Example No. 24
 def test_clone(self):
     env = TfEnv(DummyBoxEnv(obs_dim=(4, ), action_dim=(4, )))
     policy = GaussianGRUPolicy(env_spec=env.spec)
     policy_clone = policy.clone('GaussianGRUPolicyClone')
     assert policy_clone.env_spec == policy.env_spec
Example No. 25
    def setup_method(self):
        super().setup_method()
        env = TfEnv(DummyDiscreteEnv(obs_dim=(1, ), action_dim=1))
        self.default_initializer = tf.constant_initializer(1)
        self.default_hidden_nonlinearity = tf.nn.tanh
        self.default_recurrent_nonlinearity = tf.nn.sigmoid
        self.default_output_nonlinearity = None
        self.time_step = 1

        self.policy1 = CategoricalLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=4,
            hidden_nonlinearity=self.default_hidden_nonlinearity,
            recurrent_nonlinearity=self.default_recurrent_nonlinearity,
            recurrent_w_x_init=self.default_initializer,
            recurrent_w_h_init=self.default_initializer,
            output_nonlinearity=self.default_output_nonlinearity,
            output_w_init=self.default_initializer,
            state_include_action=True,
            name='P1')
        self.policy2 = CategoricalLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=4,
            hidden_nonlinearity=self.default_hidden_nonlinearity,
            recurrent_nonlinearity=self.default_recurrent_nonlinearity,
            recurrent_w_x_init=self.default_initializer,
            recurrent_w_h_init=self.default_initializer,
            output_nonlinearity=self.default_output_nonlinearity,
            output_w_init=tf.constant_initializer(2),
            state_include_action=True,
            name='P2')

        self.sess.run(tf.global_variables_initializer())

        self.policy3 = CategoricalLSTMPolicyWithModel(
            env_spec=env.spec,
            hidden_dim=4,
            hidden_nonlinearity=self.default_hidden_nonlinearity,
            hidden_w_init=self.default_initializer,
            recurrent_nonlinearity=self.default_recurrent_nonlinearity,
            recurrent_w_init=self.default_initializer,
            output_nonlinearity=self.default_output_nonlinearity,
            output_w_init=self.default_initializer,
            state_include_action=True,
            name='P3')
        self.policy4 = CategoricalLSTMPolicyWithModel(
            env_spec=env.spec,
            hidden_dim=4,
            hidden_nonlinearity=self.default_hidden_nonlinearity,
            hidden_w_init=self.default_initializer,
            recurrent_nonlinearity=self.default_recurrent_nonlinearity,
            recurrent_w_init=self.default_initializer,
            output_nonlinearity=self.default_output_nonlinearity,
            output_w_init=tf.constant_initializer(2),
            state_include_action=True,
            name='P4')

        self.policy1.reset()
        self.policy2.reset()
        self.policy3.reset()
        self.policy4.reset()
        self.obs = [env.reset()]
        self.obs = np.concatenate([self.obs for _ in range(self.time_step)],
                                  axis=0)

        self.obs_ph = tf.placeholder(
            tf.float32, shape=(None, None, env.observation_space.flat_dim))
        self.action_ph = tf.placeholder(
            tf.float32, shape=(None, None, env.action_space.flat_dim))

        self.dist1_sym = self.policy1.dist_info_sym(
            obs_var=self.obs_ph,
            state_info_vars={'prev_action': np.zeros((2, self.time_step, 1))},
            name='p1_sym')
        self.dist2_sym = self.policy2.dist_info_sym(
            obs_var=self.obs_ph,
            state_info_vars={'prev_action': np.zeros((2, self.time_step, 1))},
            name='p2_sym')
        self.dist3_sym = self.policy3.dist_info_sym(
            obs_var=self.obs_ph,
            state_info_vars={'prev_action': np.zeros((2, self.time_step, 1))},
            name='p3_sym')
        self.dist4_sym = self.policy4.dist_info_sym(
            obs_var=self.obs_ph,
            state_info_vars={'prev_action': np.zeros((2, self.time_step, 1))},
            name='p4_sym')
Example No. 26
def run_garage(env, seed, log_dir):
    '''
    Create garage model and training.

    Replace PPO with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path to the output csv file (progress.csv).
    '''
    deterministic.set_seed(seed)
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=12,
                            inter_op_parallelism_threads=12)
    sess = tf.Session(config=config)

    with LocalTFRunner(snapshot_config, sess=sess, max_cpus=12) as runner:
        env = TfEnv(normalize(env))

        policy = CategoricalCNNPolicy(
            env_spec=env.spec,
            conv_filters=params['conv_filters'],
            conv_filter_sizes=params['conv_filter_sizes'],
            conv_strides=params['conv_strides'],
            conv_pad=params['conv_pad'],
            hidden_sizes=params['hidden_sizes'])

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            regressor_args=dict(num_filters=params['conv_filters'],
                                filter_dims=params['conv_filter_sizes'],
                                strides=params['conv_strides'],
                                padding=params['conv_pads'],
                                hidden_sizes=params['hidden_sizes'],
                                use_trust_region=params['use_trust_region']))

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=params['n_epochs'],
                     batch_size=params['batch_size'])

        dowel_logger.remove_all()

        return tabular_log_file
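This CNN variant of run_garage also depends on a params dict that is not part of the snippet; a sketch with placeholder values (the distinct 'conv_pad' / 'conv_pads' keys mirror the code above):

params = {
    'conv_filters': (32, 64),      # illustrative filter counts
    'conv_filter_sizes': (8, 4),
    'conv_strides': (4, 2),
    'conv_pad': 'VALID',
    'conv_pads': 'VALID',
    'hidden_sizes': (256, ),
    'use_trust_region': False,
    'n_epochs': 100,
    'batch_size': 2048,
}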
Example No. 27
    def test_is_sampler(self):
        with LocalRunner(sess=self.sess) as runner:
            env = TfEnv(normalize(gym.make('InvertedPendulum-v2')))
            policy = GaussianMLPPolicy(env_spec=env.spec,
                                       hidden_sizes=(32, 32))
            baseline = LinearFeatureBaseline(env_spec=env.spec)
            algo = TRPO(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        max_path_length=100,
                        discount=0.99,
                        max_kl_step=0.01)

            runner.setup(algo,
                         env,
                         sampler_cls=ISSampler,
                         sampler_args=dict(n_backtrack=1, init_is=1))
            runner._start_worker()

            paths = runner.sampler.obtain_samples(1)
            assert paths == [], 'Should return empty paths if no history'

            # test that importance and live sampling are called alternately
            with unittest.mock.patch.object(ISSampler,
                                            '_obtain_is_samples') as mocked:
                assert runner.sampler.obtain_samples(2, 20)
                mocked.assert_not_called()

                assert runner.sampler.obtain_samples(3)
                mocked.assert_called_once_with(3, None, True)

            # test importance sampling for first n_is_pretrain iterations
            with unittest.mock.patch.object(ISSampler,
                                            '_obtain_is_samples') as mocked:
                runner.sampler.n_is_pretrain = 5
                runner.sampler.n_backtrack = 'all'
                runner.sampler.obtain_samples(4)

                mocked.assert_called_once_with(4, None, True)

            runner.sampler.obtain_samples(5)

            # test randomly drawing importance samples
            runner.sampler.randomize_draw = True
            assert runner.sampler.obtain_samples(6, 1)
            runner.sampler.randomize_draw = False

            runner.sampler.obtain_samples(7, 30)

            # test ess_threshold use
            runner.sampler.ess_threshold = 500
            paths = runner.sampler.obtain_samples(8, 30)
            assert paths == [], (
                'Should return empty paths when ess_threshold is large')
            runner.sampler.ess_threshold = 0

            # test random sample selection when len(paths) > batch size
            runner.sampler.n_is_pretrain = 15
            runner.sampler.obtain_samples(9, 10)
            runner.sampler.obtain_samples(10, 1)

            runner._shutdown_worker()
Example No. 28
 def test_baseline(self):
     """Test the baseline initialization."""
     box_env = TfEnv(DummyBoxEnv())
     deterministic_mlp_baseline = DeterministicMLPBaseline(env_spec=box_env)
     gaussian_mlp_baseline = GaussianMLPBaseline(env_spec=box_env)
Example No. 29
def run_garage_pytorch(env, seed, log_dir):
    """Create garage PyTorch PPO model and training.

    Args:
        env (gym.Env): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    env = TfEnv(normalize(env))

    deterministic.set_seed(seed)

    runner = LocalRunner(snapshot_config)

    policy = PyTorch_GMP(env.spec,
                         hidden_sizes=(32, 32),
                         hidden_nonlinearity=torch.tanh,
                         output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    policy_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                        policy,
                                        max_optimization_epochs=10,
                                        minibatch_size=64)
    vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                    value_function,
                                    max_optimization_epochs=10,
                                    minibatch_size=64)

    algo = PyTorch_PPO(env_spec=env.spec,
                       policy=policy,
                       value_function=value_function,
                       policy_optimizer=policy_optimizer,
                       vf_optimizer=vf_optimizer,
                       max_path_length=hyper_parameters['max_path_length'],
                       discount=0.99,
                       gae_lambda=0.95,
                       center_adv=True,
                       lr_clip_range=0.2)

    # Set up logger since we are not using run_experiment
    tabular_log_file = osp.join(log_dir, 'progress.csv')
    dowel_logger.add_output(dowel.StdOutput())
    dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
    dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

    runner.setup(algo, env)
    runner.train(n_epochs=hyper_parameters['n_epochs'],
                 batch_size=hyper_parameters['batch_size'])

    dowel_logger.remove_all()

    return tabular_log_file
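run_garage_pytorch shares its signature with run_garage_tf (Example No. 14), so a benchmark harness can call both the same way; the task, seeds, and directories here are illustrative.

import random

import gym

for seed in random.sample(range(100), 3):  # a few random trials
    env = gym.make('Pendulum-v0')          # hypothetical benchmark task
    log_dir = '/tmp/ppo_bench/pytorch_seed_{}'.format(seed)  # illustrative path
    run_garage_pytorch(env, seed, log_dir)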
Example No. 30
def dqn_pong(ctxt=None, seed=1, buffer_size=int(5e4)):
    """Train DQN on PongNoFrameskip-v4 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        buffer_size (int): Number of timesteps to store in replay buffer.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        n_epochs = 100
        steps_per_epoch = 20
        sampler_batch_size = 500
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size

        env = gym.make('PongNoFrameskip-v4')
        env = Noop(env, noop_max=30)
        env = MaxAndSkip(env, skip=4)
        env = EpisodicLife(env)
        if 'FIRE' in env.unwrapped.get_action_meanings():
            env = FireReset(env)
        env = Grayscale(env)
        env = Resize(env, 84, 84)
        env = ClipReward(env)
        env = StackFrames(env, 4)

        env = TfEnv(env, is_image=True)

        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=buffer_size,
                                           time_horizon=1)

        qf = DiscreteCNNQFunction(env_spec=env.spec,
                                  filter_dims=(8, 4, 3),
                                  num_filters=(32, 64, 64),
                                  strides=(4, 2, 1),
                                  dueling=False)

        policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf)
        exploration_policy = EpsilonGreedyPolicy(env_spec=env.spec,
                                                 policy=policy,
                                                 total_timesteps=num_timesteps,
                                                 max_epsilon=1.0,
                                                 min_epsilon=0.02,
                                                 decay_ratio=0.1)

        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_policy=exploration_policy,
                   replay_buffer=replay_buffer,
                   qf_lr=1e-4,
                   discount=0.99,
                   min_buffer_size=int(1e4),
                   double_q=False,
                   n_train_steps=500,
                   steps_per_epoch=steps_per_epoch,
                   target_network_update_freq=2,
                   buffer_batch_size=32)

        runner.setup(algo, env)
        runner.train(n_epochs=n_epochs, batch_size=sampler_batch_size)
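dqn_pong expects the experiment context supplied by garage's wrap_experiment, so a minimal launch might look like the following; applying the wrapper here rather than as a decorator at the definition site is an assumption for illustration.

from garage import wrap_experiment

# wrap_experiment supplies the ctxt argument (snapshot directory, etc.)
dqn_pong_exp = wrap_experiment(dqn_pong)
dqn_pong_exp(seed=1, buffer_size=int(5e4))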