Esempio n. 1
0
def ppo_garage_pytorch(ctxt, env_id, seed):
    """Create garage PyTorch PPO model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    runner = LocalRunner(ctxt)

    env = normalize(GymEnv(env_id))

    policy = PyTorch_GMP(env.spec,
                         hidden_sizes=(32, 32),
                         hidden_nonlinearity=torch.tanh,
                         output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    policy_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                        policy,
                                        max_optimization_epochs=10,
                                        minibatch_size=64)

    vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                    value_function,
                                    max_optimization_epochs=10,
                                    minibatch_size=64)

    algo = PyTorch_PPO(env_spec=env.spec,
                       policy=policy,
                       value_function=value_function,
                       policy_optimizer=policy_optimizer,
                       vf_optimizer=vf_optimizer,
                       discount=0.99,
                       gae_lambda=0.95,
                       center_adv=True,
                       lr_clip_range=0.2)

    runner.setup(algo, env)
    runner.train(n_epochs=hyper_parameters['n_epochs'],
                 batch_size=hyper_parameters['batch_size'])
Esempio n. 2
0
def mtppo_metaworld_mt10(ctxt, seed, epochs, batch_size, n_workers, n_tasks):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.
        n_workers (int): The number of workers the sampler should use.
        n_tasks (int): Number of tasks to use. Should be a multiple of 10.

    """
    set_seed(seed)
    mt10 = metaworld.MT10()
    train_task_sampler = MetaWorldTaskSampler(mt10,
                                              'train',
                                              lambda env, _: normalize(env),
                                              add_env_onehot=True)
    assert n_tasks % 10 == 0
    assert n_tasks <= 500
    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    algo = PPO(env_spec=env.spec,
               policy=policy,
               value_function=value_function,
               discount=0.99,
               gae_lambda=0.95,
               center_adv=True,
               lr_clip_range=0.2)

    trainer = Trainer(ctxt)
    trainer.setup(algo, env, n_workers=n_workers)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
Esempio n. 3
0
def gaussian_cnn_baseline(ctxt, env_id, seed):
    """Create Gaussian CNN Baseline on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt, max_cpus=12) as runner:
        env = GarageEnv(normalize(gym.make(env_id)))

        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      conv_filters=params['conv_filters'],
                                      conv_strides=params['conv_strides'],
                                      conv_pad=params['conv_pad'],
                                      hidden_sizes=params['hidden_sizes'])

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            regressor_args=dict(filters=params['conv_filters'],
                                strides=params['conv_strides'],
                                padding=params['conv_pad'],
                                hidden_sizes=params['hidden_sizes'],
                                use_trust_region=params['use_trust_region']))

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            flatten_input=False,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                learning_rate=1e-3,
            ),
        )

        runner.setup(algo, env)
        runner.train(n_epochs=params['n_epochs'],
                     batch_size=params['batch_size'])
Esempio n. 4
0
def run_garage_tf(env, seed, log_dir):
    """Create garage TensorFlow VPG model and training.

    Args:
        env (dict): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(env))

        policy = TF_GMP(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['hidden_sizes'],
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TF_VPG(env_spec=env.spec,
                      policy=policy,
                      baseline=baseline,
                      max_path_length=hyper_parameters['max_path_length'],
                      discount=hyper_parameters['discount'],
                      center_adv=hyper_parameters['center_adv'],
                      optimizer_args=dict(
                          tf_optimizer_args=dict(
                              learning_rate=hyper_parameters['learning_rate']),
                          verbose=True))  # yapf: disable

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['batch_size'])

        dowel_logger.remove_all()

        return tabular_log_file
Esempio n. 5
0
def ppo_pendulum(ctxt=None, seed=1):
    """Train PPO with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = normalize(GymEnv('InvertedDoublePendulum-v2'))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            use_trust_region=True,
        )

        # NOTE: make sure when setting entropy_method to 'max', set
        # center_adv to False and turn off policy gradient. See
        # tf.algos.NPO for detailed documentation.
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )

        trainer.setup(algo, env)

        trainer.train(n_epochs=120, batch_size=2048, plot=False)
Esempio n. 6
0
 def setup_method(self):
     super().setup_method()
     self.env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
     self.policy = GaussianMLPPolicy(
         env_spec=self.env.spec,
         hidden_sizes=(64, 64),
         hidden_nonlinearity=tf.nn.tanh,
         output_nonlinearity=None,
     )
     self.recurrent_policy = GaussianLSTMPolicy(env_spec=self.env.spec, )
     self.baseline = GaussianMLPBaseline(
         env_spec=self.env.spec,
         regressor_args=dict(hidden_sizes=(32, 32)),
     )
Esempio n. 7
0
 def setup_method(self):
     super().setup_method()
     self.env = normalize(
         GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))
     self.policy = GaussianMLPPolicy(
         env_spec=self.env.spec,
         hidden_sizes=(64, 64),
         hidden_nonlinearity=tf.nn.tanh,
         output_nonlinearity=None,
     )
     self.baseline = GaussianMLPBaseline(
         env_spec=self.env.spec,
         hidden_sizes=(32, 32),
     )
Esempio n. 8
0
def ppo_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow PPO model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = TF_GMP(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = TF_GMB(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            use_trust_region=False,
            optimizer=FirstOrderOptimizer,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=3e-4,
            ),
        )

        algo = TF_PPO(env_spec=env.spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      center_adv=True,
                      lr_clip_range=0.2,
                      optimizer_args=dict(batch_size=32,
                                          max_optimization_epochs=10,
                                          learning_rate=3e-4,
                                          verbose=True))

        trainer.setup(algo, env)
        trainer.train(n_epochs=hyper_parameters['n_epochs'],
                      batch_size=hyper_parameters['batch_size'])
Esempio n. 9
0
def run_garage_pytorch(env, seed, log_dir):
    """Create garage PyTorch PPO model and training.

    Args:
        env (dict): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    env = TfEnv(normalize(env))

    deterministic.set_seed(seed)

    runner = LocalRunner(snapshot_config)

    policy = PyTorch_GMP(env.spec,
                         hidden_sizes=hyper_parameters['hidden_sizes'],
                         hidden_nonlinearity=torch.tanh,
                         output_nonlinearity=None)

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = PyTorch_PPO(env_spec=env.spec,
                       policy=policy,
                       baseline=baseline,
                       optimizer=torch.optim.Adam,
                       policy_lr=hyper_parameters['learning_rate'],
                       max_path_length=hyper_parameters['max_path_length'],
                       discount=hyper_parameters['discount'],
                       gae_lambda=hyper_parameters['gae_lambda'],
                       center_adv=hyper_parameters['center_adv'],
                       lr_clip_range=hyper_parameters['lr_clip_range'])

    # Set up logger since we are not using run_experiment
    tabular_log_file = osp.join(log_dir, 'progress.csv')
    dowel_logger.add_output(dowel.StdOutput())
    dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
    dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

    runner.setup(algo, env)
    runner.train(n_epochs=hyper_parameters['n_epochs'],
                 batch_size=hyper_parameters['batch_size'])

    dowel_logger.remove_all()

    return tabular_log_file
def run_garage(env, seed, log_dir):
    '''
    Create garage model and training.
    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return:
    '''
    deterministic.set_seed(seed)
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=12,
                            inter_op_parallelism_threads=12)
    sess = tf.Session(config=config)
    with LocalTFRunner(snapshot_config, sess=sess, max_cpus=12) as runner:
        env = TfEnv(normalize(env))

        policy = CategoricalMLPPolicy(
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_epochs=10,
                       tf_optimizer_args=dict(learning_rate=1e-3),
                   ),
                   name='CategoricalMLPPolicyBenchmark')

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=5, batch_size=2048)
        dowel_logger.remove_all()

        return tabular_log_file
Esempio n. 11
0
def continuous_mlp_baseline(ctxt, env_id, seed):
    """Create Continuous MLP Baseline on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = GarageEnv(normalize(gym.make(env_id)))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=hyper_params['policy_hidden_sizes'],
            hidden_nonlinearity=hyper_params['hidden_nonlinearity'],
        )

        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
        )

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=hyper_params['max_path_length'],
                   discount=hyper_params['discount'],
                   gae_lambda=hyper_params['gae_lambda'],
                   lr_clip_range=hyper_params['lr_clip_range'],
                   entropy_method=hyper_params['entropy_method'],
                   policy_ent_coeff=hyper_params['policy_ent_coeff'],
                   optimizer_args=dict(
                       batch_size=32,
                       max_epochs=10,
                       learning_rate=1e-3,
                   ),
                   center_adv=hyper_params['center_adv'],
                   stop_entropy_gradient=True)

        runner.setup(algo,
                     env,
                     sampler_args=dict(n_envs=hyper_params['n_envs']))
        runner.train(n_epochs=hyper_params['n_epochs'],
                     batch_size=hyper_params['n_rollout_steps'])
Esempio n. 12
0
    def run_task(self, snapshot_config, *_):
        config = tf.ConfigProto(device_count={'GPU': 0},
                                allow_soft_placement=True,
                                intra_op_parallelism_threads=12,
                                inter_op_parallelism_threads=12)
        sess = tf.Session(config=config)
        with LocalTFRunner(snapshot_config=snapshot_config,
                           sess=sess) as runner:
            env = gym.make(self._env)
            env = TfEnv(normalize(env))
            env.reset()
            policy = GaussianMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=(32, 32),
                hidden_nonlinearity=tf.nn.tanh,
                output_nonlinearity=None,
            )

            baseline = GaussianMLPBaseline(
                env_spec=env.spec,
                regressor_args=dict(
                    hidden_sizes=(64, 64),
                    use_trust_region=False,
                    optimizer=FirstOrderOptimizer,
                    optimizer_args=dict(
                        batch_size=32,
                        max_epochs=10,
                        tf_optimizer_args=dict(learning_rate=1e-3),
                    ),
                ),
            )

            algo = PPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=100,
                discount=0.99,
                gae_lambda=0.95,
                lr_clip_range=0.2,
                policy_ent_coeff=0.0,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                    tf_optimizer_args=dict(learning_rate=1e-3),
                ),
            )
            runner.setup(algo, env, sampler_args=dict(n_envs=12))
            runner.train(n_epochs=5, batch_size=2048)
Esempio n. 13
0
def multi_env_trpo(ctxt=None, seed=1):
    """Train TRPO on two different PointEnv instances.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env1 = normalize(PointEnv(goal=(-1., 0.), max_episode_length=100))
        env2 = normalize(PointEnv(goal=(1., 0.), max_episode_length=100))
        env = MultiEnvWrapper([env1, env2])

        policy = GaussianMLPPolicy(env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        trainer.setup(algo, env)
        trainer.train(n_epochs=40, batch_size=2048, plot=False)
Esempio n. 14
0
def test_flatten():
    env = normalize(gym.make('Pendulum-v0'),
                    normalize_reward=True,
                    normalize_obs=True,
                    flatten_obs=True)
    for i in range(10):
        env.reset()
        for e in range(100):
            env.render()
            action = env.action_space.sample()
            next_obs, reward, done, info = env.step(action)
            assert next_obs.shape == flat_dim(env.observation_space)
            if done:
                break
    env.close()
Esempio n. 15
0
def test_unflatten():
    env = normalize(gym.make('Blackjack-v0'),
                    normalize_reward=True,
                    normalize_obs=True,
                    flatten_obs=False)
    for i in range(10):
        env.reset()
        for e in range(100):
            action = env.action_space.sample()
            next_obs, reward, done, info = env.step(action)
            assert flatten(env.observation_space,
                           next_obs).shape == flat_dim(env.observation_space)
            if done:
                break
    env.close()
Esempio n. 16
0
 def setup_method(self):
     super().setup_method()
     self.env = normalize(GymEnv('InvertedDoublePendulum-v2'))
     self.policy = GaussianMLPPolicy(
         env_spec=self.env.spec,
         hidden_sizes=(64, 64),
         hidden_nonlinearity=tf.nn.tanh,
         output_nonlinearity=None,
     )
     self.lstm_policy = GaussianLSTMPolicy(env_spec=self.env.spec)
     self.gru_policy = GaussianGRUPolicy(env_spec=self.env.spec)
     self.baseline = GaussianMLPBaseline(
         env_spec=self.env.spec,
         hidden_sizes=(32, 32),
     )
Esempio n. 17
0
 def setup_method(self):
     """Setup method which is called before every test."""
     self.env = normalize(GymEnv(HalfCheetahDirEnv(),
                                 max_episode_length=100),
                          expected_action_scale=10.)
     task_sampler = SetTaskSampler(lambda: normalize(
         GymEnv(HalfCheetahDirEnv()), expected_action_scale=10.))
     self.policy = GaussianMLPPolicy(
         env_spec=self.env.spec,
         hidden_sizes=(64, 64),
         hidden_nonlinearity=torch.tanh,
         output_nonlinearity=None,
     )
     self.value_function = GaussianMLPValueFunction(env_spec=self.env.spec,
                                                    hidden_sizes=(32, 32))
     self.algo = MAMLPPO(env=self.env,
                         policy=self.policy,
                         task_sampler=task_sampler,
                         value_function=self.value_function,
                         meta_batch_size=5,
                         discount=0.99,
                         gae_lambda=1.,
                         inner_lr=0.1,
                         num_grad_updates=1)
Esempio n. 18
0
def ddpg_pendulum(ctxt=None, seed=1, lr=1e-4):
    """Train DDPG with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        lr (float): Learning rate for policy optimization.

    """
    set_seed(seed)
    runner = LocalRunner(ctxt)
    env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec, policy, sigma=0.2)

    qf = ContinuousMLPQFunction(env_spec=env.spec,
                                hidden_sizes=[64, 64],
                                hidden_nonlinearity=F.relu)

    replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                       size_in_transitions=int(1e6),
                                       time_horizon=100)

    policy_optimizer = (torch.optim.Adagrad, {'lr': lr, 'lr_decay': 0.99})

    ddpg = DDPG(env_spec=env.spec,
                policy=policy,
                qf=qf,
                replay_buffer=replay_buffer,
                steps_per_epoch=20,
                n_train_steps=50,
                min_buffer_size=int(1e4),
                exploration_policy=exploration_policy,
                target_update_tau=1e-2,
                discount=0.9,
                policy_optimizer=policy_optimizer,
                qf_optimizer=torch.optim.Adam)

    runner.setup(algo=ddpg, env=env)

    runner.train(n_epochs=500, batch_size=100)
def mttrpo_metaworld_mt1_push(ctxt, seed, epochs, batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.

    """
    set_seed(seed)
    n_tasks = 50
    mt1 = metaworld.MT1('push-v1')
    train_task_sampler = MetaWorldTaskSampler(mt1, 'train',
                                              lambda env, _: normalize(env))
    envs = [env_up() for env_up in train_task_sampler.sample(n_tasks)]
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    sampler = RaySampler(agents=policy,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length)

    algo = TRPO(env_spec=env.spec,
                policy=policy,
                value_function=value_function,
                sampler=sampler,
                discount=0.99,
                gae_lambda=0.95)

    trainer = Trainer(ctxt)
    trainer.setup(algo, env)
    trainer.train(n_epochs=epochs, batch_size=batch_size)
Esempio n. 20
0
 def setup_method(self):
     """Setup method which is called before every test."""
     self.env = normalize(GymEnv('InvertedDoublePendulum-v2'))
     self.policy = GaussianMLPPolicy(
         env_spec=self.env.spec,
         hidden_sizes=(64, 64),
         hidden_nonlinearity=torch.tanh,
         output_nonlinearity=None,
     )
     self.value_function = GaussianMLPValueFunction(env_spec=self.env.spec)
     deterministic.set_seed(0)
     self.sampler = LocalSampler(
         agents=self.policy,
         envs=self.env,
         max_episode_length=self.env.spec.max_episode_length,
         is_tf_worker=True)
def run_task(*_):
    env = TheanoEnv(normalize(CartpoleEnv()))

    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = InstrumentedTRPO(env=env,
                            policy=policy,
                            baseline=baseline,
                            batch_size=4000,
                            max_path_length=100,
                            n_itr=4,
                            discount=0.99,
                            step_size=0.01,
                            plot=True)
    algo.train()
Esempio n. 22
0
def run_task(*_):
    env = normalize(CartpoleEnv())

    policy = DummyPolicy(env_spec=env)

    baseline = LinearFeatureBaseline(env_spec=env)
    algo = InstrumentedNOP(env=env,
                           policy=policy,
                           baseline=baseline,
                           batch_size=4000,
                           max_path_length=100,
                           n_itr=4,
                           discount=0.99,
                           step_size=0.01,
                           plot=True)
    algo.train()
Esempio n. 23
0
 def test_unflatten(self):
     env = TheanoEnv(
         normalize(gym.make('Blackjack-v0'),
                   normalize_reward=True,
                   normalize_obs=True,
                   flatten_obs=False))
     for i in range(10):
         env.reset()
         for e in range(5):
             action = env.action_space.sample()
             next_obs, reward, done, info = env.step(action)
             assert (env.observation_space.flatten(next_obs).shape ==
                     env.observation_space.flat_dim)  # yapf: disable
             if done:
                 break
     env.close()
Esempio n. 24
0
def ppo_memorize_digits(ctxt=None, seed=1, batch_size=4000):
    """Train PPO on MemorizeDigits-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        batch_size (int): Number of timesteps to use in each training step.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env = TfEnv(normalize(gym.make('MemorizeDigits-v0')), is_image=True)
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      num_filters=(32, 64, 64),
                                      filter_dims=(5, 3, 2),
                                      strides=(4, 2, 1),
                                      padding='VALID',
                                      hidden_sizes=(256, ))

        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       regressor_args=dict(
                                           num_filters=(32, 64, 64),
                                           filter_dims=(5, 3, 2),
                                           strides=(4, 2, 1),
                                           padding='VALID',
                                           hidden_sizes=(256, ),
                                           use_trust_region=True))

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_epochs=10,
                       tf_optimizer_args=dict(learning_rate=1e-3),
                   ),
                   flatten_input=False)

        runner.setup(algo, env)
        runner.train(n_epochs=1000, batch_size=batch_size)
Esempio n. 25
0
def run_garage(env, seed, log_dir):
    """Create garage Tensorflow PPO model and training.

    Args:
        env (dict): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(env))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['hidden_sizes'],
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=hyper_parameters['max_path_length'],
                    discount=hyper_parameters['discount'],
                    gae_lambda=hyper_parameters['gae_lambda'],
                    max_kl_step=hyper_parameters['max_kl'])

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['batch_size'])

        dowel_logger.remove_all()

        return tabular_log_file
Esempio n. 26
0
    def test_pickling(self):
        """Test pickle and unpickle."""

        deterministic.set_seed(0)
        n_epochs = 10
        steps_per_epoch = 20
        sampler_batch_size = 100
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
        env = normalize(
            GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))
        policy = DeterministicMLPPolicy(env_spec=env.spec,
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=F.relu,
                                        output_nonlinearity=None)
        exploration_policy = AddGaussianNoise(env.spec,
                                              policy,
                                              total_timesteps=num_timesteps,
                                              max_sigma=0.1,
                                              min_sigma=0.1)
        qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                     hidden_sizes=[256, 256],
                                     hidden_nonlinearity=F.relu)
        qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                     hidden_sizes=[256, 256],
                                     hidden_nonlinearity=F.relu)
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
        sampler = LocalSampler(agents=exploration_policy,
                               envs=env,
                               max_episode_length=env.spec.max_episode_length,
                               worker_class=FragmentWorker)
        td3 = TD3(env_spec=env.spec,
                  policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  replay_buffer=replay_buffer,
                  sampler=sampler,
                  exploration_policy=exploration_policy,
                  steps_per_epoch=steps_per_epoch,
                  grad_steps_per_env_step=1,
                  num_evaluation_episodes=1,
                  discount=0.99)
        prefer_gpu()
        td3.to()

        pickled = pickle.dumps(td3)
        unpickled = pickle.loads(pickled)
        assert unpickled
Esempio n. 27
0
def mtppo_metaworld_mt10(ctxt, seed, epochs, batch_size, n_worker):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        batch_size (int): Number of environment steps in one batch.
        n_worker (int): The number of workers the sampler should use.

    """
    set_seed(seed)
    tasks = mwb.MT10.get_train_tasks().all_task_names
    envs = []
    for task in tasks:
        envs.append(
            normalize(GymEnv(mwb.MT10.from_task(task),
                             max_episode_length=150)))
    env = MultiEnvWrapper(envs,
                          sample_strategy=round_robin_strategy,
                          mode='vanilla')

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    algo = PPO(env_spec=env.spec,
               policy=policy,
               value_function=value_function,
               discount=0.99,
               gae_lambda=0.95,
               center_adv=True,
               lr_clip_range=0.2)

    runner = LocalRunner(ctxt)
    runner.setup(algo, env, n_workers=n_worker)
    runner.train(n_epochs=epochs, batch_size=batch_size)
Esempio n. 28
0
def categorical_lstm_policy(ctxt, env_id, seed):
    """Create Categorical LSTM Policy on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = CategoricalLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )

        trainer.setup(algo, env)
        trainer.train(n_epochs=488, batch_size=2048)
Esempio n. 29
0
def categorical_cnn_policy(ctxt, env_id, seed):
    """Create Categorical CNN Policy on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = normalize(GymEnv(env_id))

        policy = CategoricalCNNPolicy(
            env_spec=env.spec,
            filters=hyper_params['conv_filters'],
            strides=hyper_params['conv_strides'],
            padding=hyper_params['conv_pad'],
            hidden_sizes=hyper_params['hidden_sizes'])

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            filters=hyper_params['conv_filters'],
            strides=hyper_params['conv_strides'],
            padding=hyper_params['conv_pad'],
            hidden_sizes=hyper_params['hidden_sizes'],
            use_trust_region=hyper_params['use_trust_region'])

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_optimization_epochs=10,
                       learning_rate=1e-3,
                   ))

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_params['n_epochs'],
                     batch_size=hyper_params['batch_size'])
Esempio n. 30
0
def test_fixed_alpha():
    """Test if using fixed_alpha ensures that alpha is non differentiable."""
    # pylint: disable=unexpected-keyword-arg
    env = normalize(GymEnv('InvertedDoublePendulum-v2',
                           max_episode_length=100))
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )
    trainer = Trainer(snapshot_config=snapshot_config)
    sampler = LocalSampler(agents=policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              sampler=sampler,
              gradient_steps_per_itr=100,
              replay_buffer=replay_buffer,
              min_buffer_size=100,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=64,
              reward_scale=1.,
              steps_per_epoch=1,
              fixed_alpha=np.exp(0.5))
    trainer.setup(sac, env)
    sac.to()
    trainer.train(n_epochs=1, batch_size=100, plot=False)
    assert torch.allclose(torch.Tensor([0.5]), sac._log_alpha.cpu())
    assert not sac._use_automatic_entropy_tuning