Example 1
    def test_pickling(self):
        """Test pickle and unpickle."""

        deterministic.set_seed(0)
        n_epochs = 10
        steps_per_epoch = 20
        sampler_batch_size = 100
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
        env = normalize(
            GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))
        policy = DeterministicMLPPolicy(env_spec=env.spec,
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=F.relu,
                                        output_nonlinearity=None)
        exploration_policy = AddGaussianNoise(env.spec,
                                              policy,
                                              total_timesteps=num_timesteps,
                                              max_sigma=0.1,
                                              min_sigma=0.1)
        qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                     hidden_sizes=[256, 256],
                                     hidden_nonlinearity=F.relu)
        qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                     hidden_sizes=[256, 256],
                                     hidden_nonlinearity=F.relu)
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
        sampler = LocalSampler(agents=exploration_policy,
                               envs=env,
                               max_episode_length=env.spec.max_episode_length,
                               worker_class=FragmentWorker)
        td3 = TD3(env_spec=env.spec,
                  policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  replay_buffer=replay_buffer,
                  sampler=sampler,
                  exploration_policy=exploration_policy,
                  steps_per_epoch=steps_per_epoch,
                  grad_steps_per_env_step=1,
                  num_evaluation_episodes=1,
                  discount=0.99)
        prefer_gpu()
        td3.to()

        pickled = pickle.dumps(td3)
        unpickled = pickle.loads(pickled)
        assert unpickled
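
Note: these are excerpts, so the imports are not shown. Below is a minimal sketch of the imports the snippets on this page would typically need, assuming the usual garage PyTorch module layout; exact paths can vary between garage releases, so treat this as a starting point rather than the canonical header.

# Imports assumed by the examples on this page (typical garage layout;
# adjust to your installed release).
import pickle

import torch
from torch.nn import functional as F

from garage.envs import GymEnv, normalize
from garage.experiment import deterministic
from garage.experiment.deterministic import set_seed
from garage.np.exploration_policies import AddGaussianNoise
from garage.np.policies import UniformRandomPolicy
from garage.replay_buffer import PathBuffer
from garage.sampler import FragmentWorker, LocalSampler
from garage.torch import prefer_gpu
from garage.torch.algos import TD3
from garage.torch.policies import DeterministicMLPPolicy
from garage.torch.q_functions import ContinuousMLPQFunction
from garage.trainer import TFTrainer, Trainer
# snapshot_config in Example 2 is a test fixture and is not reproduced here.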
Example 2
    def test_td3_inverted_double_pendulum(self):
        deterministic.set_seed(0)
        n_epochs = 10
        steps_per_epoch = 20
        sampler_batch_size = 100
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
        trainer = Trainer(snapshot_config=snapshot_config)
        env = normalize(
            GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))
        policy = DeterministicMLPPolicy(env_spec=env.spec,
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=F.relu,
                                        output_nonlinearity=None)
        exploration_policy = AddGaussianNoise(env.spec,
                                              policy,
                                              total_timesteps=num_timesteps,
                                              max_sigma=0.1,
                                              min_sigma=0.1)
        qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                     hidden_sizes=[256, 256],
                                     hidden_nonlinearity=F.relu)
        qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                     hidden_sizes=[256, 256],
                                     hidden_nonlinearity=F.relu)
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
        td3 = TD3(env_spec=env.spec,
                  policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  replay_buffer=replay_buffer,
                  exploration_policy=exploration_policy,
                  steps_per_epoch=steps_per_epoch,
                  grad_steps_per_env_step=1,
                  num_evaluation_episodes=1,
                  discount=0.99)

        prefer_gpu()
        td3.to()
        trainer.setup(td3, env, sampler_cls=LocalSampler)
        trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size)
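
Example 2 uses the older garage API, where the sampler class is handed to trainer.setup(..., sampler_cls=LocalSampler). On more recent releases the sampler is constructed explicitly and passed to the algorithm, as Example 1 does. A hedged sketch of that newer form, reusing the objects defined above:

# Sketch only: newer-style setup where the sampler is built explicitly and
# given to the algorithm instead of trainer.setup(..., sampler_cls=...).
sampler = LocalSampler(agents=exploration_policy,
                       envs=env,
                       max_episode_length=env.spec.max_episode_length,
                       worker_class=FragmentWorker)
td3 = TD3(env_spec=env.spec,
          policy=policy,
          qf1=qf1,
          qf2=qf2,
          replay_buffer=replay_buffer,
          sampler=sampler,
          exploration_policy=exploration_policy,
          steps_per_epoch=steps_per_epoch,
          grad_steps_per_env_step=1,
          num_evaluation_episodes=1,
          discount=0.99)
trainer.setup(td3, env)
trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size)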
Example 3
def td3_half_cheetah(ctxt=None, seed=1):
    """Train TD3 with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
        determinism.
    """
    set_seed(seed)

    n_epochs = 500
    steps_per_epoch = 20
    sampler_batch_size = 250
    num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size

    trainer = Trainer(ctxt)
    env = normalize(GymEnv('HalfCheetah-v2'))

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[256, 256],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    exploration_policy = AddGaussianNoise(env.spec,
                                          policy,
                                          total_timesteps=num_timesteps,
                                          max_sigma=0.1,
                                          min_sigma=0.1)

    uniform_random_policy = UniformRandomPolicy(env.spec)

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    td3 = TD3(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              replay_buffer=replay_buffer,
              policy_optimizer=torch.optim.Adam,
              qf_optimizer=torch.optim.Adam,
              exploration_policy=exploration_policy,
              uniform_random_policy=uniform_random_policy,
              target_update_tau=0.005,
              discount=0.99,
              policy_noise_clip=0.5,
              policy_noise=0.2,
              policy_lr=1e-3,
              qf_lr=1e-3,
              steps_per_epoch=40,
              start_steps=1000,
              grad_steps_per_env_step=50,
              min_buffer_size=1000,
              buffer_batch_size=100)

    trainer.setup(algo=td3, env=env)
    trainer.train(n_epochs=750, batch_size=100)
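
Example 3 is a launcher function in the style of garage's example scripts: the ctxt argument is normally supplied by the wrap_experiment decorator rather than by the caller. A minimal sketch of how such a launcher is usually invoked, assuming wrap_experiment is imported from garage; the snapshot_mode option shown is illustrative only.

from garage import wrap_experiment

# Wrapping the launcher lets garage create the ExperimentContext (ctxt) and
# the snapshotter automatically; snapshot_mode='none' is an illustrative choice.
td3_half_cheetah = wrap_experiment(snapshot_mode='none')(td3_half_cheetah)

if __name__ == '__main__':
    # seed is forwarded to set_seed inside the launcher.
    td3_half_cheetah(seed=1)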
Example 4
def td3_garage_pytorch(ctxt, env_id, seed):
    """Create garage TensorFlow TD3 model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Localtrainer to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        num_timesteps = hyper_parameters['n_epochs'] * hyper_parameters[
            'steps_per_epoch'] * hyper_parameters['batch_size']
        env = normalize(GymEnv(env_id))

        policy = DeterministicMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['policy_hidden_sizes'],
            hidden_nonlinearity=F.relu,
            output_nonlinearity=torch.tanh)

        exploration_policy = AddGaussianNoise(
            env.spec,
            policy,
            total_timesteps=num_timesteps,
            max_sigma=hyper_parameters['sigma'],
            min_sigma=hyper_parameters['sigma'])

        uniform_random_policy = UniformRandomPolicy(env.spec)

        qf1 = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['qf_hidden_sizes'],
            hidden_nonlinearity=F.relu)

        qf2 = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['qf_hidden_sizes'],
            hidden_nonlinearity=F.relu)

        replay_buffer = PathBuffer(
            capacity_in_transitions=hyper_parameters['replay_buffer_size'])

        td3 = TD3(env_spec=env.spec,
                  policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  exploration_policy=exploration_policy,
                  uniform_random_policy=uniform_random_policy,
                  replay_buffer=replay_buffer,
                  steps_per_epoch=hyper_parameters['steps_per_epoch'],
                  policy_lr=hyper_parameters['policy_lr'],
                  qf_lr=hyper_parameters['qf_lr'],
                  target_update_tau=hyper_parameters['target_update_tau'],
                  discount=hyper_parameters['discount'],
                  grad_steps_per_env_step=hyper_parameters[
                      'grad_steps_per_env_step'],
                  start_steps=hyper_parameters['start_steps'],
                  min_buffer_size=hyper_parameters['min_buffer_size'],
                  buffer_batch_size=hyper_parameters['buffer_batch_size'],
                  policy_optimizer=torch.optim.Adam,
                  qf_optimizer=torch.optim.Adam,
                  policy_noise_clip=hyper_parameters['policy_noise_clip'],
                  policy_noise=hyper_parameters['policy_noise'])

        prefer_gpu()
        td3.to()
        trainer.setup(td3, env)
        trainer.train(n_epochs=hyper_parameters['n_epochs'],
                      batch_size=hyper_parameters['batch_size'])
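
Example 4 reads its settings from a module-level hyper_parameters dict that is not shown in these excerpts. The keys can be read off the code above; the values below are an illustrative configuration assembled mostly from Example 3's explicit numbers, not the benchmark's actual configuration.

# Illustrative values only -- the keys match what td3_garage_pytorch reads,
# but the real benchmark configuration is not shown in these excerpts.
hyper_parameters = {
    'n_epochs': 500,
    'steps_per_epoch': 20,
    'batch_size': 250,
    'policy_hidden_sizes': [256, 256],
    'qf_hidden_sizes': [256, 256],
    'sigma': 0.1,
    'replay_buffer_size': int(1e6),
    'policy_lr': 1e-3,
    'qf_lr': 1e-3,
    'target_update_tau': 0.005,
    'discount': 0.99,
    'grad_steps_per_env_step': 50,
    'start_steps': 1000,
    'min_buffer_size': 1000,
    'buffer_batch_size': 100,
    'policy_noise_clip': 0.5,
    'policy_noise': 0.2,
}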