# Imports assumed by the excerpts below (garage's standard module layout).
import pickle

import torch
from torch.nn import functional as F

from garage.envs import GymEnv, normalize
from garage.experiment import deterministic
from garage.experiment.deterministic import set_seed
from garage.np.exploration_policies import AddGaussianNoise
from garage.np.policies import UniformRandomPolicy
from garage.replay_buffer import PathBuffer
from garage.sampler import FragmentWorker, LocalSampler
from garage.torch import prefer_gpu
from garage.torch.algos import TD3
from garage.torch.policies import DeterministicMLPPolicy
from garage.torch.q_functions import ContinuousMLPQFunction
from garage.trainer import Trainer

from tests.fixtures import snapshot_config


def test_pickling(self):
    """Test pickle and unpickle."""
    deterministic.set_seed(0)
    n_epochs = 10
    steps_per_epoch = 20
    sampler_batch_size = 100
    num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size

    env = normalize(
        GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=None)

    exploration_policy = AddGaussianNoise(env.spec,
                                          policy,
                                          total_timesteps=num_timesteps,
                                          max_sigma=0.1,
                                          min_sigma=0.1)

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    sampler = LocalSampler(agents=exploration_policy,
                           envs=env,
                           max_episode_length=env.spec.max_episode_length,
                           worker_class=FragmentWorker)

    td3 = TD3(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              replay_buffer=replay_buffer,
              sampler=sampler,
              exploration_policy=exploration_policy,
              steps_per_epoch=steps_per_epoch,
              grad_steps_per_env_step=1,
              num_evaluation_episodes=1,
              discount=0.99)

    prefer_gpu()
    td3.to()
    pickled = pickle.dumps(td3)
    unpickled = pickle.loads(pickled)
    assert unpickled
def test_td3_inverted_double_pendulum(self):
    """Test TD3 end to end on InvertedDoublePendulum-v2."""
    deterministic.set_seed(0)
    n_epochs = 10
    steps_per_epoch = 20
    sampler_batch_size = 100
    num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
    trainer = Trainer(snapshot_config=snapshot_config)

    env = normalize(
        GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=None)

    exploration_policy = AddGaussianNoise(env.spec,
                                          policy,
                                          total_timesteps=num_timesteps,
                                          max_sigma=0.1,
                                          min_sigma=0.1)

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    td3 = TD3(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              replay_buffer=replay_buffer,
              exploration_policy=exploration_policy,
              steps_per_epoch=steps_per_epoch,
              grad_steps_per_env_step=1,
              num_evaluation_episodes=1,
              discount=0.99)

    prefer_gpu()
    td3.to()
    trainer.setup(td3, env, sampler_cls=LocalSampler)
    trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size)
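# Either test above can be run on its own with pytest. The file path below
# is an assumption based on garage's usual test layout and may differ in
# other checkouts:
#
#     pytest tests/garage/torch/algos/test_td3.py -k test_pickling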
def td3_half_cheetah(ctxt=None, seed=1):
    """Train TD3 on the HalfCheetah-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    n_epochs = 500
    steps_per_epoch = 20
    sampler_batch_size = 250
    num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size

    trainer = Trainer(ctxt)
    env = normalize(GymEnv('HalfCheetah-v2'))

    policy = DeterministicMLPPolicy(env_spec=env.spec,
                                    hidden_sizes=[256, 256],
                                    hidden_nonlinearity=F.relu,
                                    output_nonlinearity=torch.tanh)

    exploration_policy = AddGaussianNoise(env.spec,
                                          policy,
                                          total_timesteps=num_timesteps,
                                          max_sigma=0.1,
                                          min_sigma=0.1)

    uniform_random_policy = UniformRandomPolicy(env.spec)

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[256, 256],
                                 hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    td3 = TD3(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              replay_buffer=replay_buffer,
              policy_optimizer=torch.optim.Adam,
              qf_optimizer=torch.optim.Adam,
              exploration_policy=exploration_policy,
              uniform_random_policy=uniform_random_policy,
              target_update_tau=0.005,
              discount=0.99,
              policy_noise_clip=0.5,
              policy_noise=0.2,
              policy_lr=1e-3,
              qf_lr=1e-3,
              # Reuse the values defined above so the exploration noise
              # schedule derived from num_timesteps matches the run length.
              steps_per_epoch=steps_per_epoch,
              start_steps=1000,
              grad_steps_per_env_step=50,
              min_buffer_size=1000,
              buffer_batch_size=100)

    trainer.setup(algo=td3, env=env)
    trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size)
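# A minimal sketch of how this example would typically be launched. garage
# examples use the wrap_experiment decorator as their entry point; the
# snapshot_mode value here is an illustrative choice, not a required one:
#
#     from garage import wrap_experiment
#
#     @wrap_experiment(snapshot_mode='last')
#     def td3_half_cheetah(ctxt=None, seed=1):
#         ...
#
#     td3_half_cheetah(seed=1)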
def td3_garage_pytorch(ctxt, env_id, seed):
    """Create garage PyTorch TD3 model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    trainer = Trainer(ctxt)

    num_timesteps = (hyper_parameters['n_epochs'] *
                     hyper_parameters['steps_per_epoch'] *
                     hyper_parameters['batch_size'])

    env = normalize(GymEnv(env_id))

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=hyper_parameters['policy_hidden_sizes'],
        hidden_nonlinearity=F.relu,
        output_nonlinearity=torch.tanh)

    exploration_policy = AddGaussianNoise(
        env.spec,
        policy,
        total_timesteps=num_timesteps,
        max_sigma=hyper_parameters['sigma'],
        min_sigma=hyper_parameters['sigma'])

    uniform_random_policy = UniformRandomPolicy(env.spec)

    qf1 = ContinuousMLPQFunction(
        env_spec=env.spec,
        hidden_sizes=hyper_parameters['qf_hidden_sizes'],
        hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(
        env_spec=env.spec,
        hidden_sizes=hyper_parameters['qf_hidden_sizes'],
        hidden_nonlinearity=F.relu)

    replay_buffer = PathBuffer(
        capacity_in_transitions=hyper_parameters['replay_buffer_size'])

    td3 = TD3(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              exploration_policy=exploration_policy,
              uniform_random_policy=uniform_random_policy,
              replay_buffer=replay_buffer,
              steps_per_epoch=hyper_parameters['steps_per_epoch'],
              policy_lr=hyper_parameters['policy_lr'],
              qf_lr=hyper_parameters['qf_lr'],
              target_update_tau=hyper_parameters['target_update_tau'],
              discount=hyper_parameters['discount'],
              grad_steps_per_env_step=hyper_parameters[
                  'grad_steps_per_env_step'],
              start_steps=hyper_parameters['start_steps'],
              min_buffer_size=hyper_parameters['min_buffer_size'],
              buffer_batch_size=hyper_parameters['buffer_batch_size'],
              policy_optimizer=torch.optim.Adam,
              qf_optimizer=torch.optim.Adam,
              policy_noise_clip=hyper_parameters['policy_noise_clip'],
              policy_noise=hyper_parameters['policy_noise'])

    prefer_gpu()
    td3.to()
    trainer.setup(td3, env)
    trainer.train(n_epochs=hyper_parameters['n_epochs'],
                  batch_size=hyper_parameters['batch_size'])
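# The benchmark above assumes a module-level hyper_parameters dict defined
# alongside it. The sketch below covers every key the function reads; the
# values are illustrative assumptions, not the benchmark's published
# settings.
hyper_parameters = {
    'policy_hidden_sizes': [256, 256],
    'qf_hidden_sizes': [256, 256],
    'policy_lr': 1e-3,
    'qf_lr': 1e-3,
    'n_epochs': 250,
    'steps_per_epoch': 40,
    'batch_size': 100,
    'start_steps': 1000,
    'min_buffer_size': int(1e4),
    'buffer_batch_size': 100,
    'replay_buffer_size': int(1e6),
    'target_update_tau': 0.005,
    'grad_steps_per_env_step': 50,
    'discount': 0.99,
    'sigma': 0.1,
    'policy_noise': 0.2,
    'policy_noise_clip': 0.5,
}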