def get_ddpg(vec_env, device):
    model_factory = DeterministicPolicyModelFactory(
        input_block=NormalizeObservationsFactory(input_shape=17),
        policy_backbone=MLPFactory(input_length=17, hidden_layers=[64, 64], activation='tanh'),
        value_backbone=MLPFactory(input_length=23, hidden_layers=[64, 64], activation='tanh'),
    )
    model = model_factory.instantiate(action_space=vec_env.action_space)

    reinforcer = BufferedOffPolicyIterationReinforcer(
        device=device,
        environment=vec_env,
        settings=BufferedOffPolicyIterationReinforcerSettings(
            rollout_steps=2,
            training_steps=64,
        ),
        model=model,
        algo=DeepDeterministicPolicyGradient(
            model_factory=model_factory,
            discount_factor=0.99,
            tau=0.01,
        ),
        env_roller=TransitionReplayEnvRoller(
            environment=vec_env,
            device=device,
            action_noise=OuNoise(std_dev=0.2, environment=vec_env),
            replay_buffer=CircularReplayBuffer(
                buffer_capacity=1_000_000,
                buffer_initial_size=2_000,
                num_envs=vec_env.num_envs,
                observation_space=vec_env.observation_space,
                action_space=vec_env.action_space
            ),
            normalize_returns=True,
            discount_factor=0.99
        ),
    )

    return model, reinforcer
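# Why the critic (value) backbone uses input_length=23 in get_ddpg above and in
# half_cheetah_ddpg below: the DDPG critic consumes the observation concatenated
# with the action, and HalfCheetah-v2 has a 17-dim observation and a 6-dim
# action (17 + 6 = 23). A minimal sanity check along these lines is possible,
# assuming gym with mujoco-py is installed; the helper name is illustrative only.
def _check_half_cheetah_critic_input_size():
    import gym

    env = gym.make('HalfCheetah-v2')
    obs_dim = env.observation_space.shape[0]   # 17 for HalfCheetah-v2
    act_dim = env.action_space.shape[0]        # 6 for HalfCheetah-v2
    assert obs_dim + act_dim == 23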
def half_cheetah_ddpg():
    device = torch.device('cuda:0')
    seed = 1002

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    vec_env = DummyVecEnvWrapper(MujocoEnv('HalfCheetah-v2')).instantiate(parallel_envs=1, seed=seed)

    model_factory = DeterministicPolicyModelFactory(
        input_block=NormalizeObservationsFactory(input_shape=17),
        policy_backbone=MLPFactory(input_length=17, hidden_layers=[64, 64], activation='tanh'),
        value_backbone=MLPFactory(input_length=23, hidden_layers=[64, 64], activation='tanh'),
    )
    model = model_factory.instantiate(action_space=vec_env.action_space)

    reinforcer = BufferedOffPolicyIterationReinforcer(
        device=device,
        environment=vec_env,
        settings=BufferedOffPolicyIterationReinforcerSettings(
            rollout_steps=2,
            training_steps=64,
        ),
        model=model,
        algo=DeepDeterministicPolicyGradient(
            model_factory=model_factory,
            discount_factor=0.99,
            tau=0.01,
        ),
        env_roller=TransitionReplayEnvRoller(
            environment=vec_env,
            device=device,
            action_noise=OuNoise(std_dev=0.2, environment=vec_env),
            replay_buffer=CircularReplayBuffer(
                buffer_capacity=1_000_000,
                buffer_initial_size=2_000,
                num_envs=vec_env.num_envs,
                observation_space=vec_env.observation_space,
                action_space=vec_env.action_space
            ),
            normalize_returns=True,
            discount_factor=0.99
        ),
    )
def get_ppo(vec_env, device):
    model = StochasticPolicyModelFactory(
        input_block=NormalizeObservationsFactory(input_shape=17),
        # The shared backbone consumes the 17-dim normalized observation
        backbone=MLPFactory(input_length=17, hidden_layers=[64, 64], activation='tanh'),
    ).instantiate(action_space=vec_env.action_space)

    cliprange = LinearSchedule(
        initial_value=0.1,
        final_value=0.0
    )

    reinforcer = OnPolicyIterationReinforcer(
        device=device,
        settings=OnPolicyIterationReinforcerSettings(
            batch_size=256,
            experience_replay=4,
            number_of_steps=128
        ),
        model=model,
        algo=PpoPolicyGradient(
            entropy_coefficient=0.01,
            value_coefficient=0.5,
            max_grad_norm=0.5,
            discount_factor=0.99,
            gae_lambda=0.95,
            cliprange=cliprange
        ),
        env_roller=StepEnvRoller(
            environment=vec_env,
            device=device,
        )
    )

    return model, reinforcer
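# A minimal sketch of how the (model, reinforcer) pair returned by get_ppo (or
# get_ddpg) could be driven, mirroring the hand-rolled loops in the test
# functions later in this section. The function name, optimizer hyperparameters,
# frame budget and epoch count below are illustrative assumptions, not values
# taken from this codebase.
def run_half_cheetah_ppo():
    device = torch.device('cuda:0')
    seed = 1002

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    vec_env = DummyVecEnvWrapper(MujocoEnv('HalfCheetah-v2')).instantiate(parallel_envs=1, seed=seed)
    model, reinforcer = get_ppo(vec_env, device)

    # Model optimizer (learning rate is an assumed value)
    optimizer = optim.Adam(reinforcer.model.parameters(), lr=2.5e-4, eps=1e-4)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[FrameTracker(100_000)]  # Track how many frames have been processed
    )

    # Training initialization bookkeeping
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    num_epochs = 10  # Assumed training budget

    # Hand-rolled training loop, 100 batches per epoch to average metrics nicely
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(
            training_info=training_info,
            global_epoch_idx=i,
            batches_per_epoch=100,
            optimizer=optimizer
        )

        reinforcer.train_epoch(epoch_info)

    training_info.on_train_end()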
def half_cheetah_ddpg():
    device = torch.device('cuda:0')
    seed = 1002

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    env = MujocoEnv('HalfCheetah-v2').instantiate(seed=seed)

    model_factory = DeterministicPolicyModelFactory(
        policy_backbone=MLPFactory(input_length=17, hidden_layers=[64, 64], activation='tanh'),
        value_backbone=MLPFactory(input_length=23, hidden_layers=[64, 64], activation='tanh'),
    )
    model = model_factory.instantiate(action_space=env.action_space)

    reinforcer = BufferedSingleOffPolicyIterationReinforcer(
        device=device,
        settings=BufferedSingleOffPolicyIterationReinforcerSettings(
            batch_rollout_rounds=100,
            batch_training_rounds=50,
            batch_size=64,
            discount_factor=0.99
        ),
        environment=env,
        model=model,
        algo=DeepDeterministicPolicyGradient(
            model_factory=model_factory,
            tau=0.01,
        ),
        env_roller=DequeReplayRollerOuNoise(
            environment=env,
            device=device,
            batch_size=64,
            buffer_capacity=1_000_000,
            buffer_initial_size=2_000,
            noise_std_dev=0.2,
            normalize_observations=True,
            normalize_returns=True,
            discount_factor=0.99
        )
    )
def test_trpo_bipedal_walker():
    """ 1 iteration of TRPO on bipedal walker """
    device = torch.device('cpu')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    vec_env = DummyVecEnvWrapper(MujocoEnv('BipedalWalker-v2'), normalize=True).instantiate(parallel_envs=8, seed=seed)

    # Again, use a helper to create a model.
    # Because the model is owned by the reinforcer, it should not be accessed through this
    # variable but through the reinforcer.model property
    model_factory = PolicyGradientModelSeparateFactory(
        policy_backbone=MLPFactory(input_length=24, hidden_layers=[32, 32]),
        value_backbone=MLPFactory(input_length=24, hidden_layers=[32])
    )

    # Reinforcer - an object managing the learning process
    reinforcer = OnPolicyIterationReinforcer(
        device=device,
        settings=OnPolicyIterationReinforcerSettings(discount_factor=0.99),
        model=model_factory.instantiate(action_space=vec_env.action_space),
        algo=TrpoPolicyGradient(
            max_kl=0.01,
            cg_iters=10,
            line_search_iters=10,
            improvement_acceptance_ratio=0.1,
            cg_damping=0.1,
            vf_iters=5,
            entropy_coef=0.0,
            max_grad_norm=0.5,
        ),
        env_roller=StepEnvRoller(
            environment=vec_env,
            device=device,
            number_of_steps=12,
            discount_factor=0.99,
        )
    )

    # Model optimizer
    optimizer = optim.Adam(reinforcer.model.parameters(), lr=1.0e-3, eps=1e-4)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[FrameTracker(100_000)]  # Track how many frames have been processed
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # A single epoch with a single batch is enough for this smoke test
    num_epochs = 1

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(
            training_info=training_info,
            global_epoch_idx=i,
            batches_per_epoch=1,
            optimizer=optimizer
        )

        reinforcer.train_epoch(epoch_info, interactive=False)

    training_info.on_train_end()
def test_ddpg_bipedal_walker():
    """ 1 iteration of DDPG on bipedal walker environment """
    device = torch.device('cpu')
    seed = 1001

    # Set random seed in python std lib, numpy and pytorch
    set_seed(seed)

    # Only a single (non-vectorized) environment for DDPG
    env = MujocoEnv('BipedalWalker-v2').instantiate(seed=seed)

    # Again, use a helper to create a model.
    # Because the model is owned by the reinforcer, it should not be accessed through this
    # variable but through the reinforcer.model property
    model_factory = DeterministicPolicyModelFactory(
        policy_backbone=MLPFactory(input_length=24, hidden_layers=[64, 64], normalization='layer'),
        # Critic input: 24-dim observation concatenated with 4-dim action = 28
        value_backbone=MLPFactory(input_length=28, hidden_layers=[64, 64], normalization='layer')
    )

    # Reinforcer - an object managing the learning process
    reinforcer = BufferedSingleOffPolicyIterationReinforcer(
        device=device,
        settings=BufferedSingleOffPolicyIterationReinforcerSettings(
            batch_rollout_rounds=4,
            batch_training_rounds=1,
            batch_size=32,
            discount_factor=0.99
        ),
        environment=env,
        algo=DeepDeterministicPolicyGradient(
            model_factory=model_factory,
            tau=0.01,
            max_grad_norm=0.5
        ),
        model=model_factory.instantiate(action_space=env.action_space),
        env_roller=DequeReplayRollerOuNoise(
            environment=env,
            device=device,
            batch_size=32,
            buffer_capacity=100,
            buffer_initial_size=100,
            noise_std_dev=0.2,
            normalize_observations=True,
            discount_factor=0.99
        )
    )

    # Model optimizer
    optimizer = optim.Adam(reinforcer.model.parameters(), lr=2.5e-4, eps=1e-4)

    # Overall information store for training information
    training_info = TrainingInfo(
        metrics=[
            EpisodeRewardMetric('episode_rewards'),  # Calculate average reward from episode
        ],
        callbacks=[FrameTracker(100_000)]  # Track how many frames have been processed
    )

    # A bit of training initialization bookkeeping...
    training_info.initialize()
    reinforcer.initialize_training(training_info)
    training_info.on_train_begin()

    # A single epoch with a single batch is enough for this smoke test
    num_epochs = 1

    # Normal handrolled training loop
    for i in range(1, num_epochs + 1):
        epoch_info = EpochInfo(
            training_info=training_info,
            global_epoch_idx=i,
            batches_per_epoch=1,
            optimizer=optimizer
        )

        reinforcer.train_epoch(epoch_info, interactive=False)

    training_info.on_train_end()