def reinforce(env_fn,
              reinforce_policy=core.MLPPolicy,
              policy_kwargs=dict(),
              seed=0,
              steps_per_epoch=4000,
              epochs=50,
              pi_lr=3e-4,
              max_ep_len=1000,
              logger_kwargs=dict(),
              save_freq=10):
    """
    REINFORCE

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        reinforce_policy: The constructor method for a PyTorch Module with a
            ``step`` method, an ``act`` method, and a ``pi`` module.
            The ``step`` method should accept a batch of observations and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each
                                           | observation.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided
                                           | observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing
                                           | the log probability, according to
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

        policy_kwargs (dict): Any kwargs appropriate for the policy object
            you provided to REINFORCE.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        pi_lr (float): Learning rate for policy optimizer.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy.

    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create policy module policy = reinforce_policy(env.observation_space, env.action_space, **policy_kwargs) # Sync params across processes sync_params(policy) # Count variables var_counts = tuple(core.count_vars(module) for module in [policy.pi]) logger.log('\nNumber of parameters: \t pi: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = REINFORCEBuffer(obs_dim, act_dim, local_steps_per_epoch) # Set up function for computing REINFORCE policy loss def compute_loss_pi(data): obs, act, ret, logp_old = data['obs'], data['act'], data['ret'], data['logp'] # Policy loss pi, logp = policy.pi(obs, act) loss_pi = -(logp * ret).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() pi_info = dict(kl=approx_kl, ent=ent) return loss_pi, pi_info # Set up optimizers for policy and value function pi_optimizer = Adam(policy.pi.parameters(), lr=pi_lr) # Set up model saving logger.setup_pytorch_saver(policy) def update(): data = buf.get() # Get loss and info values before update pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() # Train policy with a single step of gradient descent pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() mpi_avg_grads(policy.pi) # average grads across MPI processes pi_optimizer.step() # Log changes from update kl, ent = pi_info['kl'], pi_info_old['ent'] logger.store(LossPi=pi_l_old, KL=kl, Entropy=ent, DeltaLossPi=(loss_pi.item() - pi_l_old)) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, logp = policy.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, logp) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch-1 if terminal or epoch_ended: if epoch_ended and not terminal: print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) buf.finish_path() if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Perform REINFORCE update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
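# ---------------------------------------------------------------------------
# Illustrative sketch (not the library's core.MLPPolicy): a minimal
# discrete-action policy that satisfies the ``step`` / ``act`` / ``pi``
# interface documented in `reinforce` above. The class names, hidden size,
# and use of a categorical distribution are assumptions made for the example.
# ---------------------------------------------------------------------------
import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical


class _SketchCategoricalActor(nn.Module):
    """``pi`` module: maps observations to a Categorical action distribution."""

    def __init__(self, obs_dim, n_acts, hidden=64):
        super().__init__()
        self.logits_net = nn.Sequential(nn.Linear(obs_dim, hidden), nn.Tanh(),
                                        nn.Linear(hidden, n_acts))

    def forward(self, obs, act=None):
        pi = Categorical(logits=self.logits_net(obs))
        logp_a = pi.log_prob(act) if act is not None else None
        return pi, logp_a


class _SketchPolicy(nn.Module):
    """Minimal policy exposing ``step``, ``act``, and a ``pi`` submodule."""

    def __init__(self, observation_space, action_space, hidden=64):
        super().__init__()
        self.pi = _SketchCategoricalActor(observation_space.shape[0],
                                          action_space.n, hidden)

    def step(self, obs):
        with torch.no_grad():
            pi, _ = self.pi(obs)
            a = pi.sample()
            logp_a = pi.log_prob(a)
        return a.numpy(), logp_a.numpy()

    def act(self, obs):
        return self.step(obs)[0]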
def her_dqn(env_fn,
            policy=core.MLPQPolicy,
            policy_kwargs=dict(),
            double_Q=True,
            seed=0,
            steps_per_epoch=4000,
            epochs=100,
            replay_size=int(1e6),
            gamma=0.99,
            polyak=0.995,
            q_lr=1e-3,
            batch_size=100,
            start_steps=10000,
            update_after=1000,
            update_every=50,
            num_test_episodes=10,
            eps_start=0.9,
            eps_end=0.05,
            eps_decay=200,
            max_ep_len=1000,
            logger_kwargs=dict(),
            save_freq=1):
    """
    Deep Q-Network (DQN) with Hindsight Experience Replay (HER).

    This implementation uses the `final` goal resampling scheme of the HER
    paper. Works with Discrete GoalEnv environments, e.g., `rlbase:BitFlip-v0`.

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        policy: The constructor method for a PyTorch Module with an ``act``
            method and a ``q_net`` module. The ``act`` method and ``q_net``
            module should accept batches of observations as inputs. When
            called, these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch,)          | Numpy array of (integer) actions,
                                           | one for each observation.
            ``q_net``    (batch, act_dim)  | Tensor containing the current
                                           | estimate of Q for every action at
                                           | the provided observations.
            ===========  ================  ======================================

        policy_kwargs (dict): Any kwargs appropriate for the policy object
            you provided to DQN.

        double_Q (bool): If True, uses double Q-learning.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long
            you wait between updates, the ratio of env steps to gradient steps
            is locked to 1.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        eps_start (float): Starting value of epsilon for epsilon-greedy
            exploration.

        eps_end (float): Ending value of epsilon for epsilon-greedy exploration.

        eps_decay (int): Time constant (in environment steps) for the
            exponential decay of epsilon from eps_start to eps_end.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    assert isinstance(env.action_space, Discrete), \
        "DQN is only implemented for Discrete action spaces."

    obs_dim = env.observation_space['observation'].shape
    act_dim = env.action_space.n

    # Create Q-policy module and target networks
    q_policy = policy(env.observation_space['observation'], env.action_space,
                      **policy_kwargs)
    q_target = deepcopy(q_policy)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in q_target.parameters():
        p.requires_grad = False

    # Experience buffer
    replay_buffer = HERReplayBuffer(obs_dim=obs_dim, size=replay_size)
    episode_buffer = HEREpisodeBuffer()

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [q_policy.q_net])
    logger.log('\nNumber of parameters: \t q: %d\n' % var_counts)

    # Set up function for computing DQN Q loss
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

        # Compute Q(s_t, a_t): q_net computes Q(s_t), then we select the columns
        # of the actions taken. Squeezing q is critical to ensure it has the
        # right shape.
        q = q_policy.q_net(o).gather(1, a.long()).squeeze(-1)

        # Bellman backup for Q function
        with torch.no_grad():
            if not double_Q:
                # expected state-action values of the next state
                q_pi_target = q_target.q_net(o2).max(1)[0].detach()
            else:
                # In double Q-learning the action taken in the next state is
                # chosen based on the policy network instead of the target
                # network. The next action should be unsqueezed to have
                # consistent size inside the gather method.
                next_state_action = q_policy.q_net(o2).argmax(1).unsqueeze(1)
                q_pi_target = q_target.q_net(o2).gather(
                    1, next_state_action.long()).squeeze(-1)
            backup = r + gamma * (1 - d) * q_pi_target

        # MSE loss against Bellman backup
        loss_q = ((q - backup)**2).mean()

        # Useful info for logging
        loss_info = dict(QVals=q.detach().numpy())

        return loss_q, loss_info

    # Set up optimizer for Q function
    q_optimizer = Adam(q_policy.q_net.parameters(), lr=q_lr)

    # Set up model saving
    logger.setup_pytorch_saver(q_policy)

    def update(data):
        # Run one gradient descent step for the Q function
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        logger.store(LossQ=loss_q.item(), **loss_info)

        # Update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(q_policy.parameters(), q_target.parameters()):
                # NB: We use in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(obs, eps_threshold=0.0): sample = random.random() if sample > eps_threshold: a = q_policy.act(torch.as_tensor(obs, dtype=torch.float32)) else: a = env.action_space.sample() return a def test_agent(): for j in range(num_test_episodes): full_o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 o = full_o['observation'] while not (d or (ep_len == max_ep_len)): # choose action deterministically full_o, r, d, _ = test_env.step( get_action(o, eps_threshold=0.0)) o = full_o['observation'] ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() full_o, ep_ret, ep_len = env.reset(), 0, 0 o = full_o['observation'] # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): eps_threshold = eps_end + (eps_start - eps_end) * math.exp( -1. * t / eps_decay) # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration if t > start_steps: a = get_action(o, eps_threshold) else: a = env.action_space.sample() # Step the env full_o2, r, d, _ = env.step(a) o2 = full_o2['observation'] ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Store experience to episode buffer episode_buffer.store(o, a, r, o2, d) # update most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): # Use HER to store additional experience in the replay buffer final_state = episode_buffer.final_state() for data in iter(episode_buffer): new_rew = env.compute_reward(data['obs2'], final_state, None) replay_buffer.store(data['obs'], data['act'], new_rew, data['obs2'], data['done']) logger.store(EpRet=ep_ret, EpLen=ep_len) full_o, ep_ret, ep_len = env.reset(), 0, 0 o = full_o['observation'] episode_buffer = HEREpisodeBuffer() # Update handling if t >= update_after and t % update_every == 0: for _ in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
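# ---------------------------------------------------------------------------
# Illustrative sketch (not the library's HEREpisodeBuffer): a minimal episode
# buffer supporting the interface used in `her_dqn` above -- ``store``,
# ``final_state``, and iteration over per-step dicts -- to make the `final`
# goal-relabelling step concrete. Field names mirror how the buffer is
# consumed when hindsight transitions are added to the replay buffer.
# ---------------------------------------------------------------------------
class _SketchEpisodeBuffer:

    def __init__(self):
        self.steps = []

    def store(self, obs, act, rew, next_obs, done):
        self.steps.append(dict(obs=obs, act=act, rew=rew,
                               obs2=next_obs, done=done))

    def final_state(self):
        # The state achieved at the end of the episode becomes the substitute goal.
        return self.steps[-1]['obs2']

    def __iter__(self):
        return iter(self.steps)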
def td3(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Twin Delayed Deep Deterministic Policy Gradient (TD3) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, these should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``pi`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. 
        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience).
    # Materialized as a list so it can be iterated more than once below.
    q_params = list(itertools.chain(ac.q1.parameters(), ac.q2.parameters()))

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts)

    # Set up function for computing TD3 Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            pi_targ = ac_targ.pi(o2)

            # Target policy smoothing
            epsilon = torch.randn_like(pi_targ) * target_noise
            epsilon = torch.clamp(epsilon, -noise_clip, noise_clip)
            a2 = pi_targ + epsilon
            a2 = torch.clamp(a2, -act_limit, act_limit)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        loss_info = dict(Q1Vals=q1.detach().numpy(),
                         Q2Vals=q2.detach().numpy())

        return loss_q, loss_info

    # Set up function for computing TD3 pi loss
    def compute_loss_pi(data):
        o = data['obs']
        q1_pi = ac.q1(o, ac.pi(o))
        return -q1_pi.mean()

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    q_optimizer = Adam(q_params, lr=q_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data, timer):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **loss_info)

        # Possibly update pi and target networks
        if timer % policy_delay == 0:

            # Freeze Q-networks so you don't waste computational effort
            # computing gradients for them during the policy learning step.
            for p in q_params:
                p.requires_grad = False

            # Next run one gradient descent step for pi.
            pi_optimizer.zero_grad()
            loss_pi = compute_loss_pi(data)
            loss_pi.backward()
            pi_optimizer.step()

            # Unfreeze Q-networks so you can optimize them at the next TD3 step.
            for p in q_params:
                p.requires_grad = True

            # Record things
            logger.store(LossPi=loss_pi.item())

            # Finally, update target networks by polyak averaging.
            with torch.no_grad():
                for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                    # NB: We use in-place operations "mul_", "add_" to update target
                    # params, as opposed to "mul" and "add", which would make new tensors.
p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, noise_scale): a = ac.act(torch.as_tensor(o, dtype=torch.float32)) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy (with some noise, via act_noise). if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch, timer=j) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
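# ---------------------------------------------------------------------------
# Usage sketch (illustrative): how `td3` might be launched on a continuous-
# control Gym task. The environment id, the `hidden_sizes` kwarg for
# core.MLPActorCritic, and the logger's `output_dir` kwarg are assumptions
# made for this example, not guarantees of this file.
# ---------------------------------------------------------------------------
def _example_run_td3():
    import gym
    td3(lambda: gym.make('HalfCheetah-v2'),        # env_fn must build a fresh env copy
        ac_kwargs=dict(hidden_sizes=(256, 256)),   # assumed MLPActorCritic kwarg
        seed=0,
        steps_per_epoch=4000,
        epochs=100,
        logger_kwargs=dict(output_dir='/tmp/td3_halfcheetah'))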
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. 
(Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
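# ---------------------------------------------------------------------------
# Illustrative sketch (not the library's PPOBuffer): how a `finish_path`-style
# routine typically turns one trajectory's rewards and value estimates into
# GAE-Lambda advantages and discounted rewards-to-go, using the `gamma` and
# `lam` parameters documented in `ppo` above. The function name and signature
# are assumptions made for the example.
# ---------------------------------------------------------------------------
def _gae_lambda_sketch(rews, vals, gamma, lam, last_val=0.0):
    """rews: rewards r_0..r_{T-1}; vals: value estimates V(s_0)..V(s_{T-1});
    last_val: bootstrap value for the state after the last step (0 if terminal).
    Returns (advantages, returns) for the trajectory segment."""
    import numpy as np
    rews = np.append(np.asarray(rews, dtype=np.float32), last_val)
    vals = np.append(np.asarray(vals, dtype=np.float32), last_val)

    # TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]

    # GAE-Lambda advantages: A_t = delta_t + gamma * lam * A_{t+1}
    adv = np.zeros_like(deltas)
    running_adv = 0.0
    for t in reversed(range(len(deltas))):
        running_adv = deltas[t] + gamma * lam * running_adv
        adv[t] = running_adv

    # Discounted rewards-to-go (a common target for the value function)
    rets = np.zeros_like(deltas)
    running_ret = last_val
    for t in reversed(range(len(deltas))):
        running_ret = rews[t] + gamma * running_ret
        rets[t] = running_ret

    return adv, rets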