def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. 
(Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
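# --- Illustrative sketch (not part of the original file): the clipped surrogate
# objective computed in compute_loss_pi above, shown on made-up tensors so the
# clipping behaviour is easy to inspect. Only `torch` is assumed; names and
# values are demo placeholders.
import torch

clip_ratio_demo = 0.2
logp_old_demo = torch.tensor([-1.20, -0.80, -2.10])   # log-probs under the old policy
logp_new_demo = torch.tensor([-1.00, -0.95, -1.90])   # log-probs under the current policy
adv_demo = torch.tensor([0.50, -0.30, 1.20])          # advantage estimates

ratio_demo = torch.exp(logp_new_demo - logp_old_demo)  # pi_new(a|s) / pi_old(a|s)
clip_adv_demo = torch.clamp(ratio_demo, 1 - clip_ratio_demo, 1 + clip_ratio_demo) * adv_demo
# Maximizing the clipped surrogate == minimizing its negative mean.
loss_pi_demo = -(torch.min(ratio_demo * adv_demo, clip_adv_demo)).mean()
print(loss_pi_demo.item())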
def ppo(env, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=2048, epochs=250, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=100, train_v_iters=70, lam=0.95, max_ep_len=512, target_kl=0.005, logger_kwargs=dict(), save_freq=5): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. 
(Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env( "PandaPegIn", has_offscreen_renderer=True, # has_renderer=True, use_camera_obs=False, control_freq=100, ) obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Load a pretrained model # fname = "data/ppo_peg_in_add_delta_pos_plus_plus/ppo_peg_in_add_delta_pos_plus_plus_s0/pyt_save/model24.pt" # pre_model = torch.load(fname) # ac.pi = pre_model.pi # ac.v = pre_model.v # Use TensorboardX writer = logger.create_writer() # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp'] # Policy loss pi, logp = ac.pi(obs, act) # Only the pi network changes between iterations; data['obs'], data['act'], data['adv'], data['logp'] stay fixed within an epoch ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # loss_pi is to be minimized; pi_info holds the KL divergence, entropy, and clip fraction (all for the current epoch) # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() # data is refreshed once per update pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): # Reduce the policy loss as far as possible while the KL divergence stays within bounds pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log('Early stopping at step %d due to reaching max kl.'
% i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # print(i, ':', loss_v) # print('='*20) # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Move to the initial position pre_action = [0, 0, 0] for i in range(4): o, _, _, _ = env.step(pre_action) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): print("epoch:", epoch) for t in range(local_steps_per_epoch): # if( t == steps_per_epoch/2 ): # print("Half finished!") # Use the policy and value networks to compute the action, the value estimate, and the log-prob of the chosen action a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r # return of the current episode ep_len += 1 # length of the current episode # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: # episode terminated; episode hit the time limit; epoch ended if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) # compute GAE advantages and rewards-to-go # print("steps:", t) # print("done", d) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, epoch) # Perform PPO update! update() # Write data to TensorboardX stats_to_write = logger.get_stats('EpRet') writer.add_scalar('AverageEpRet', stats_to_write[0], global_step=(epoch + 1) * steps_per_epoch) # Log info about epoch logger.log_tabular('Epoch', epoch) # epoch index logger.log_tabular('EpRet', with_min_and_max=True) # max/min/mean episode return logger.log_tabular('EpLen', average_only=True) # mean episode length logger.log_tabular('VVals', with_min_and_max=True) # max/min/mean value estimates logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) # total environment steps so far logger.log_tabular('LossPi', average_only=True) # policy loss at the start of the update logger.log_tabular('LossV', average_only=True) # value loss at the start of the update logger.log_tabular('DeltaLossPi', average_only=True) # policy loss after the update minus before logger.log_tabular('DeltaLossV', average_only=True) # value loss after the update minus before logger.log_tabular('Entropy', average_only=True) # policy entropy
logger.log_tabular('KL', average_only=True) # approximate KL divergence between old and new policies logger.log_tabular('ClipFrac', average_only=True) # fraction of clipped probability ratios logger.log_tabular('StopIter', average_only=True) # number of policy gradient steps actually taken logger.log_tabular('Time', time.time() - start_time) # elapsed wall-clock time logger.dump_tabular() # if __name__ == '__main__': # import argparse # parser = argparse.ArgumentParser() # parser.add_argument('--env', type=str, default='HalfCheetah-v2') # parser.add_argument('--hid', type=int, default=64) # parser.add_argument('--l', type=int, default=2) # parser.add_argument('--gamma', type=float, default=0.99) # parser.add_argument('--seed', '-s', type=int, default=0) # parser.add_argument('--cpu', type=int, default=1) # parser.add_argument('--steps', type=int, default=4000) # parser.add_argument('--epochs', type=int, default=50) # parser.add_argument('--exp_name', type=str, default='ppo') # args = parser.parse_args() # mpi_fork(args.cpu) # run parallel code with mpi # from spinup.utils.run_utils import setup_logger_kwargs # logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) # ppo(lambda : gym.make(args.env), actor_critic=core.MLPActorCritic, # ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), gamma=args.gamma, # seed=args.seed, steps_per_epoch=args.steps, epochs=args.epochs, # logger_kwargs=logger_kwargs)
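# --- Illustrative sketch (not part of the original file): the TensorBoard logging
# pattern used by the variant above, assuming logger.create_writer() returns a
# tensorboardX SummaryWriter (or any object with the same add_scalar interface).
# The log directory, epoch count, and return values are placeholders.
from tensorboardX import SummaryWriter

writer_demo = SummaryWriter('runs/ppo_demo')  # hypothetical log directory
for epoch_demo, avg_ep_ret_demo in enumerate([10.0, 12.5, 15.2]):
    # Mirrors writer.add_scalar('AverageEpRet', ...) in the training loop,
    # using total environment steps as the global step.
    writer_demo.add_scalar('AverageEpRet', avg_ep_ret_demo,
                           global_step=(epoch_demo + 1) * 2048)
writer_demo.close()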
def __init__(self, env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=None, save_freq=10, train_graph_path='/home/visgean/', train_graph_name='return.svg', model=None): self.actor_critic = actor_critic self.ac_kwargs = ac_kwargs self.seed = seed self.steps_per_epoch = steps_per_epoch self.epochs = epochs self.gamma = gamma self.clip_ratio = clip_ratio self.pi_lr = pi_lr self.vf_lr = vf_lr self.train_pi_iters = train_pi_iters self.train_v_iters = train_v_iters self.lam = lam self.max_ep_len = max_ep_len self.target_kl = target_kl self.logger_kwargs = logger_kwargs if logger_kwargs else {} self.save_freq = save_freq # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration self.logger = EpochLoggerFixed(**self.logger_kwargs) self.logger.save_config(locals()) # Random seed self.seed += 10000 * proc_id() torch.manual_seed(self.seed) np.random.seed(self.seed) # Instantiate environment self.env = env_fn() self.obs_dim = self.env.observation_space.shape self.act_dim = self.env.action_space.shape # Create actor-critic module if model: self.ac = model else: self.ac = actor_critic(self.env.observation_space, self.env.action_space, **ac_kwargs) # Sync params across processes sync_params(self.ac) # Count variables self.var_counts = tuple( core.count_vars(module) for module in [self.ac.pi, self.ac.v]) self.logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % self.var_counts) # Set up experience buffer self.local_steps_per_epoch = int(steps_per_epoch / num_procs()) self.buf = PPOBuffer(self.obs_dim, self.act_dim, self.local_steps_per_epoch, gamma, lam) # Set up optimizers for policy and value function self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=pi_lr) self.vf_optimizer = Adam(self.ac.v.parameters(), lr=vf_lr) # Set up model saving self.logger.setup_pytorch_saver(self.ac) # Prepare for interaction with environment self.start_time = time.time() self.obs = self.env.reset() self.ep_ret = 0 self.ep_len = 0 self.test_returns = [] self.train_returns = [] self.max_return = 0 self.test_lengths = [] self.train_graph_path = train_graph_path + f'{proc_id()}_{train_graph_name}'
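# --- Illustrative sketch (not part of the original file): the GAE-lambda advantage
# and reward-to-go computation that a PPOBuffer's finish_path() typically performs
# with the gamma/lam values stored above. The helper name and all numbers are
# demo placeholders, not the buffer's actual implementation; only numpy is assumed.
import numpy as np

def discount_cumsum_demo(x, discount):
    # Discounted cumulative sum, computed right-to-left: y[t] = x[t] + discount * y[t+1]
    y = np.zeros_like(x, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        y[t] = running
    return y

gamma_demo, lam_demo = 0.99, 0.97
rews_demo = np.array([1.0, 0.0, 1.0, 0.0])        # rewards along one trajectory
vals_demo = np.array([0.8, 0.7, 0.9, 0.4, 0.0])   # value estimates; final 0.0 = terminal state, no bootstrap

# TD residuals: delta[t] = r[t] + gamma * V(s[t+1]) - V(s[t])
deltas_demo = rews_demo + gamma_demo * vals_demo[1:] - vals_demo[:-1]
adv_demo = discount_cumsum_demo(deltas_demo, gamma_demo * lam_demo)  # GAE-lambda advantages
ret_demo = discount_cumsum_demo(rews_demo, gamma_demo)               # rewards-to-go (value targets)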
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, resume=None, reinitialize_optimizer_on_resume=True, render=False, notes='', env_config=None, boost_explore=0, partial_net_load=False, num_inputs_to_add=0, episode_cull_ratio=0, try_rollouts=0, steps_per_try_rollout=0, take_worst_rollout=False, shift_advs_pct=0, **kwargs): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. 
train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. resume (str): Path to directory with simple_save model info you wish to resume from reinitialize_optimizer_on_resume: (bool) Whether to initialize training state in the optimizers, i.e. the individual learning rates for weights in Adam render: (bool) Whether to render the env during training. Useful for checking that resumption of training caused visual performance to carry over notes: (str) Experimental notes on what this run is testing env_config (dict): Environment configuration pass through boost_explore (float): Amount to increase std of actions in order to reinvigorate exploration. partial_net_load (bool): Whether to partially load the network when resuming. https://pytorch.org/tutorials/beginner/saving_loading_models.html#id4 num_inputs_to_add (int): Number of new inputs to add, if resuming and partially loading a new network. episode_cull_ratio (float): Ratio of bad episodes to cull from epoch try_rollouts (int): Number of times to sample actions steps_per_try_rollout (int): Number of steps per attempted rollout take_worst_rollout (bool): Use worst rollout in training shift_advs_pct (float): Action should be better than this pct of actions to be considered advantageous. """ config = deepcopy(locals()) # Special function to avoid certain slowdowns from PyTorch + MPI combo. 
setup_pytorch_for_mpi() # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) import_custom_envs() # Instantiate environment env = env_fn() if hasattr(env.unwrapped, 'configure_env'): env.unwrapped.configure_env(env_config) obs_dim = env.observation_space.shape act_dim = env.action_space.shape num_agents = getattr(env, 'num_agents', 1) if hasattr(env.unwrapped, 'logger'): print('Logger set by environment') logger_kwargs['logger'] = env.unwrapped.logger logger = EpochLogger(**logger_kwargs) logger.add_key_stat('won') logger.add_key_stat('trip_pct') logger.add_key_stat('HorizonReturn') logger.save_config(config) # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, num_inputs_to_add=num_inputs_to_add, **ac_kwargs) # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Resume if resume is not None: ac, pi_optimizer, vf_optimizer = get_model_to_resume( resume, ac, pi_lr, vf_lr, reinitialize_optimizer_on_resume, actor_critic, partial_net_load, num_inputs_to_add) if num_inputs_to_add: add_inputs(ac, ac_kwargs, num_inputs_to_add) if boost_explore: boost_exploration(ac, boost_explore) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = ppo_buffer_factory(obs_dim, act_dim, local_steps_per_epoch, gamma, lam, num_agents, shift_advs_pct, cull_ratio=episode_cull_ratio) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1-clip_ratio, 1+clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1+clip_ratio) | ratio.lt(1-clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Sync params across processes sync_params(ac) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log('Early stopping at step %d due to reaching max kl.'%i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for 
interaction with environment start_time = time.time() o, r, d = reset(env) effective_horizon = round(1 / (1 - gamma)) effective_horizon_rewards = [] for _ in range(num_agents): effective_horizon_rewards.append(deque(maxlen=effective_horizon)) if hasattr(env, 'agent_index'): agent_index = env.agent_index agent = env.agents[agent_index] is_multi_agent = True else: agent_index = 0 agent = None is_multi_agent = False def get_action_fn(_obz): return ac.step(torch.as_tensor(_obz, dtype=torch.float32)) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): epoch_episode = 0 info = {} epoch_ended = False step_num = 0 ep_len = 0 ep_ret = 0 while not epoch_ended: if try_rollouts != 0: # a, v, logp, next_o, r, d, info # a, v, logp, obs, r, done, info rollout = do_rollouts( get_action_fn, env, o, steps_per_try_rollout, try_rollouts, take_worst_rollout) else: a, v, logp = get_action_fn(o) # NOTE: For multi-agent, steps current agent, # but returns values for next agent (from its previous action)! # TODO: Just return multiple agents observations next_o, r, d, info = env.step(a) if render: env.render() curr_reward = env.curr_reward if is_multi_agent else r # save and log buf.store(o, a, curr_reward, v, logp, agent_index) logger.store(VVals=v) # Update obs (critical!) o = next_o if 'stats' in info and info['stats']: # TODO: Optimize this logger.store(**info['stats']) if is_multi_agent: agent_index = env.agent_index agent = env.agents[agent_index] # TODO: Store vector of these for each agent when changing step API ep_len = agent.episode_steps ep_ret = agent.episode_reward else: ep_len += 1 ep_ret += r calc_effective_horizon_reward( agent_index, effective_horizon_rewards, logger, r) timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = buf.epoch_ended(step_num) if terminal or epoch_ended: if epoch_ended and not terminal: print('Warning: trajectory cut off by epoch at %d steps.'%ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(agent_index, v) if terminal: buf.record_episode(ep_len=ep_len, ep_ret=ep_ret, step_num=step_num) # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) if 'stats' in info and info['stats'] and info['stats']['done_only']: logger.store(**info['stats']['done_only']) o, r, d = reset(env) if not is_multi_agent: ep_len = 0 ep_ret = 0 step_num += 1 buf.prepare_for_update() # Perform PPO update! 
update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('DateTime', get_date_str()) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.log_tabular('HorizonReturn', with_min_and_max=True) if getattr(env.unwrapped, 'is_deepdrive', False): logger.log_tabular('trip_pct', with_min_and_max=True) logger.log_tabular('collided') logger.log_tabular('harmful_gs') logger.log_tabular('timeup') logger.log_tabular('exited_lane') logger.log_tabular('circles') logger.log_tabular('skipped') logger.log_tabular('backwards') logger.log_tabular('won') if 'stats' in info and info['stats']: for stat, value in info['stats'].items(): logger.log_tabular(stat, with_min_and_max=True) if logger.best_category or (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state(dict(env=env), pytorch_save=dict( ac=ac.state_dict(), pi_optimizer=pi_optimizer.state_dict(), vf_optimizer=vf_optimizer.state_dict(), epoch=epoch, stats=logger.epoch_dict, ), itr=None, best_category=logger.best_category) logger.dump_tabular()
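# --- Illustrative sketch (not part of the original file): saving and restoring the
# actor-critic and both optimizers with state_dict(), mirroring the pytorch_save
# dict assembled in logger.save_state above and the reinitialize_optimizer_on_resume
# option. The helper names and default path are hypothetical; only standard
# torch.save / torch.load calls are used.
import torch

def save_checkpoint_demo(ac, pi_optimizer, vf_optimizer, epoch, path='checkpoint_demo.pt'):
    torch.save({
        'ac': ac.state_dict(),
        'pi_optimizer': pi_optimizer.state_dict(),
        'vf_optimizer': vf_optimizer.state_dict(),
        'epoch': epoch,
    }, path)

def load_checkpoint_demo(ac, pi_optimizer, vf_optimizer, path='checkpoint_demo.pt',
                         reinitialize_optimizer_on_resume=False):
    ckpt = torch.load(path)
    ac.load_state_dict(ckpt['ac'])
    if not reinitialize_optimizer_on_resume:
        # Also restore per-parameter Adam statistics; otherwise keep the fresh optimizers.
        pi_optimizer.load_state_dict(ckpt['pi_optimizer'])
        vf_optimizer.load_state_dict(ckpt['vf_optimizer'])
    return ckpt['epoch']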
def ppo(env_fn, actor_critic=MLPActorCritic, ac_kwargs={}, seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. 
(Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ torch.manual_seed(10) np.random.seed(10) random.seed(10) # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Instantiate environment env = env_fn() # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs).cuda() # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOTrajectory(gamma, lam) training_queue = TrainingQueue(200) # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 num_training = 15 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): training_data = training_queue.get_batch(num_training) num_training = len(training_data['act']) o = torch.tensor(o).float().unsqueeze(0) if num_training > 0: o = torch.cat([o, training_data['obs']]) o = o.cuda() pi = ac.pi._distribution(o) a = pi.sample() if num_training > 0: a[-num_training:] = training_data['act'] logp = ac.pi._log_prob_from_distribution(pi, a) v = ac.v(o) if num_training > 0: run_update(logp[-num_training:], v[-num_training:], training_data['ret'].cuda(), training_data['adv'].cuda(), training_data['logp'].cuda(), pi_optimizer, vf_optimizer, clip_ratio, logger) a = a[:len(a) - num_training].cpu().item() o = o[:len(o) - num_training].cpu().numpy().squeeze() v = v[:len(v) - num_training].cpu().item() logp = logp[:len(logp) - num_training].cpu().item() next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o num_training = 15 timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' 
% ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: v = ac.v( torch.as_tensor(o, dtype=torch.float32).unsqueeze( 0).cuda()).cpu().detach().item() else: v = 0 trajectory = buf.finish_path(v) training_queue.put_batch(trajectory) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) # logger.log_tabular('LossPi', average_only=True) # logger.log_tabular('LossV', average_only=True) # logger.log_tabular('DeltaLossPi', average_only=True) # logger.log_tabular('DeltaLossV', average_only=True) # logger.log_tabular('Entropy', average_only=True) # logger.log_tabular('KL', average_only=True) # logger.log_tabular('ClipFrac', average_only=True) # logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
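# --- Illustrative sketch (not part of the original file): the bootstrapped return
# target used when a trajectory is cut off by max_ep_len or the epoch boundary
# rather than ending in a true terminal state (the `v = ac.v(...)` vs `v = 0`
# branch above). Numbers are made up; only numpy is assumed.
import numpy as np

gamma_demo = 0.99
rews_demo = np.array([0.1, 0.2, 0.3])  # rewards collected before the cut-off
last_val_demo = 0.5                    # V(s_T) from the critic if cut off; 0.0 if truly terminal

rets_demo = np.zeros_like(rews_demo)
running = last_val_demo
for t in reversed(range(len(rews_demo))):
    # Reward-to-go with the critic's estimate standing in for the unobserved tail.
    running = rews_demo[t] + gamma_demo * running
    rets_demo[t] = running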