def sync_params(module):
    """Broadcast every parameter of ``module`` to all MPI processes.

    When ``num_procs()`` reports a single process there is nothing to
    synchronise and the function does no work. Otherwise each parameter's
    ``.data`` is converted with ``.numpy()`` and passed to ``broadcast``.

    Args:
        module: Object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
    """
    if num_procs() != 1:
        for param in module.parameters():
            broadcast(param.data.numpy())
def sync_params(module):
    """Synchronise the parameters of ``module`` across MPI processes.

    A no-op when only one process is running (``num_procs() == 1``). For
    multi-process runs, the ``.data`` of each parameter is handed to
    ``broadcast`` as a numpy array obtained via ``.numpy()``.

    Args:
        module: Object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
    """
    single_process = num_procs() == 1
    if single_process:
        return
    for tensor in module.parameters():
        as_numpy = tensor.data.numpy()
        broadcast(as_numpy)
def mpi_avg_grads(module):
    """Average the gradient buffers of ``module`` across MPI processes, in place.

    Returns immediately when only one process is running. Parameters whose
    ``.grad`` is ``None`` (no gradient has been populated for them, e.g. they
    are frozen or did not take part in the last backward pass) are skipped
    rather than raising ``AttributeError`` on ``None.numpy()``.

    Args:
        module: Object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
    """
    if num_procs() == 1:
        return
    for p in module.parameters():
        if p.grad is None:
            # Nothing to average for this parameter.
            # NOTE(review): assumes every rank skips the same parameters, so
            # the per-parameter mpi_avg calls stay in step -- confirm.
            continue
        p_grad_numpy = p.grad.numpy()   # numpy view of tensor data
        avg_p_grad = mpi_avg(p.grad)
        # Write through the view so the tensor's gradient holds the average.
        p_grad_numpy[:] = avg_p_grad[:]
def mpi_avg_grads(module):
    """Replace each gradient of ``module`` with its average over MPI processes.

    Does nothing for single-process runs (``num_procs() == 1``). A parameter
    whose ``.grad`` is ``None`` (frozen, or untouched by the last backward
    pass) is skipped instead of failing with ``AttributeError``.

    Args:
        module: Object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
    """
    if num_procs() == 1:
        return
    for p in module.parameters():
        grad = p.grad
        if grad is None:
            # No gradient buffer exists for this parameter.
            # NOTE(review): assumes all ranks agree on which parameters have
            # gradients, keeping the mpi_avg calls aligned -- confirm.
            continue
        grad_view = grad.numpy()  # numpy view of tensor data
        averaged = mpi_avg(grad)
        # Assign through the view so the gradient tensor itself is updated.
        grad_view[:] = averaged[:]
def setup_pytorch_for_mpi():
    """Cap this process's PyTorch thread count at its share of the total.

    Avoids slowdowns caused by each separate process's PyTorch using more
    than its fair share of CPU resources. If PyTorch already reports a single
    thread nothing is changed; otherwise the thread count is divided by
    ``num_procs()`` (truncated, never below 1) and applied with
    ``torch.set_num_threads``.
    """
    current_threads = torch.get_num_threads()
    if current_threads == 1:
        return
    share = int(current_threads / num_procs())
    torch.set_num_threads(max(share, 1))
def setup_pytorch_for_mpi():
    """Give each MPI process a fair slice of PyTorch's CPU threads.

    Avoids slowdowns caused by each separate process's PyTorch using more
    than its fair share of CPU resources. When more than one thread is
    configured, the count is divided by ``num_procs()``, truncated to an
    integer, clamped to at least 1 and set via ``torch.set_num_threads``.
    """
    threads_before = torch.get_num_threads()
    if threads_before != 1:
        fair_num_threads = max(int(threads_before / num_procs()), 1)
        torch.set_num_threads(fair_num_threads)
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2,
        pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80,
        lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(),
        save_freq=10):
    r"""
    Proximal Policy Optimization (by clipping),

    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with a
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v``
            module. If a class is given it is instantiated with
            ``(observation_space, action_space, **ac_kwargs)``; any non-class
            value is used directly as the actor-critic object.
            The ``step`` method should accept a batch of observations and
            return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns
            ``a``.

            The ``pi`` module's forward call should accept a batch of
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing
                                           | the log probability, according to
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of
            observations and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical:
                                           | make sure to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
            you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action
            pairs) for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy
            objective. Roughly: how far can the new policy go from the old
            policy while still profiting (improving the objective function)?
            The new policy can still go farther than the clip_ratio says, but
            it doesn't help on the objective anymore. (Usually small, 0.1 to
            0.3.) Typically denoted by :math:`\epsilon`.

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration.
    # locals() must be captured here, before any other local is created, so
    # that the saved config contains only the call arguments (and the logger).
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed (offset per process so ranks do not share a random stream)
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Create actor-critic module (or use the ready-made object passed in)
    if inspect.isclass(actor_critic):
        ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    else:
        ac = actor_critic

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = (data['obs'], data['act'], data['adv'],
                                   data['logp'])

        # Policy loss
        pi, logp = ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        loss_pi_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = loss_pi_old.item()
        loss_v_old = compute_loss_v(data)
        v_l_old = loss_v_old.item()

        # Fallbacks so the logging below stays well-defined when
        # train_pi_iters or train_v_iters is 0 (the loops then never bind
        # these names). With at least one iteration they are overwritten.
        i = 0
        loss_pi, pi_info = loss_pi_old, pi_info_old
        loss_v = loss_v_old

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)    # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)    # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update. KL and clip fraction come from the last
        # policy-loss evaluation; entropy is the pre-update value.
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.'
                          % ep_len, flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)   # current state
            logger.save_state({'env': env}, epoch)  # for rendering

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()

    # Close the progress file only if this process actually holds one.
    # NOTE(review): Spinning Up's Logger reportedly leaves output_file as
    # None on non-root MPI ranks, where an unguarded close() would raise
    # AttributeError -- confirm against the logger in use.
    output_file = getattr(logger, 'output_file', None)
    if output_file is not None:
        output_file.close()
def train(self):
    """Run the rollout/update training loop for ``self.epochs`` epochs under MPI.

    The root process (``proc_id() == 0``) first records the parameters through
    ``self.log_params()``. Each process seeds torch and numpy with
    ``10000 * proc_id()`` and collects its share of ``self.steps_per_epoch``
    environment steps per epoch into a ``PPOBuffer``. After every epoch the
    buffer is sampled, ``self.update`` is run on the sample, a metrics dict is
    passed to ``self.log_summary`` and, on the root process, the model is
    saved every ``self.save_freq`` epochs and after the final epoch.

    The per-epoch episode statistics include every path that was finished in
    the buffer, i.e. also a trajectory that was cut off by the end of the
    epoch.
    """
    if proc_id() == 0:
        # Only the root process writes the run parameters.
        self.log_params()

    rank_seed = 10000 * proc_id()
    torch.manual_seed(rank_seed)
    np.random.seed(rank_seed)

    # Each process gathers an equal (truncated) share of the epoch's steps.
    local_steps_per_epoch = int(self.steps_per_epoch / num_procs())
    last_step = local_steps_per_epoch - 1

    obs_dim = self.env.observation_space.shape
    act_dim = self.env.action_space.shape
    replay = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, self.gamma,
                       self.lam)

    pbar = tqdm(range(self.epochs), ncols=100)

    # Initial observation
    o, ep_ret, ep_len = self.env.reset(), 0, 0

    for epoch in pbar:
        episode_lengths, episode_rewards = [], []

        for t in range(local_steps_per_epoch):
            a, v, logp = self.ac.step(torch.as_tensor(o, dtype=torch.float32))
            next_o, r, d, _ = self.env.step(a)
            ep_ret += r
            ep_len += 1

            replay.store(o, a, r, v, logp)
            o = next_o

            timeout = ep_len == self.max_ep_len
            terminal = d or timeout
            epoch_ended = t == last_step

            if not (terminal or epoch_ended):
                continue

            if epoch_ended and not terminal:
                print(
                    f"Warning: trajectory cut off by epoch at {ep_len} steps.",
                    flush=True)

            # Bootstrap the value target unless the environment itself
            # signalled the end of the episode.
            if timeout or epoch_ended:
                _, v, _ = self.ac.step(torch.as_tensor(o, dtype=torch.float32))
            else:
                v = 0
            replay.finish_path(v)

            episode_lengths.append(ep_len)
            episode_rewards.append(ep_ret)
            o, ep_ret, ep_len = self.env.reset(), 0, 0

        data = replay.sample()
        update_stats = self.update(data)
        pi_loss, value_loss, kl_div, entropy, clip_fraction = update_stats

        pbar.set_postfix(
            dict(avg_epsiode_length=f"{np.mean(episode_lengths): .2f}"))

        metrics = {
            "Environment/Episode Length": np.mean(episode_lengths),
            "Environment/Cumulative Reward": np.mean(episode_rewards),
            "Loss/Policy": pi_loss,
            "Loss/Value": value_loss,
            "Metrics/KL Divergence": kl_div,
            "Metrics/Entropy": entropy,
            "Metrics/Clip Fraction": clip_fraction,
        }
        self.log_summary(epoch, metrics)

        if proc_id() == 0:
            is_save_epoch = epoch % self.save_freq == 0
            is_last_epoch = epoch == self.epochs - 1
            if is_save_epoch or is_last_epoch:
                self.save_model()
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2,
        pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80,
        lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(),
        save_freq=10):
    r"""
    Proximal Policy Optimization (by clipping),

    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO. The dict is copied before the
            environment's action space is added under ``'action_space'``, so
            the caller's dict is left untouched.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action
            pairs) for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy
            objective. Roughly: how far can the new policy go from the old
            policy while still profiting (improving the objective function)?
            The new policy can still go farther than the clip_ratio says, but
            it doesn't help on the objective anymore. (Usually small, 0.1 to
            0.3.) Typically denoted by :math:`\epsilon`.

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    # locals() must be captured before any other local is created, so the
    # saved config holds only the call arguments (and the logger).
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Offset the seed per process so ranks do not share a random stream.
    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture.
    # Work on a shallow copy: writing into the argument itself would modify
    # the caller's dict, or the shared default dict that persists across calls.
    ac_kwargs = dict(ac_kwargs)
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)          # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    # a sample estimate for KL-divergence, easy to compute
    approx_kl = tf.reduce_mean(logp_old_ph - logp)
    # a sample estimate for entropy, also easy to compute
    approx_ent = tf.reduce_mean(-logp)
    clipped = tf.logical_or(ratio > (1 + clip_ratio),
                            ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph},
                          outputs={'pi': pi, 'v': v})

    def update():
        inputs = dict(zip(all_phs, buf.get()))
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        i = 0  # keeps StopIter defined when train_pi_iters is 0
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            o2, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            # Update obs (critical!)
            o = o2

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.'
                          % ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = 0 if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
def vpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4,
        vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000,
        logger_kwargs=dict(), save_freq=10):
    """
    Vanilla Policy Gradient

    (with GAE-Lambda for advantage estimation)

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to VPG. The dict is copied before the
            environment's action space is added under ``'action_space'``, so
            the caller's dict is left untouched.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action
            pairs) for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    # locals() must be captured before any other local is created, so the
    # saved config holds only the call arguments (and the logger).
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Offset the seed per process so ranks do not share a random stream.
    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture.
    # Work on a shallow copy: writing into the argument itself would modify
    # the caller's dict, or the shared default dict that persists across calls.
    ac_kwargs = dict(ac_kwargs)
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts)

    # VPG objectives
    pi_loss = -tf.reduce_mean(logp * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    # a sample estimate for KL-divergence, easy to compute
    approx_kl = tf.reduce_mean(logp_old_ph - logp)
    # a sample estimate for entropy, also easy to compute
    approx_ent = tf.reduce_mean(-logp)

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph},
                          outputs={'pi': pi, 'v': v})

    def update():
        inputs = dict(zip(all_phs, buf.get()))
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Policy gradient step
        sess.run(train_pi, feed_dict=inputs)

        # Value function learning
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl],
                                         feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            o2, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            # Update obs (critical!)
            o = o2

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.'%ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = 0 if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform VPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
def gail(env_fn, actor_critic=ActorCritic, ac_kwargs=dict(),
         disc=Discriminator, dc_kwargs=dict(), seed=0, episodes_per_epoch=40,
         epochs=500, gamma=0.99, lam=0.97, pi_lr=3e-3, vf_lr=3e-3,
         dc_lr=5e-4, train_v_iters=80, train_dc_iters=80, max_ep_len=1000,
         logger_kwargs=dict(), save_freq=10):
    """
    Imitation training loop with a student actor-critic, a discriminator and
    a pre-trained expert ("teacher") policy loaded from disk.

    Every epoch the student and then the expert are rolled out in the
    environment, after which the policy, the value function and the
    discriminator are updated. The discriminator's output on the student's
    next observation is stored as the per-step signal ``sdr`` (clipped from
    below at -4).

    Args:
        env_fn : A function which creates a copy of the environment.

        actor_critic: Constructor called as
            ``actor_critic(input_dim=obs_dim[0], **ac_kwargs)``. The result
            must expose ``policy`` and ``value_f`` sub-modules and be callable
            on observations (optionally with actions).

        ac_kwargs (dict): Extra kwargs for ``actor_critic``. The dict is
            copied before the environment's action space is added under
            ``'action_space'``, so the caller's dict is left untouched.

        disc: Constructor called as ``disc(input_dim=obs_dim[0], **dc_kwargs)``.
            The result must expose a ``policy`` sub-module and be callable as
            ``disc(obs, gt=...)``.

        dc_kwargs (dict): Extra kwargs for ``disc``.

        seed (int): Seed for random number generators.

        episodes_per_epoch (int): Number of rollouts per epoch, split evenly
            (truncated) across MPI processes.

        epochs (int): Number of epochs to run.

        gamma (float): Accepted for interface compatibility; not used in this
            function body.

        lam (float): Accepted for interface compatibility; not used in this
            function body.

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        dc_lr (float): Learning rate for discriminator optimizer.

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        train_dc_iters (int): Number of gradient descent steps to take on
            the discriminator per epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger. Must contain
            ``'output_dir'``: the expert is loaded from
            ``<output_dir>/expert_torch_save.pt``.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current state.

    Raises:
        KeyError: If ``logger_kwargs`` has no ``'output_dir'`` entry.
    """

    l_lam = 0  # balance two loss term

    # locals() must be captured here, before further locals are created, so
    # the saved config matches the call arguments (plus l_lam and the logger).
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Offset the seed per process so ranks do not share a random stream.
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Work on a shallow copy: writing into the argument itself would modify
    # the caller's dict, or the shared default dict that persists across calls.
    ac_kwargs = dict(ac_kwargs)
    ac_kwargs['action_space'] = env.action_space

    # Models
    ac = actor_critic(input_dim=obs_dim[0], **ac_kwargs)
    disc = disc(input_dim=obs_dim[0], **dc_kwargs)

    # TODO: Load expert policy here
    # The freshly built module is immediately replaced by the loaded one; the
    # construction is kept so the random stream consumed by parameter
    # initialisation is unchanged.
    expert = actor_critic(input_dim=obs_dim[0], **ac_kwargs)
    expert_name = "expert_torch_save.pt"
    # NOTE: torch.load unpickles the file; only load expert checkpoints from
    # a trusted source.
    expert = torch.load(osp.join(logger_kwargs['output_dir'], expert_name))

    # Buffers
    local_episodes_per_epoch = int(episodes_per_epoch / num_procs())
    buff_s = BufferS(obs_dim[0], act_dim[0], local_episodes_per_epoch,
                     max_ep_len)
    buff_t = BufferT(obs_dim[0], act_dim[0], local_episodes_per_epoch,
                     max_ep_len)

    # Count variables
    var_counts = tuple(
        count_vars(module)
        for module in [ac.policy, ac.value_f, disc.policy])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d, \t d: %d\n' %
               var_counts)

    # Optimizers
    train_pi = torch.optim.Adam(ac.policy.parameters(), lr=pi_lr)
    train_v = torch.optim.Adam(ac.value_f.parameters(), lr=vf_lr)
    train_dc = torch.optim.Adam(disc.policy.parameters(), lr=dc_lr)

    # Parameters Sync
    sync_all_params(ac.parameters())
    sync_all_params(disc.parameters())

    def update(e):
        obs_s, act, adv, ret, lgp_old = [
            torch.Tensor(x) for x in buff_s.retrieve_all()
        ]
        obs_t, _ = [torch.Tensor(x) for x in buff_t.retrieve_all()]

        # Policy
        _, lgp, _ = ac.policy(obs_s, act)
        entropy = (-lgp).mean()

        # Policy loss
        # policy gradient term + entropy term
        pi_loss = -(lgp * adv).mean() - l_lam * entropy

        # Train policy (only once more than 10 epochs have passed)
        if e > 10:
            train_pi.zero_grad()
            pi_loss.backward()
            average_gradients(train_pi.param_groups)
            train_pi.step()

        # Value function
        v = ac.value_f(obs_s)
        v_l_old = F.mse_loss(v, ret)
        for _ in range(train_v_iters):
            v = ac.value_f(obs_s)
            v_loss = F.mse_loss(v, ret)

            # Value function train
            train_v.zero_grad()
            v_loss.backward()
            average_gradients(train_v.param_groups)
            train_v.step()

        # Discriminator
        gt1 = torch.ones(obs_s.size()[0], dtype=torch.int)
        gt2 = torch.zeros(obs_t.size()[0], dtype=torch.int)
        _, lgp_s, _ = disc(obs_s, gt=gt1)
        _, lgp_t, _ = disc(obs_t, gt=gt2)
        dc_loss_old = -lgp_s.mean() - lgp_t.mean()
        for _ in range(train_dc_iters):
            _, lgp_s, _ = disc(obs_s, gt=gt1)
            _, lgp_t, _ = disc(obs_t, gt=gt2)
            dc_loss = -lgp_s.mean() - lgp_t.mean()

            # Discriminator train
            train_dc.zero_grad()
            dc_loss.backward()
            average_gradients(train_dc.param_groups)
            train_dc.step()

        _, lgp_s, _ = disc(obs_s, gt=gt1)
        _, lgp_t, _ = disc(obs_t, gt=gt2)
        dc_loss_new = -lgp_s.mean() - lgp_t.mean()

        # Log the changes.
        # Re-evaluate on the student batch (obs_s); the bare name `obs` would
        # resolve to the enclosing scope's last single rollout observation.
        _, lgp, _, v = ac(obs_s, act)
        entropy_new = (-lgp).mean()
        # The post-update loss uses the post-update entropy.
        pi_loss_new = -(lgp * adv).mean() - l_lam * entropy_new
        v_loss_new = F.mse_loss(v, ret)
        kl = (lgp_old - lgp).mean()
        logger.store(LossPi=pi_loss,
                     LossV=v_l_old,
                     LossDC=dc_loss_old,
                     DeltaLossPi=(pi_loss_new - pi_loss),
                     DeltaLossV=(v_loss_new - v_l_old),
                     DeltaLossDC=(dc_loss_new - dc_loss_old),
                     DeltaEnt=(entropy_new - entropy),
                     Entropy=entropy,
                     KL=kl)

    start_time = time.time()
    o, r, sdr, d, ep_ret, ep_sdr, ep_len = env.reset(), 0, 0, False, 0, 0, 0
    total_t = 0

    for epoch in range(epochs):
        ac.eval()
        disc.eval()
        # We recognize the probability term of index [0] correspond to the
        # teacher's policy

        # Student's policy rollout
        # NOTE(review): the inner loop always runs max_ep_len steps and does
        # not break when an episode ends early -- confirm this is intended
        # for the buffer layout.
        for _ in range(local_episodes_per_epoch):
            for _ in range(max_ep_len):
                obs = torch.Tensor(o.reshape(1, -1))
                a, _, logp_t, v_t = ac(obs)

                # Stores the reward and discriminator signal from the
                # previous step together with the current observation/action.
                buff_s.store(o, a.detach().numpy(), r, sdr, v_t.item(),
                             logp_t.detach().numpy())
                logger.store(VVals=v_t)

                o, r, d, _ = env.step(a.detach().numpy()[0])
                _, sdr, _ = disc(torch.Tensor(o.reshape(1, -1)),
                                 gt=torch.Tensor([0]))
                if sdr < -4:  # Truncate rewards
                    sdr = -4

                ep_ret += r
                ep_sdr += sdr
                ep_len += 1
                total_t += 1

                terminal = d or (ep_len == max_ep_len)
                if terminal:
                    buff_s.end_episode()
                    logger.store(EpRetS=ep_ret, EpLenS=ep_len, EpSdrS=ep_sdr)
                    o, r, sdr, d, ep_ret, ep_sdr, ep_len = (
                        env.reset(), 0, 0, False, 0, 0, 0)

        # Teacher's policy rollout
        for _ in range(local_episodes_per_epoch):
            for _ in range(max_ep_len):
                obs = torch.Tensor(o.reshape(1, -1))
                a, _, _, _ = expert(obs)

                buff_t.store(o, a.detach().numpy(), r)

                o, r, d, _ = env.step(a.detach().numpy()[0])
                ep_ret += r
                ep_len += 1
                total_t += 1

                terminal = d or (ep_len == max_ep_len)
                if terminal:
                    buff_t.end_episode()
                    logger.store(EpRetT=ep_ret, EpLenT=ep_len)
                    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, [ac, disc], None)

        # Update
        ac.train()
        disc.train()
        update(epoch)

        # Log
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRetS', with_min_and_max=True)
        logger.log_tabular('EpSdrS', with_min_and_max=True)
        logger.log_tabular('EpLenS', average_only=True)
        logger.log_tabular('EpRetT', with_min_and_max=True)
        logger.log_tabular('EpLenT', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', total_t)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('LossDC', average_only=True)
        logger.log_tabular('DeltaLossDC', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('DeltaEnt', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
def policyg(env_fn,
            actor_critic=ActorCritic,
            ac_kwargs=dict(),
            seed=0,
            episodes_per_epoch=40,
            epochs=500,
            gamma=0.99,
            lam=0.97,
            pi_lr=3e-4,
            vf_lr=1e-3,
            train_v_iters=80,
            max_ep_len=1000,
            logger_kwargs=dict(),
            save_freq=10):
    """
    Train an actor-critic with a plain policy-gradient loss.

    Every epoch the current policy is rolled out, the transitions are
    stored in a ``BufferA``, and then one policy step followed by
    ``train_v_iters`` value-function regression steps is taken. Gradients
    are passed through ``average_gradients`` before each optimizer step.
    The model is periodically written to ``expert_torch_save.pt``.

    Args:
        env_fn: Callable returning the environment. The environment must
            provide ``reset``, ``step``, ``observation_space`` and
            ``action_space``.

        actor_critic: Constructor called as
            ``actor_critic(input_dim=obs_dim[0], **ac_kwargs)``. The result
            must expose ``policy`` and ``value_f`` sub-modules and, when
            called, return a 4-tuple whose items 0, 2 and 3 are used here
            as action, log-probability and value.

        ac_kwargs (dict): Extra kwargs for ``actor_critic``. The dict is
            copied before ``action_space`` is added, so neither the
            caller's dict nor the shared default is modified.

        seed (int): Base random seed; ``10000 * proc_id()`` is added to it.

        episodes_per_epoch (int): Divided by ``num_procs()`` (truncating)
            to get the number of rollout iterations per process.

        epochs (int): Number of rollout/update cycles.

        gamma (float): Not referenced in this function's body (it is only
            recorded by ``logger.save_config``).

        lam (float): Not referenced in this function's body (it is only
            recorded by ``logger.save_config``).

        pi_lr (float): Adam learning rate for ``ac.policy``.

        vf_lr (float): Adam learning rate for ``ac.value_f``.

        train_v_iters (int): Value-function gradient steps per epoch.

        max_ep_len (int): Step count of each rollout iteration and the
            episode length at which an episode is treated as terminal.

        logger_kwargs (dict): Keyword args for ``EpochLogger``.

        save_freq (int): The model is saved whenever
            ``epoch % save_freq == 0`` and on the last epoch.
    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Work on a copy: writing into `ac_kwargs` directly would leak
    # `action_space` into the shared default dict and the caller's dict.
    ac_kwargs = {**ac_kwargs, 'action_space': env.action_space}

    # Models
    ac = actor_critic(input_dim=obs_dim[0], **ac_kwargs)

    # Buffers
    local_episodes_per_epoch = int(episodes_per_epoch / num_procs())
    buff = BufferA(obs_dim[0], act_dim[0], local_episodes_per_epoch,
                   max_ep_len)

    # Count variables
    var_counts = tuple(
        count_vars(module) for module in [ac.policy, ac.value_f])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Optimizers
    train_pi = torch.optim.Adam(ac.policy.parameters(), lr=pi_lr)
    train_v = torch.optim.Adam(ac.value_f.parameters(), lr=vf_lr)

    # Parameters Sync
    sync_all_params(ac.parameters())

    def update(e):
        """Run one policy step and ``train_v_iters`` value steps; log deltas."""
        obs, act, adv, ret, lgp_old = [
            torch.Tensor(x) for x in buff.retrieve_all()
        ]

        # Policy
        _, lgp, _ = ac.policy(obs, act)
        entropy = (-lgp).mean()

        # Policy loss: policy gradient term only (no entropy term here)
        pi_loss = -(lgp * adv).mean()

        # Train policy
        train_pi.zero_grad()
        pi_loss.backward()
        average_gradients(train_pi.param_groups)
        train_pi.step()

        # Value function
        v = ac.value_f(obs)
        v_l_old = F.mse_loss(v, ret)
        for _ in range(train_v_iters):
            v = ac.value_f(obs)
            v_loss = F.mse_loss(v, ret)

            # Value function train
            train_v.zero_grad()
            v_loss.backward()
            average_gradients(train_v.param_groups)
            train_v.step()

        # Log the changes: re-evaluate with the updated parameters
        _, lgp, _, v = ac(obs, act)
        pi_loss_new = -(lgp * adv).mean()
        v_loss_new = F.mse_loss(v, ret)
        kl = (lgp_old - lgp).mean()
        logger.store(LossPi=pi_loss,
                     LossV=v_l_old,
                     DeltaLossPi=(pi_loss_new - pi_loss),
                     DeltaLossV=(v_loss_new - v_l_old),
                     Entropy=entropy,
                     KL=kl)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_t = 0

    for epoch in range(epochs):
        ac.eval()

        # Policy rollout
        # NOTE(review): there is no `break` after a terminal reset, so each
        # inner loop always takes max_ep_len steps and an episode that ends
        # early is followed by a new one inside the same iteration. Confirm
        # that BufferA is meant to be filled this way.
        for _ in range(local_episodes_per_epoch):
            for _ in range(max_ep_len):
                obs = torch.Tensor(o.reshape(1, -1))
                a, _, lopg_t, v_t = ac(obs)

                # `r` is the reward returned by the previous env.step call
                # (0 right after a reset).
                buff.store(o,
                           a.detach().numpy(), r, v_t.item(),
                           lopg_t.detach().numpy())
                logger.store(VVals=v_t)

                o, r, d, _ = env.step(a.detach().numpy()[0])
                ep_ret += r
                ep_len += 1
                total_t += 1

                terminal = d or (ep_len == max_ep_len)
                if terminal:
                    buff.end_episode()
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger._torch_save(ac, fname="expert_torch_save.pt")

        # Update
        ac.train()
        update(epoch)

        # Log
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', total_t)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
def valor(env_fn, actor_critic=ActorCritic, ac_kwargs=dict(),
          disc=Discriminator, dc_kwargs=dict(), seed=0,
          episodes_per_epoch=40, epochs=50, gamma=0.99, pi_lr=3e-4,
          vf_lr=1e-3, dc_lr=5e-4, train_v_iters=80, train_dc_iters=10,
          train_dc_interv=10, lam=0.97, max_ep_len=1000,
          logger_kwargs=dict(), con_dim=5, save_freq=10, k=1):
    """
    Train a context-conditioned actor-critic together with a discriminator.

    For every rollout a context index is sampled from a uniform
    ``Categorical`` over ``con_dim`` classes, one-hot encoded and
    concatenated to each observation fed to the policy. When an episode
    ends, ``buffer.calc_diff()`` is passed with the context to the
    discriminator and the resulting log-probability is handed to
    ``buffer.end_episode``. The policy is trained on ``k * adv + pos`` and
    the discriminator is trained every ``train_dc_interv`` epochs.

    Args:
        env_fn: Callable returning the environment. The environment must
            provide ``reset``, ``step``, ``observation_space`` and
            ``action_space``.

        actor_critic: Constructor called as
            ``actor_critic(input_dim=obs_dim[0] + con_dim, **ac_kwargs)``.
            The result must expose ``policy`` and ``value_f`` sub-modules
            and, when called, return a 4-tuple whose items 0, 2 and 3 are
            used here as action, log-probability and value.

        ac_kwargs (dict): Extra kwargs for ``actor_critic``. The dict is
            copied before ``action_space`` is added, so neither the
            caller's dict nor the shared default is modified.

        disc: Discriminator constructor called as
            ``disc(input_dim=obs_dim[0], context_dim=con_dim, **dc_kwargs)``.
            The result must expose a ``policy`` sub-module and return a
            3-tuple when called.

        dc_kwargs (dict): Extra kwargs for ``disc``.

        seed (int): Base random seed; ``10000 * proc_id()`` is added to it.

        episodes_per_epoch (int): Divided by ``num_procs()`` (truncating)
            to get the number of rollout iterations per process.

        epochs (int): Number of rollout/update cycles.

        gamma (float): Not referenced in this function's body (it is only
            recorded by ``logger.save_config``).

        pi_lr (float): Adam learning rate for the policy.

        vf_lr (float): Adam learning rate for the value function.

        dc_lr (float): Adam learning rate for ``disc.policy``.

        train_v_iters (int): Value-function gradient steps per epoch.

        train_dc_iters (int): Discriminator gradient steps per
            discriminator update.

        train_dc_interv (int): The discriminator is updated on epochs where
            ``(epoch + 1) % train_dc_interv == 0``; also passed to ``Buffer``.

        lam (float): Not referenced in this function's body (it is only
            recorded by ``logger.save_config``).

        max_ep_len (int): Step count of each rollout iteration and the
            episode length at which an episode is treated as terminal.

        logger_kwargs (dict): Keyword args for ``EpochLogger``.

        con_dim (int): Number of contexts.

        save_freq (int): State is saved whenever ``epoch % save_freq == 0``
            and on the last epoch.

        k (float): Weight on ``adv`` in the policy loss ``k * adv + pos``.
    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Work on a copy: writing into `ac_kwargs` directly would leak
    # `action_space` into the shared default dict and the caller's dict.
    ac_kwargs = {**ac_kwargs, 'action_space': env.action_space}

    # Model (from here on the parameter names refer to the built instances)
    actor_critic = actor_critic(input_dim=obs_dim[0]+con_dim, **ac_kwargs)
    disc = disc(input_dim=obs_dim[0], context_dim=con_dim, **dc_kwargs)

    # Buffer
    local_episodes_per_epoch = int(episodes_per_epoch / num_procs())
    buffer = Buffer(con_dim, obs_dim[0], act_dim[0],
                    local_episodes_per_epoch, max_ep_len, train_dc_interv)

    # Count variables
    var_counts = tuple(
        count_vars(module)
        for module in [actor_critic.policy, actor_critic.value_f, disc.policy])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d, \t d: %d\n'%var_counts)

    # Optimizers
    train_pi = torch.optim.Adam(actor_critic.policy.parameters(), lr=pi_lr)
    train_v = torch.optim.Adam(actor_critic.value_f.parameters(), lr=vf_lr)
    train_dc = torch.optim.Adam(disc.policy.parameters(), lr=dc_lr)

    # Parameters Sync
    sync_all_params(actor_critic.parameters())
    sync_all_params(disc.parameters())

    def update(e):
        """Update policy and value function; update the discriminator on
        epochs where ``(e + 1) % train_dc_interv == 0``."""
        obs, act, adv, pos, ret, logp_old = [
            torch.Tensor(x) for x in buffer.retrieve_all()
        ]

        # Policy
        _, logp, _ = actor_critic.policy(obs, act)
        entropy = (-logp).mean()

        # Policy loss
        # presumably `pos` is the discriminator-derived reward term kept by
        # the buffer -- TODO confirm against Buffer.retrieve_all
        pi_loss = -(logp*(k*adv+pos)).mean()

        # Train policy
        train_pi.zero_grad()
        pi_loss.backward()
        average_gradients(train_pi.param_groups)
        train_pi.step()

        # Value function
        v = actor_critic.value_f(obs)
        v_l_old = F.mse_loss(v, ret)
        for _ in range(train_v_iters):
            v = actor_critic.value_f(obs)
            v_loss = F.mse_loss(v, ret)

            # Value function train
            train_v.zero_grad()
            v_loss.backward()
            average_gradients(train_v.param_groups)
            train_v.step()

        # Discriminator
        if (e+1) % train_dc_interv == 0:
            print('Discriminator Update!')
            con, s_diff = [torch.Tensor(x) for x in buffer.retrieve_dc_buff()]
            _, logp_dc, _ = disc(s_diff, con)
            d_l_old = -logp_dc.mean()

            # Discriminator train
            for _ in range(train_dc_iters):
                _, logp_dc, _ = disc(s_diff, con)
                d_loss = -logp_dc.mean()
                train_dc.zero_grad()
                d_loss.backward()
                average_gradients(train_dc.param_groups)
                train_dc.step()

            _, logp_dc, _ = disc(s_diff, con)
            dc_l_new = -logp_dc.mean()
        else:
            # No discriminator update this epoch: log zeros so the
            # LossDC / DeltaLossDC columns are still populated.
            d_l_old = 0
            dc_l_new = 0

        # Log the changes: re-evaluate with the updated parameters
        _, logp, _, v = actor_critic(obs, act)
        pi_l_new = -(logp*(k*adv+pos)).mean()
        v_l_new = F.mse_loss(v, ret)
        kl = (logp_old - logp).mean()
        logger.store(LossPi=pi_loss, LossV=v_l_old, KL=kl, Entropy=entropy,
                     DeltaLossPi=(pi_l_new-pi_loss),
                     DeltaLossV=(v_l_new-v_l_old),
                     LossDC=d_l_old, DeltaLossDC=(dc_l_new-d_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    # Equal logits -> uniform distribution over the con_dim contexts.
    context_dist = Categorical(logits=torch.Tensor(np.ones(con_dim)))
    total_t = 0

    for epoch in range(epochs):
        actor_critic.eval()
        disc.eval()

        # NOTE(review): there is no `break` after a terminal reset, so each
        # inner loop always takes max_ep_len steps and an episode that ends
        # early is followed by a new one (same context) inside the same
        # iteration. Confirm that Buffer is meant to be filled this way.
        for _ in range(local_episodes_per_epoch):
            c = context_dist.sample()
            c_onehot = F.one_hot(c, con_dim).squeeze().float()
            for _ in range(max_ep_len):
                concat_obs = torch.cat(
                    [torch.Tensor(o.reshape(1, -1)), c_onehot.reshape(1, -1)],
                    1)
                a, _, logp_t, v_t = actor_critic(concat_obs)

                # `r` is the reward returned by the previous env.step call
                # (0 right after a reset).
                buffer.store(c, concat_obs.squeeze().detach().numpy(),
                             a.detach().numpy(), r, v_t.item(),
                             logp_t.detach().numpy())
                logger.store(VVals=v_t)

                o, r, d, _ = env.step(a.detach().numpy()[0])
                ep_ret += r
                ep_len += 1
                total_t += 1

                terminal = d or (ep_len == max_ep_len)
                if terminal:
                    # Score the finished episode with the discriminator and
                    # hand its log-probability to the buffer.
                    dc_diff = torch.Tensor(buffer.calc_diff()).unsqueeze(0)
                    con = torch.Tensor([float(c)]).unsqueeze(0)
                    _, _, log_p = disc(dc_diff, con)
                    buffer.end_episode(log_p.detach().numpy())
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, [actor_critic, disc], None)

        # Update
        actor_critic.train()
        disc.train()
        update(epoch)

        # Log
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', total_t)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('LossDC', average_only=True)
        logger.log_tabular('DeltaLossDC', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time()-start_time)
        logger.dump_tabular()
def trpo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
         steps_per_epoch=4000, epochs=50, gamma=0.99, delta=0.01, vf_lr=1e-3,
         train_v_iters=80, damping_coeff=0.1, cg_iters=10, backtrack_iters=10,
         backtrack_coeff=0.8, lam=0.97, max_ep_len=1000, logger_kwargs=dict(),
         save_freq=10, algo='trpo'):
    """
    Trust Region Policy Optimization

    (with support for Natural Policy Gradient)

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ============  ================  ========================================
            Symbol        Shape             Description
            ============  ================  ========================================
            ``pi``        (batch, act_dim)  | Samples actions from policy given
                                            | states.
            ``logp``      (batch,)          | Gives log probability, according to
                                            | the policy, of taking actions ``a_ph``
                                            | in states ``x_ph``.
            ``logp_pi``   (batch,)          | Gives log probability, according to
                                            | the policy, of the action sampled by
                                            | ``pi``.
            ``info``      N/A               | A dict of any intermediate quantities
                                            | (from calculating the policy or log
                                            | probabilities) which are needed for
                                            | analytically computing KL divergence.
                                            | (eg sufficient statistics of the
                                            | distributions)
            ``info_phs``  N/A               | A dict of placeholders for old values
                                            | of the entries in ``info``.
            ``d_kl``      ()                | A symbol for computing the mean KL
                                            | divergence between the current policy
                                            | (``pi``) and the old policy (as
                                            | specified by the inputs to
                                            | ``info_phs``) over the batch of
                                            | states given in ``x_ph``.
            ``v``         (batch,)          | Gives the value estimate for states
                                            | in ``x_ph``. (Critical: make sure
                                            | to flatten this!)
            ============  ================  ========================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to TRPO. The dict is copied before
            ``action_space`` is added, so the caller's dict is not modified.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        delta (float): KL-divergence limit for TRPO / NPG update.
            (Should be small for stability. Values like 0.01, 0.05.)

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        damping_coeff (float): Artifact for numerical stability, should be
            smallish. Adjusts Hessian-vector product calculation:

            .. math:: Hv \\rightarrow (\\alpha I + H)v

            where :math:`\\alpha` is the damping coefficient.
            Probably don't play with this hyperparameter.

        cg_iters (int): Number of iterations of conjugate gradient to perform.
            Increasing this will lead to a more accurate approximation
            to :math:`H^{-1} g`, and possibly slightly-improved performance,
            but at the cost of slowing things down.

            Also probably don't play with this hyperparameter.

        backtrack_iters (int): Maximum number of steps allowed in the
            backtracking line search. Since the line search usually doesn't
            backtrack, and usually only steps back once when it does, this
            hyperparameter doesn't often matter.

        backtrack_coeff (float): How far back to step during backtracking line
            search. (Always between 0 and 1, usually above 0.5.)

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        algo: Either 'trpo' or 'npg': this code supports both, since they are
            almost the same.

    Raises:
        ValueError: If ``algo`` is neither 'trpo' nor 'npg'.
    """
    # Fail fast: any other value would only surface after a whole epoch of
    # interaction, as an unbound `kl` / `pi_l_new` inside update().
    if algo not in ('trpo', 'npg'):
        raise ValueError("algo must be 'trpo' or 'npg', got %r" % (algo,))

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture.
    # Work on a copy: writing into `ac_kwargs` directly would leak
    # `action_space` into the shared default dict and the caller's dict.
    ac_kwargs = {**ac_kwargs, 'action_space': env.action_space}

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph, plus placeholders for old pdist (for KL)
    pi, logp, logp_pi, info, info_phs, d_kl, v = actor_critic(x_ph, a_ph,
                                                              **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = ([x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]
               + core.values_as_sorted_list(info_phs))

    # Every step, get: action, value, logprob, & info for pdist (for computing kl div)
    get_action_ops = [pi, v, logp_pi] + core.values_as_sorted_list(info)

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    info_shapes = {
        name: ph.shape.as_list()[1:] for name, ph in info_phs.items()
    }
    buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes,
                    gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts)

    # TRPO losses
    ratio = tf.exp(logp - logp_old_ph)          # pi(a|s) / pi_old(a|s)
    pi_loss = -tf.reduce_mean(ratio * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Optimizer for value function
    train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    # Symbols needed for CG solver
    pi_params = core.get_vars('pi')
    gradient = core.flat_grad(pi_loss, pi_params)
    v_ph, hvp = core.hessian_vector_product(d_kl, pi_params)
    if damping_coeff > 0:
        hvp += damping_coeff * v_ph

    # Symbols for getting and setting params
    get_pi_params = core.flat_concat(pi_params)
    set_pi_params = core.assign_params_from_flat(v_ph, pi_params)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def cg(Ax, b):
        """
        Conjugate gradient algorithm
        (see https://en.wikipedia.org/wiki/Conjugate_gradient_method)
        """
        x = np.zeros_like(b)
        # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0.
        # Change if doing warm start.
        r = b.copy()
        p = r.copy()
        r_dot_old = np.dot(r, r)
        for _ in range(cg_iters):
            z = Ax(p)
            alpha = r_dot_old / (np.dot(p, z) + EPS)
            x += alpha * p
            r -= alpha * z
            r_dot_new = np.dot(r, r)
            p = r + (r_dot_new / r_dot_old) * p
            r_dot_old = r_dot_new
        return x

    def update():
        """Take one TRPO/NPG policy step, then fit the value function."""
        # Prepare hessian func, gradient eval
        inputs = dict(zip(all_phs, buf.get()))

        def Hx(x):
            return mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x}))

        g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss],
                                        feed_dict=inputs)
        g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old)

        # Core calculations for TRPO or NPG
        x = cg(Hx, g)
        alpha = np.sqrt(2*delta/(np.dot(x, Hx(x))+EPS))
        old_params = sess.run(get_pi_params)

        def set_and_eval(step):
            sess.run(set_pi_params,
                     feed_dict={v_ph: old_params - alpha * x * step})
            return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))

        if algo == 'npg':
            # npg has no backtracking or hard kl constraint enforcement
            kl, pi_l_new = set_and_eval(step=1.)

        elif algo == 'trpo':
            # trpo augments npg with backtracking line search, hard kl
            for j in range(backtrack_iters):
                kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
                if kl <= delta and pi_l_new <= pi_l_old:
                    logger.log('Accepting new params at step %d of line search.'%j)
                    logger.store(BacktrackIters=j)
                    break

                if j == backtrack_iters-1:
                    logger.log('Line search failed! Keeping old params.')
                    logger.store(BacktrackIters=j)
                    # step=0 restores old_params and re-evaluates kl / loss.
                    kl, pi_l_new = set_and_eval(step=0.)

        # Value function updates
        for _ in range(train_v_iters):
            sess.run(train_vf, feed_dict=inputs)
        v_l_new = sess.run(v_loss, feed_dict=inputs)

        # Log changes from update
        logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            agent_outs = sess.run(get_action_ops,
                                  feed_dict={x_ph: o.reshape(1,-1)})
            a, v_t, logp_t, info_t = (agent_outs[0][0], agent_outs[1],
                                      agent_outs[2], agent_outs[3:])

            o2, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v_t, logp_t, info_t)
            logger.store(VVals=v_t)

            # Update obs (critical!)
            o = o2

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch-1):
                if not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.'%ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = 0 if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1,-1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs-1):
            logger.save_state({'env': env}, None)

        # Perform TRPO or NPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('KL', average_only=True)
        if algo == 'trpo':
            logger.log_tabular('BacktrackIters', average_only=True)
        logger.log_tabular('Time', time.time()-start_time)
        logger.dump_tabular()
def vpg(env, ac_kwargs=None, seed=0, steps_per_epoch=4000, epochs=50,
        gamma=0.99, lam=0.97, max_ep_len=1000, save_freq=10):
    """
    Run a vanilla-policy-gradient training loop and plot the learning curve.

    Each epoch collects ``steps_per_epoch / num_procs()`` steps per process
    into a ``VPGBuffer``, calls ``actor_critic.update(buf)``, and prints the
    mean and std of the finished episodes' returns on process 0. After the
    last epoch, process 0 plots mean return against environment
    interactions with ``plt``.

    Args:
        env: Environment instance providing ``reset``, ``step``,
            ``observation_space`` and ``action_space``.

        ac_kwargs (dict or None): Keyword args for ``ActorCritic``. ``None``
            (the default) is treated as no extra kwargs. The dict is copied
            before ``action_space`` is added, so the caller's dict is not
            modified.

        seed (int): Base random seed; ``10000 * proc_id()`` is added to it.

        steps_per_epoch (int): Total environment steps per epoch across all
            processes.

        epochs (int): Number of collect/update cycles.

        gamma (float): Passed to ``VPGBuffer``.

        lam (float): Passed to ``VPGBuffer``.

        max_ep_len (int): Episode length at which an episode is treated as
            terminal.

        save_freq (int): Not referenced in this function's body.
    """
    seed += 10000 * proc_id()
    tf.random.set_seed(seed)
    np.random.seed(seed)

    # Create actor-critic agent and synchronize it.
    # The default ac_kwargs=None used to fail here with a TypeError on item
    # assignment; treat None as "no extra kwargs" and build a fresh dict.
    ac_kwargs = {**(ac_kwargs or {}), 'action_space': env.action_space}
    actor_critic = ActorCritic(**ac_kwargs)

    # Experience buffer
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Main loop: collect experience in env and update/log each epoch
    # o for observation, r for reward, d for done
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    all_ep_ret = []
    summary_ep_ret = []
    totalEnvInteracts = []
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, logp_t, v_t = actor_critic(o.reshape(1, -1))

            # save and log
            # `r` is the reward returned by the previous env.step call
            # (0 right after a reset).
            a = a.numpy()[0]
            buf.store(o, a, r, v_t, logp_t)

            o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not terminal and proc_id() == 0:
                    print('Warning: trajectory cut off by epoch at %d steps.'
                          % ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                # NOTE(review): v_t was computed from the observation fed to
                # the policy *before* this step, not from the new `o`;
                # confirm this is the intended bootstrap value.
                last_val = r if d else v_t
                buf.finish_path(last_val)
                if terminal:
                    all_ep_ret.append(ep_ret)
                # reset environment
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Perform VPG update!
        actor_critic.update(buf)

        mean, std = mpi_statistics_scalar(all_ep_ret)
        all_ep_ret = []
        if proc_id() == 0:
            print(f'epoch {epoch}: mean {mean}, std {std}')
        summary_ep_ret.append(mean)
        totalEnvInteracts.append((epoch + 1) * steps_per_epoch)

    if proc_id() == 0:
        plt.plot(totalEnvInteracts, summary_ep_ret)
        plt.grid(True)
        plt.show()