def f_loss_kl_impl(need_loss, need_kl):
    retval = dict()
    if need_loss:
        new_dists = policy.compute_dists(all_obs)
        old_dists = all_dists
    elif need_kl:
        # if only kl is needed, compute distribution from sub-sampled data
        new_dists = policy.compute_dists(subsamp_obs)
        old_dists = subsamp_dists

    def compute_surr_loss(old_dists, new_dists, all_acts, all_advs):
        """
        :param old_dists: An instance of subclass of Distribution
        :param new_dists: An instance of subclass of Distribution
        :param all_acts: A chainer variable, which should be a matrix of size N * |A|
        :param all_advs: A chainer variable, which should be a vector of size N
        :return: A chainer variable, which should be a scalar
        """
        surr_loss = Variable(np.array(0.))
        "*** YOUR CODE HERE ***"
        # SOLUTION
        surr_loss = -F.mean(
            new_dists.likelihood_ratio(old_dists, all_acts) * all_advs)
        # END OF SOLUTION
        return surr_loss

    def compute_kl(old_dists, new_dists):
        """
        :param old_dists: An instance of subclass of Distribution
        :param new_dists: An instance of subclass of Distribution
        :return: A chainer variable, which should be a scalar
        """
        kl = Variable(np.array(0.))
        "*** YOUR CODE HERE ***"
        # SOLUTION
        kl = F.mean(old_dists.kl_div(new_dists))
        # END OF SOLUTION
        return kl

    test_once(compute_surr_loss)
    test_once(compute_kl)

    if need_loss:
        retval["surr_loss"] = compute_surr_loss(
            old_dists, new_dists, all_acts, all_advs)
    if need_kl:
        retval["kl"] = compute_kl(old_dists, new_dists)
    return retval
def f_loss_kl_impl(need_loss, need_kl):
    retval = dict()
    if need_loss:
        new_dists = policy.compute_dists(all_obs)
        old_dists = all_dists
    elif need_kl:
        # if only kl is needed, compute distribution from sub-sampled data
        new_dists = policy.compute_dists(subsamp_obs)
        old_dists = subsamp_dists

    def compute_surr_loss(old_dists, new_dists, all_acts, all_advs):
        """
        :param old_dists: An instance of subclass of Distribution
        :param new_dists: An instance of subclass of Distribution
        :param all_acts: A chainer variable, which should be a matrix of size N * |A|
        :param all_advs: A chainer variable, which should be a vector of size N
        :return: A chainer variable, which should be a scalar
        """
        "*** YOUR CODE HERE ***"
        surr_loss = Variable(np.array(0., dtype=np.float32))  # must be float32
        # This implements eq. 8, the surrogate objective formed around the old policy
        likelihood_ratio = new_dists.likelihood_ratio(
            old_dists, all_acts)  # the hint suggests using this method
        surr_loss -= F.mean(likelihood_ratio * all_advs)
        return surr_loss

    def compute_kl(old_dists, new_dists):
        """
        :param old_dists: An instance of subclass of Distribution
        :param new_dists: An instance of subclass of Distribution
        :return: A chainer variable, which should be a scalar
        """
        "*** YOUR CODE HERE ***"
        kl = Variable(np.array(0., dtype=np.float32))  # must be float32
        # The hint points to dist.kl_div, which is defined in utils.py.
        # Mind the argument order: it is KL(old || new), which is easy to get the wrong way round.
        kl += F.mean(old_dists.kl_div(new_dists))
        return kl

    test_once(compute_surr_loss)
    test_once(compute_kl)

    if need_loss:
        retval["surr_loss"] = compute_surr_loss(
            old_dists, new_dists, all_acts, all_advs)
    if need_kl:
        retval["kl"] = compute_kl(old_dists, new_dists)
    return retval
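The argument-order remark in the comments above can be made concrete with a closed-form case. Below is a hedged NumPy sketch (toy parameters only; the lab's Distribution class and kl_div are not used) of the kind of quantity compute_kl averages, for univariate Gaussian policies:

import numpy as np

# KL(old || new) and KL(new || old) for univariate Gaussians; the mu/std values
# are illustrative stand-ins for the old and updated policy parameters.
mu_old, std_old = 0.0, 1.0
mu_new, std_new = 0.1, 0.9

kl_old_new = (np.log(std_new / std_old)
              + (std_old ** 2 + (mu_old - mu_new) ** 2) / (2 * std_new ** 2) - 0.5)
kl_new_old = (np.log(std_old / std_new)
              + (std_new ** 2 + (mu_new - mu_old) ** 2) / (2 * std_old ** 2) - 0.5)

# KL divergence is not symmetric, which is why the argument order matters above.
print(kl_old_new, kl_new_old)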
def f_loss_kl_impl(need_loss, need_kl):
    retval = dict()
    if need_loss:
        new_dists = policy.compute_dists(all_obs)
        old_dists = all_dists
    elif need_kl:
        # if only kl is needed, compute distribution from sub-sampled data
        new_dists = policy.compute_dists(subsamp_obs)
        old_dists = subsamp_dists

    def compute_surr_loss(old_dists, new_dists, all_acts, all_advs):
        """
        :param old_dists: An instance of subclass of Distribution
        :param new_dists: An instance of subclass of Distribution
        :param all_acts: A chainer variable, which should be a matrix of size N * |A|
        :param all_advs: A chainer variable, which should be a vector of size N
        :return: A chainer variable, which should be a scalar
        """
        # Same formula as in part 4, written as a one-liner this time;
        # see the part 4 implementation for a more explicit version.
        return -F.mean(
            new_dists.likelihood_ratio(old_dists, all_acts) * all_advs)

    def compute_kl(old_dists, new_dists):
        """
        :param old_dists: An instance of subclass of Distribution
        :param new_dists: An instance of subclass of Distribution
        :return: A chainer variable, which should be a scalar
        """
        # Uses the helper functions introduced in part 5.2 of the lab (right before part 5.3).
        return F.mean(old_dists.kl_div(new_dists))

    test_once(compute_surr_loss)
    test_once(compute_kl)

    if need_loss:
        retval["surr_loss"] = compute_surr_loss(
            old_dists, new_dists, all_acts, all_advs)
    if need_kl:
        retval["kl"] = compute_kl(old_dists, new_dists)
    return retval
def f_loss_kl_impl(need_loss, need_kl):
    retval = dict()
    if need_loss:
        new_dists = policy.compute_dists(all_obs)
        old_dists = all_dists
    elif need_kl:
        # if only kl is needed, compute distribution from sub-sampled data
        new_dists = policy.compute_dists(subsamp_obs)
        old_dists = subsamp_dists

    def compute_surr_loss(old_dists, new_dists, all_acts, all_advs):
        """
        :param old_dists: An instance of subclass of Distribution
        :param new_dists: An instance of subclass of Distribution
        :param all_acts: A chainer variable, which should be a matrix of size N * |A|
        :param all_advs: A chainer variable, which should be a vector of size N
        :return: A chainer variable, which should be a scalar
        """
        "*** YOUR CODE HERE ***"
        return -F.mean(new_dists.likelihood_ratio(old_dists, all_acts) * all_advs)

    def compute_kl(old_dists, new_dists):
        """
        :param old_dists: An instance of subclass of Distribution
        :param new_dists: An instance of subclass of Distribution
        :return: A chainer variable, which should be a scalar
        """
        "*** YOUR CODE HERE ***"
        return F.mean(old_dists.kl_div(new_dists))

    test_once(compute_surr_loss)
    test_once(compute_kl)

    if need_loss:
        retval["surr_loss"] = compute_surr_loss(
            old_dists, new_dists, all_acts, all_advs)
    if need_kl:
        retval["kl"] = compute_kl(old_dists, new_dists)
    return retval
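All four compute_surr_loss variants above evaluate the same thing: the negated mean of the importance-weighted advantages. A small self-contained NumPy sketch of that quantity, with a univariate Gaussian and toy numbers standing in for the lab's Distribution objects (illustrative only):

import numpy as np

rng = np.random.RandomState(0)
acts = rng.randn(5)            # toy sampled actions
advs = rng.randn(5)            # toy advantage estimates

mu_old, std_old = 0.0, 1.0     # "old" policy parameters
mu_new, std_new = 0.1, 0.9     # "new" policy parameters

def log_prob(x, mu, std):
    return -0.5 * np.log(2 * np.pi) - np.log(std) - 0.5 * ((x - mu) / std) ** 2

# likelihood ratio pi_new(a|s) / pi_old(a|s) at the sampled actions, playing the
# role of new_dists.likelihood_ratio(old_dists, all_acts)
ratio = np.exp(log_prob(acts, mu_new, std_new) - log_prob(acts, mu_old, std_old))

# surrogate loss: negated importance-weighted advantage (cf. compute_surr_loss)
surr_loss = -np.mean(ratio * advs)
print(surr_loss)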
def pg(env, env_maker, policy, baseline, n_envs=mp.cpu_count(), last_iter=-1,
       n_iters=100, batch_size=1000, optimizer=chainer.optimizers.Adam(),
       discount=0.99, gae_lambda=0.97, snapshot_saver=None):
    """
    This method implements the policy gradient algorithm.
    :param env: An environment instance, which should have the same class as what env_maker.make() returns.
    :param env_maker: An object such that calling env_maker.make() will generate a new environment.
    :param policy: A stochastic policy which we will be optimizing.
    :param baseline: A baseline used for variance reduction and estimating future returns for unfinished trajectories.
    :param n_envs: Number of environments running simultaneously.
    :param last_iter: The index of the last iteration. This is normally -1 when starting afresh, but may be different when loaded from a snapshot.
    :param n_iters: The total number of iterations to run.
    :param batch_size: The number of samples used per iteration.
    :param optimizer: A Chainer optimizer instance. By default we use the Adam algorithm with learning rate 1e-3.
    :param discount: Discount factor.
    :param gae_lambda: Lambda parameter used for generalized advantage estimation.
    :param snapshot_saver: An object for saving snapshots.
    """
    if getattr(optimizer, 'target', None) is not policy:
        optimizer.setup(policy)

    logger.info("Starting env pool")
    with EnvPool(env_maker, n_envs=n_envs) as env_pool:
        for iter in range(last_iter + 1, n_iters):
            logger.info("Starting iteration {}".format(iter))
            logger.logkv('Iteration', iter)

            logger.info("Start collecting samples")
            trajs = parallel_collect_samples(env_pool, policy, batch_size)

            logger.info("Computing input variables for policy optimization")
            all_obs, all_acts, all_advs, _ = compute_pg_vars(
                trajs, policy, baseline, discount, gae_lambda)

            # Begin policy update

            # Now, you need to implement the computation of the policy gradient.
            # The policy gradient is given by -1/T \sum_t \nabla_\theta(log(p_\theta(a_t|s_t))) * A_t
            # Note the negative sign in front, since optimizers most often minimize a loss rather than maximize an objective.
            # This is the same as \nabla_\theta(-1/T \sum_t log(p_\theta(a_t|s_t)) * A_t) = \nabla_\theta(L), where L is the surrogate loss term.

            logger.info("Computing policy gradient")

            # Methods that may be useful:
            # - `dists.logli(actions)` returns the log probability of the actions under the distribution `dists`.
            #   This method returns a chainer variable.

            dists = policy.compute_dists(all_obs)

            def compute_surr_loss(dists, all_acts, all_advs):
                """
                :param dists: An instance of subclass of Distribution
                :param all_acts: A chainer variable, which should be a matrix of size N * |A|
                :param all_advs: A chainer variable, which should be a vector of size N
                :return: A chainer variable, which should be a scalar
                """
                surr_loss = Variable(np.array(0.))
                "*** YOUR CODE HERE ***"
                # SOLUTION
                surr_loss = -F.mean(dists.logli(all_acts) * all_advs)
                # END OF SOLUTION
                return surr_loss

            test_once(compute_surr_loss)

            surr_loss = compute_surr_loss(dists, all_acts, all_advs)

            # reset gradients stored in the policy parameters
            policy.cleargrads()
            surr_loss.backward()

            # apply the computed gradient
            optimizer.update()

            # Update baseline
            logger.info("Updating baseline")
            baseline.update(trajs)

            # log statistics
            logger.info("Computing logging information")
            logger.logkv('SurrLoss', surr_loss.data)
            log_action_distribution_statistics(dists)
            log_reward_statistics(env)
            log_baseline_statistics(trajs)
            logger.dumpkvs()

            if snapshot_saver is not None:
                logger.info("Saving snapshot")
                snapshot_saver.save_state(
                    iter,
                    dict(
                        alg=pg,
                        alg_state=dict(
                            env_maker=env_maker,
                            policy=policy,
                            baseline=baseline,
                            n_envs=n_envs,
                            last_iter=iter,
                            n_iters=n_iters,
                            batch_size=batch_size,
                            optimizer=optimizer,
                            discount=discount,
                            gae_lambda=gae_lambda)))
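As a quick sanity check on compute_surr_loss, the following self-contained NumPy sketch (toy data and a state-independent softmax policy; none of this is part of the lab code) verifies that the gradient of -mean(logli * advs) matches the policy-gradient estimator quoted in the comment above:

import numpy as np

rng = np.random.RandomState(0)
theta = rng.randn(3)                       # logits of a 3-action softmax policy
acts = rng.randint(3, size=10)             # toy actions
advs = rng.randn(10)                       # toy advantages

def surr_loss(theta):
    logp = theta - np.log(np.sum(np.exp(theta)))   # log softmax
    return -np.mean(logp[acts] * advs)

# analytic policy gradient of the loss: -1/N sum_t (onehot(a_t) - softmax(theta)) * A_t
p = np.exp(theta) / np.sum(np.exp(theta))
onehot = np.eye(3)[acts]
analytic = -np.mean((onehot - p) * advs[:, None], axis=0)

# finite-difference gradient of the surrogate loss
eps = 1e-6
numeric = np.array([(surr_loss(theta + eps * np.eye(3)[i])
                     - surr_loss(theta - eps * np.eye(3)[i])) / (2 * eps)
                    for i in range(3)])

print(np.allclose(analytic, numeric, atol=1e-5))   # expected: True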
def main(env_id, batch_size, discount, learning_rate, n_itrs, render,
         use_baseline, natural, natural_step_size):
    # Check gradient implementation
    rng = np.random.RandomState(42)

    if env_id == 'CartPole-v0':
        cartpole_test_grad_impl()
        env = gym.make('CartPole-v0')
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        get_action = cartpole_get_action
        get_grad_logp_action = cartpole_get_grad_logp_action
    elif env_id == 'Point-v0':
        point_test_grad_impl()
        from simplepg import point_env
        env = gym.make('Point-v0')
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        get_action = point_get_action
        get_grad_logp_action = point_get_grad_logp_action
    else:
        raise ValueError(
            "Unsupported environment: must be one of 'CartPole-v0', 'Point-v0'")

    env.seed(42)
    timestep_limit = env.spec.timestep_limit

    # Initialize parameters
    theta = rng.normal(scale=0.1, size=(action_dim, obs_dim + 1))

    # Store baselines for each time step.
    baselines = np.zeros(timestep_limit)

    # Policy training loop
    for itr in range(n_itrs):
        # Collect trajectory loop
        n_samples = 0
        grad = np.zeros_like(theta)
        episode_rewards = []

        # Store cumulative returns for each time step
        all_returns = [[] for _ in range(timestep_limit)]

        all_observations = []
        all_actions = []

        while n_samples < batch_size:
            observations = []
            actions = []
            rewards = []
            ob = env.reset()
            done = False
            # Only render the first trajectory
            render_episode = n_samples == 0
            # Collect a new trajectory
            while not done:
                action = get_action(theta, ob, rng=rng)
                next_ob, rew, done, _ = env.step(action)
                observations.append(ob)
                actions.append(action)
                rewards.append(rew)
                ob = next_ob
                n_samples += 1
                if render and render_episode:
                    env.render()

            # Go back in time to compute returns and accumulate gradient
            # Compute the gradient along this trajectory
            R = 0.
            for t in reversed(range(len(observations))):

                def compute_update(discount, R_tplus1, theta, s_t, a_t, r_t,
                                   b_t, get_grad_logp_action):
                    """
                    :param discount: A scalar
                    :param R_tplus1: A scalar
                    :param theta: A matrix of size |A| * (|S|+1)
                    :param s_t: A vector of size |S|
                    :param a_t: Either a vector of size |A| or an integer, depending on the environment
                    :param r_t: A scalar
                    :param b_t: A scalar
                    :param get_grad_logp_action: A function, mapping from (theta, ob, action) to the gradient (a matrix of size |A| * (|S|+1) )
                    :return: A tuple, consisting of a scalar and a matrix of size |A| * (|S|+1)
                    """
                    R_t = 0.
                    pg_theta = np.zeros_like(theta)
                    "*** YOUR CODE HERE ***"
                    R_t = r_t + discount * R_tplus1
                    pg_theta = get_grad_logp_action(theta, s_t, a_t) * (R_t - b_t)
                    return R_t, pg_theta

                # Test the implementation, but only once
                test_once(compute_update)

                R, grad_t = compute_update(
                    discount=discount,
                    R_tplus1=R,
                    theta=theta,
                    s_t=observations[t],
                    a_t=actions[t],
                    r_t=rewards[t],
                    b_t=baselines[t],
                    get_grad_logp_action=get_grad_logp_action)
                all_returns[t].append(R)
                grad += grad_t

            episode_rewards.append(np.sum(rewards))
            all_observations.extend(observations)
            all_actions.extend(actions)

        def compute_baselines(all_returns):
            """
            :param all_returns: A list of size T, where the t-th entry is a list of numbers, denoting the returns collected at time step t across different episodes
            :return: A vector of size T
            """
            baselines = np.zeros(len(all_returns))
            for t in range(len(all_returns)):
                "*** YOUR CODE HERE ***"
                baselines[t] = 0. if len(all_returns[t]) == 0 else np.mean(all_returns[t])
            return baselines

        if use_baseline:
            test_once(compute_baselines)
            baselines = compute_baselines(all_returns)
        else:
            baselines = np.zeros(timestep_limit)

        # Roughly normalize the gradient
        grad = grad / (np.linalg.norm(grad) + 1e-8)

        if not natural:
            theta += learning_rate * grad
        else:

            def compute_fisher_matrix(theta, get_grad_logp_action,
                                      all_observations, all_actions):
                """
                :param theta: A matrix of size |A| * (|S|+1)
                :param get_grad_logp_action: A function, mapping from (theta, ob, action) to the gradient (a matrix of size |A| * (|S|+1) )
                :param all_observations: A list of vectors of size |S|
                :param all_actions: A list of vectors of size |A|
                :return: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1)), i.e. #columns and #rows are the number of entries in theta
                """
                d = len(theta.flatten())
                F = np.zeros((d, d))
                "*** YOUR CODE HERE ***"
                # this is an intuitive but very inefficient implementation:
                # ws = []
                # for action in all_actions:
                #     for ob in all_observations:
                #         g = get_grad_logp_action(theta, ob, action).reshape(d, 1)
                #         ws.append(g.dot(g.T))
                # F = np.mean(np.array(ws), axis=0)
                # this is an efficient implementation
                for i in range(len(all_actions)):
                    grads = get_grad_logp_action(
                        theta, all_observations[i], all_actions[i]).flatten()
                    F += np.outer(grads, grads.T)
                F /= len(all_actions)
                return F

            def compute_natural_gradient(F, grad, reg=1e-4):
                """
                :param F: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1))
                :param grad: A matrix of size |A| * (|S|+1)
                :param reg: A scalar
                :return: A matrix of size |A| * (|S|+1)
                """
                natural_grad = np.zeros_like(grad)
                "*** YOUR CODE HERE ***"
                F_inv = np.linalg.inv(F + reg * np.eye(F.shape[0]))
                natural_grad = F_inv.dot(grad.flatten()).reshape(grad.shape)
                return natural_grad

            def compute_step_size(F, natural_grad, natural_step_size):
                """
                :param F: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1))
                :param natural_grad: A matrix of size |A| * (|S|+1)
                :param natural_step_size: A scalar
                :return: A scalar
                """
                step_size = 0.
                "*** YOUR CODE HERE ***"
                # this works with the inefficient implementation from compute_fisher_matrix
                # w = natural_grad.dot(F).dot(natural_grad.T)
                natural_grad = natural_grad.flatten()
                w = natural_grad.T.dot(F).dot(natural_grad)
                step_size = np.sqrt(2 * natural_step_size / w)
                return step_size

            test_once(compute_fisher_matrix)
            test_once(compute_natural_gradient)
            test_once(compute_step_size)

            F = compute_fisher_matrix(
                theta=theta,
                get_grad_logp_action=get_grad_logp_action,
                all_observations=all_observations,
                all_actions=all_actions)
            natural_grad = compute_natural_gradient(F, grad)
            step_size = compute_step_size(F, natural_grad, natural_step_size)
            theta += step_size * natural_grad

        if env_id == 'CartPole-v0':
            logits = compute_logits(theta, np.array(all_observations))
            ent = np.mean(compute_entropy(logits))
            perp = np.exp(ent)
            print("Iteration: %d AverageReturn: %.2f Entropy: %.2f Perplexity: %.2f |theta|_2: %.2f"
                  % (itr, np.mean(episode_rewards), ent, perp, np.linalg.norm(theta)))
        else:
            print("Iteration: %d AverageReturn: %.2f |theta|_2: %.2f"
                  % (itr, np.mean(episode_rewards), np.linalg.norm(theta)))
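The relationship between compute_natural_gradient and compute_step_size above can be checked numerically: with update direction d = step_size * F^-1 g, the quadratic KL approximation 0.5 * d^T F d should come out to natural_step_size. A hedged sketch with a random positive-definite matrix standing in for the Fisher matrix (toy values, not part of the lab code):

import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(6, 6)
F = A.dot(A.T) + 1e-3 * np.eye(6)          # a random positive-definite "Fisher"
g = rng.randn(6)                           # a random flattened gradient
eps = 0.01                                 # plays the role of natural_step_size

g_nat = np.linalg.inv(F).dot(g)            # cf. compute_natural_gradient (reg omitted)
step = np.sqrt(2 * eps / g_nat.T.dot(F).dot(g_nat))   # cf. compute_step_size
d = step * g_nat

print(np.isclose(0.5 * d.T.dot(F).dot(d), eps))       # expected: True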
def a2c(env, env_maker, policy, vf, joint_model=None, k=20, n_envs=16,
        discount=0.99, optimizer=chainer.optimizers.RMSprop(lr=1e-3),
        max_grad_norm=1.0, vf_loss_coeff=0.5, ent_coeff=0.01, last_epoch=-1,
        epoch_length=10000, n_epochs=8000, snapshot_saver=None):
    """
    This method implements the (Synchronous) Advantage Actor-Critic algorithm. Rather than having asynchronous
    workers, which can be more efficient due to less coordination but also less stable and harder to extend / debug,
    we use a pool of environment workers performing simulation, while computing actions and performing gradient
    updates centrally. This also makes it easier to utilize GPUs for neural network computation.

    :param env: An environment instance, which should have the same class as what env_maker.make() returns.
    :param env_maker: An object such that calling env_maker.make() will generate a new environment.
    :param policy: A stochastic policy which we will be optimizing.
    :param vf: A value function which estimates future returns given a state. It can potentially share weights with the policy by calling policy.create_vf().
    :param joint_model: The joint model of policy and value function. This is usually automatically computed.
    :param k: Number of simulation steps per environment for each gradient update.
    :param n_envs: Number of environments running simultaneously.
    :param discount: Discount factor.
    :param optimizer: A chainer optimizer instance. By default we use the RMSProp algorithm.
    :param max_grad_norm: If provided, apply gradient clipping with the specified maximum L2 norm.
    :param vf_loss_coeff: Coefficient for the value function loss.
    :param ent_coeff: Coefficient for the entropy loss (the negative bonus).
    :param last_epoch: The index of the last epoch. This is normally -1 when starting afresh, but may be different when loaded from a snapshot. Since A2C is an online algorithm, an epoch is just an artificial boundary so that we record logs after each epoch.
    :param epoch_length: Number of total environment steps per epoch.
    :param n_epochs: Total number of epochs to run the algorithm.
    :param snapshot_saver: An object for saving snapshots.
    """
    # ensures that shared parameters are only counted once
    if joint_model is None:
        joint_model = UniqueChainList(policy, vf)
    if getattr(optimizer, 'target', None) is not joint_model:
        optimizer.setup(joint_model)
    try:
        # remove existing hook if necessary (this should only be needed when restarting experiments)
        optimizer.remove_hook('gradient_clipping')
    except KeyError:
        pass
    if max_grad_norm is not None:
        # Clip L2 norm of gradient, to improve stability
        optimizer.add_hook(chainer.optimizer.GradientClipping(
            threshold=max_grad_norm), 'gradient_clipping')

    epoch = last_epoch + 1
    global_t = epoch * epoch_length
    loggings = defaultdict(list)

    logger.info("Starting env pool")
    with EnvPool(env_maker, n_envs=n_envs) as env_pool:
        gen = samples_generator(env_pool, policy, vf, k)

        logger.info("Starting epoch {}".format(epoch))

        if logger.get_level() <= logger.INFO:
            progbar = tqdm(total=epoch_length)
        else:
            progbar = None

        while global_t < epoch_length * n_epochs:

            # Run k steps in the environment
            # Note:
            # - all_actions, all_values, all_dists, and next_values are chainer variables
            # - all_rewards, all_dones are lists of numpy arrays
            # The first dimension of each variable is time, and the second dimension is the index of the environment
            all_actions, all_rewards, all_dones, all_dists, all_values, next_values = next(gen)
            global_t += n_envs * k

            # Compute returns and advantages
            # Size: (k, n_envs)
            all_values = F.stack(all_values)
            all_rewards = np.asarray(all_rewards, dtype=np.float32)
            all_dones = np.asarray(all_dones, dtype=np.float32)
            all_values_data = all_values.data
            next_values_data = next_values.data

            test_once(compute_returns_advantages)
            all_returns, all_advs = compute_returns_advantages(
                all_rewards, all_dones, all_values_data, next_values_data, discount)
            all_returns = chainer.Variable(all_returns.astype(np.float32))
            all_advs = chainer.Variable(all_advs.astype(np.float32))

            # Concatenate data
            # Size: (k*n_envs,) + action_shape
            all_flat_actions = F.concat(all_actions, axis=0)
            # Size: key -> (k*n_envs,) + dist_shape
            all_flat_dists = {k: F.concat([d[k] for d in all_dists], axis=0)
                              for k in all_dists[0].keys()}
            all_flat_dists = policy.distribution.from_dict(all_flat_dists)

            # Prepare variables needed for gradient computation
            logli = all_flat_dists.logli(all_flat_actions)
            ent = all_flat_dists.entropy()
            # Flatten advantages
            all_advs = F.concat(all_advs, axis=0)

            # Form the loss - you should only need to use the variables provided as input arguments below
            def compute_total_loss(logli, all_advs, ent_coeff, ent,
                                   vf_loss_coeff, all_returns, all_values):
                """
                :param logli: A chainer variable, which should be a vector of size N
                :param all_advs: A chainer variable, which should be a vector of size N
                :param ent_coeff: A scalar
                :param ent: A chainer variable, which should be a vector of size N
                :param vf_loss_coeff: A scalar
                :param all_returns: A chainer variable, which should be a vector of size N
                :param all_values: A chainer variable, which should be a vector of size N
                :return: A tuple of (policy_loss, vf_loss, total_loss)
                    policy_loss should be the weighted sum of the surrogate loss and the average entropy loss
                    vf_loss should be the (unweighted) squared loss of value function prediction.
                    total_loss should be the weighted sum of policy_loss and vf_loss
                """
                policy_loss = -F.mean(logli * all_advs) - ent_coeff * F.mean(ent)
                vf_loss = F.mean_squared_error(all_returns, all_values)
                total_loss = policy_loss + vf_loss_coeff * vf_loss
                return policy_loss, vf_loss, total_loss

            test_once(compute_total_loss)

            policy_loss, vf_loss, total_loss = compute_total_loss(
                logli=logli, all_advs=all_advs, ent_coeff=ent_coeff, ent=ent,
                vf_loss_coeff=vf_loss_coeff, all_returns=all_returns,
                all_values=all_values)

            joint_model.cleargrads()
            total_loss.backward()
            optimizer.update()

            vf_loss_data = vf_loss.data
            all_returns_data = all_returns.data
            all_flat_dists_data = {
                k: v.data for k, v in all_flat_dists.as_dict().items()}

            loggings["vf_loss"].append(vf_loss_data)
            loggings["vf_preds"].append(all_values_data)
            loggings["vf_targets"].append(all_returns_data)
            loggings["dists"].append(all_flat_dists_data)

            if progbar is not None:
                progbar.update(k * n_envs)

            # An entire epoch has passed
            if global_t // epoch_length > epoch:
                logger.logkv('Epoch', epoch)
                log_reward_statistics(env)
                all_dists = {
                    k: Variable(np.concatenate([d[k] for d in loggings["dists"]], axis=0))
                    for k in loggings["dists"][0].keys()}
                log_action_distribution_statistics(
                    policy.distribution.from_dict(all_dists))
                logger.logkv('|VfPred|', np.mean(np.abs(loggings["vf_preds"])))
                logger.logkv('|VfTarget|', np.mean(np.abs(loggings["vf_targets"])))
                logger.logkv('VfLoss', np.mean(loggings["vf_loss"]))
                logger.dumpkvs()

                if snapshot_saver is not None:
                    logger.info("Saving snapshot")
                    snapshot_saver.save_state(
                        epoch,
                        dict(
                            alg=a2c,
                            alg_state=dict(
                                env_maker=env_maker,
                                policy=policy,
                                vf=vf,
                                joint_model=joint_model,
                                k=k,
                                n_envs=n_envs,
                                discount=discount,
                                last_epoch=epoch,
                                n_epochs=n_epochs,
                                epoch_length=epoch_length,
                                optimizer=optimizer,
                                vf_loss_coeff=vf_loss_coeff,
                                ent_coeff=ent_coeff,
                                max_grad_norm=max_grad_norm,
                            )
                        )
                    )

                # Reset stored logging information
                loggings = defaultdict(list)

                if progbar is not None:
                    progbar.close()

                epoch = global_t // epoch_length
                logger.info("Starting epoch {}".format(epoch))

                if progbar is not None:
                    progbar = tqdm(total=epoch_length)

        if progbar is not None:
            progbar.close()
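For reference, the same three losses written out with plain NumPy on toy vectors (illustrative numbers only, outside Chainer):

import numpy as np

logli = np.array([-1.2, -0.7, -2.0])     # log pi(a|s)
advs = np.array([0.5, -0.3, 1.1])        # advantage estimates
ent = np.array([1.0, 0.9, 1.1])          # per-sample entropies
returns = np.array([1.0, 0.2, -0.5])     # value targets
values = np.array([0.8, 0.1, -0.2])      # value predictions
ent_coeff, vf_loss_coeff = 0.01, 0.5

policy_loss = -np.mean(logli * advs) - ent_coeff * np.mean(ent)
vf_loss = np.mean((returns - values) ** 2)          # mean squared error
total_loss = policy_loss + vf_loss_coeff * vf_loss
print(policy_loss, vf_loss, total_loss)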
def main(env_id, batch_size, discount, learning_rate, n_itrs, render,
         use_baseline, natural, natural_step_size):
    # Check gradient implementation
    rng = np.random.RandomState(42)

    if env_id == 'CartPole-v0':
        cartpole_test_grad_impl()
        env = gym.make('CartPole-v0')
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        get_action = cartpole_get_action
        get_grad_logp_action = cartpole_get_grad_logp_action
    elif env_id == 'Point-v0':
        point_test_grad_impl()
        from simplepg import point_env
        env = gym.make('Point-v0')
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        get_action = point_get_action
        get_grad_logp_action = point_get_grad_logp_action
    else:
        raise ValueError(
            "Unsupported environment: must be one of 'CartPole-v0', 'Point-v0'")

    env.seed(42)
    timestep_limit = env.spec.timestep_limit

    # Initialize parameters
    theta = rng.normal(scale=0.1, size=(action_dim, obs_dim + 1))

    # Store baselines for each time step.
    baselines = np.zeros(timestep_limit)

    # Policy training loop
    for itr in range(n_itrs):
        # Collect trajectory loop
        n_samples = 0
        grad = np.zeros_like(theta)
        episode_rewards = []

        # Store cumulative returns for each time step
        all_returns = [[] for _ in range(timestep_limit)]

        all_observations = []
        all_actions = []

        while n_samples < batch_size:
            observations = []
            actions = []
            rewards = []
            ob = env.reset()
            done = False
            # Only render the first trajectory
            render_episode = n_samples == 0
            # Collect a new trajectory
            while not done:
                action = get_action(theta, ob, rng=rng)
                next_ob, rew, done, _ = env.step(action)
                observations.append(ob)
                actions.append(action)
                rewards.append(rew)
                ob = next_ob
                n_samples += 1
                if render and render_episode:
                    env.render()

            # Go back in time to compute returns and accumulate gradient
            # Compute the gradient along this trajectory
            R = 0.
            for t in reversed(range(len(observations))):

                def compute_update(discount, R_tplus1, theta, s_t, a_t, r_t,
                                   b_t, get_grad_logp_action):
                    """
                    :param discount: A scalar
                    :param R_tplus1: A scalar
                    :param theta: A matrix of size |A| * (|S|+1)
                    :param s_t: A vector of size |S|
                    :param a_t: Either a vector of size |A| or an integer, depending on the environment
                    :param r_t: A scalar
                    :param b_t: A scalar
                    :param get_grad_logp_action: A function, mapping from (theta, ob, action) to the gradient (a matrix of size |A| * (|S|+1) )
                    :return: A tuple, consisting of a scalar and a matrix of size |A| * (|S|+1)
                    """
                    # Use the return recurrence from the lab instructions, part 3.4
                    R_t = discount * R_tplus1 + r_t
                    # Compute this step's gradient contribution using the formula from part 3.3
                    pg_theta = get_grad_logp_action(theta, s_t, a_t) * (R_t - b_t)
                    return R_t, pg_theta

                # Test the implementation, but only once
                test_once(compute_update)

                R, grad_t = compute_update(
                    discount=discount,
                    R_tplus1=R,
                    theta=theta,
                    s_t=observations[t],
                    a_t=actions[t],
                    r_t=rewards[t],
                    b_t=baselines[t],
                    get_grad_logp_action=get_grad_logp_action)
                all_returns[t].append(R)
                grad += grad_t

            episode_rewards.append(np.sum(rewards))
            all_observations.extend(observations)
            all_actions.extend(actions)

        def compute_baselines(all_returns):
            """
            :param all_returns: A list of size T, where the t-th entry is a list of numbers, denoting the returns collected at time step t across different episodes
            :return: A vector of size T
            """
            baselines = np.zeros(len(all_returns))
            for t in range(len(all_returns)):
                # Use returns from previous episodes to compute the baseline.
                # Check whether any returns were collected at this time step;
                # if not, the default value of zero remains.
                if len(all_returns[t]) > 0:
                    baselines[t] = np.mean(all_returns[t])
            return baselines

        if use_baseline:
            test_once(compute_baselines)
            baselines = compute_baselines(all_returns)
        else:
            baselines = np.zeros(timestep_limit)

        # Roughly normalize the gradient
        grad = grad / (np.linalg.norm(grad) + 1e-8)

        if not natural:
            theta += learning_rate * grad
        else:

            def compute_fisher_matrix(theta, get_grad_logp_action,
                                      all_observations, all_actions):
                """
                :param theta: A matrix of size |A| * (|S|+1)
                :param get_grad_logp_action: A function, mapping from (theta, ob, action) to the gradient (a matrix of size |A| * (|S|+1) )
                :param all_observations: A list of vectors of size |S|
                :param all_actions: A list of vectors of size |A|
                :return: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1)), i.e. #columns and #rows are the number of entries in theta
                """
                d = len(theta.flatten())
                F = np.zeros((d, d))
                # Loop over the collected samples
                for i in range(len(all_actions)):
                    # Compute the gradient of the action log-probability and flatten it into a vector
                    grads = get_grad_logp_action(
                        theta, all_observations[i], all_actions[i]).flatten()
                    # Accumulate the outer product of the gradient with itself
                    F += np.outer(grads, grads.T)
                # Take the mean (expected value)
                F /= len(all_actions)
                return F

            def compute_natural_gradient(F, grad, reg=1e-4):
                """
                :param F: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1))
                :param grad: A matrix of size |A| * (|S|+1)
                :param reg: A scalar
                :return: A matrix of size |A| * (|S|+1)
                """
                # First regularize the Fisher information matrix so that it is positive definite and invertible
                F_inv = np.linalg.inv(F + reg * np.eye(*F.shape))
                # Compute the natural gradient on the flattened gradient
                natural_grad = F_inv.dot(grad.flatten())
                # Reshape back to the shape of grad
                natural_grad = natural_grad.reshape(grad.shape)
                return natural_grad

            def compute_step_size(F, natural_grad, natural_step_size):
                """
                :param F: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1))
                :param natural_grad: A matrix of size |A| * (|S|+1)
                :param natural_step_size: A scalar
                :return: A scalar
                """
                # Flatten the natural gradient again
                natural_grad = natural_grad.flatten()
                # The computation follows the formula from the solution footnote
                denominator = natural_grad.T.dot(F).dot(natural_grad)
                numerator = 2 * natural_step_size
                step_size = np.sqrt(numerator / denominator)
                return step_size

            test_once(compute_fisher_matrix)
            test_once(compute_natural_gradient)
            test_once(compute_step_size)

            F = compute_fisher_matrix(
                theta=theta,
                get_grad_logp_action=get_grad_logp_action,
                all_observations=all_observations,
                all_actions=all_actions)
            natural_grad = compute_natural_gradient(F, grad)
            step_size = compute_step_size(F, natural_grad, natural_step_size)
            theta += step_size * natural_grad

        if env_id == 'CartPole-v0':
            logits = compute_logits(theta, np.array(all_observations))
            ent = np.mean(compute_entropy(logits))
            perp = np.exp(ent)
            print("Iteration: %d AverageReturn: %.2f Entropy: %.2f Perplexity: %.2f |theta|_2: %.2f"
                  % (itr, np.mean(episode_rewards), ent, perp, np.linalg.norm(theta)))
        else:
            print("Iteration: %d AverageReturn: %.2f |theta|_2: %.2f"
                  % (itr, np.mean(episode_rewards), np.linalg.norm(theta)))
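A quick check (toy rewards, not part of the lab code) that the backward recurrence used in compute_update, R_t = r_t + discount * R_{t+1}, reproduces the discounted return it is meant to compute:

import numpy as np

discount = 0.99
rewards = np.array([1.0, 0.0, 2.0, -1.0, 0.5])

# backward recurrence
R = 0.0
returns_rec = np.zeros_like(rewards)
for t in reversed(range(len(rewards))):
    R = rewards[t] + discount * R
    returns_rec[t] = R

# direct definition: R_t = sum_{k >= t} discount^(k - t) * r_k
returns_direct = np.array([sum(discount ** (k - t) * rewards[k]
                               for k in range(t, len(rewards)))
                           for t in range(len(rewards))])

print(np.allclose(returns_rec, returns_direct))     # expected: True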
""" R_t = 0. pg_theta = np.zeros_like(theta) "*** YOUR CODE HERE ***" "Rt satis es the recurrence relation: R_t = discount * R_tplus1 + r_t" R_t = discount * R_tplus1 + r_t "single contribution to the overall policy gradient,... Formula shown in 3.4 Accummulating Policy Gradient" pg_theta = get_grad_logp_action(theta, s_t, a_t) * (R_t - b_t) return R_t, pg_theta # Test the implementation, but only once test_once(compute_update) R, grad_t = compute_update( discount=discount, R_tplus1=R, theta=theta, s_t=observations[t], a_t=actions[t], r_t=rewards[t], b_t=baselines[t], get_grad_logp_action=get_grad_logp_action ) all_returns[t].append(R) grad += grad_t episode_rewards.append(np.sum(rewards))
def main(env_id, batch_size, discount, learning_rate, n_itrs, render,
         use_baseline, natural, natural_step_size):
    # Check gradient implementation
    rng = np.random.RandomState(42)

    if env_id == 'CartPole-v0':
        cartpole_test_grad_impl()
        env = gym.make('CartPole-v0')
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        get_action = cartpole_get_action
        get_grad_logp_action = cartpole_get_grad_logp_action
    elif env_id == 'Point-v0':
        point_test_grad_impl()
        from simplepg import point_env
        env = gym.make('Point-v0')
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        get_action = point_get_action
        get_grad_logp_action = point_get_grad_logp_action
    else:
        raise ValueError(
            "Unsupported environment: must be one of 'CartPole-v0', 'Point-v0'")

    env.seed(42)
    timestep_limit = env.spec.timestep_limit

    # Initialize parameters
    theta = rng.normal(scale=0.1, size=(action_dim, obs_dim + 1))

    # Store baselines for each time step.
    baselines = np.zeros(timestep_limit)

    # Policy training loop
    for itr in range(n_itrs):
        # Collect trajectory loop
        n_samples = 0
        grad = np.zeros_like(theta)
        episode_rewards = []

        # Store cumulative returns for each time step
        all_returns = [[] for _ in range(timestep_limit)]

        all_observations = []
        all_actions = []

        while n_samples < batch_size:
            observations = []
            actions = []
            rewards = []
            ob = env.reset()
            done = False
            # Only render the first trajectory
            render_episode = n_samples == 0
            # Collect a new trajectory
            while not done:
                action = get_action(theta, ob, rng=rng)
                next_ob, rew, done, _ = env.step(action)
                observations.append(ob)
                actions.append(action)
                rewards.append(rew)
                ob = next_ob
                n_samples += 1
                if render and render_episode:
                    env.render()

            # Go back in time to compute returns and accumulate gradient
            # Compute the gradient along this trajectory
            R = 0.
            for t in reversed(range(len(observations))):

                def compute_update(discount, R_tplus1, theta, s_t, a_t, r_t,
                                   b_t, get_grad_logp_action):
                    """
                    :param discount: A scalar
                    :param R_tplus1: A scalar
                    :param theta: A matrix of size |A| * (|S|+1)
                    :param s_t: A vector of size |S|
                    :param a_t: Either a vector of size |A| or an integer, depending on the environment
                    :param r_t: A scalar
                    :param b_t: A scalar
                    :param get_grad_logp_action: A function, mapping from (theta, ob, action) to the gradient (a matrix of size |A| * (|S|+1) )
                    :return: A tuple, consisting of a scalar and a matrix of size |A| * (|S|+1)
                    """
                    R_t = 0.
                    pg_theta = np.zeros_like(theta)
                    "*** YOUR CODE HERE ***"
                    R_t = discount * R_tplus1 + r_t
                    pg_theta = get_grad_logp_action(theta, s_t, a_t) * (R_t - b_t)
                    return R_t, pg_theta

                # Test the implementation, but only once
                test_once(compute_update)

                R, grad_t = compute_update(
                    discount=discount,
                    R_tplus1=R,
                    theta=theta,
                    s_t=observations[t],
                    a_t=actions[t],
                    r_t=rewards[t],
                    b_t=baselines[t],
                    get_grad_logp_action=get_grad_logp_action)
                all_returns[t].append(R)
                grad += grad_t

            episode_rewards.append(np.sum(rewards))
            all_observations.extend(observations)
            all_actions.extend(actions)

        def compute_baselines(all_returns):
            """
            :param all_returns: A list of size T, where the t-th entry is a list of numbers, denoting the returns collected at time step t across different episodes
            :return: A vector of size T
            """
            baselines = np.zeros(len(all_returns))
            for t in range(len(all_returns)):
                "*** YOUR CODE HERE ***"
                if len(all_returns[t]):
                    baselines[t] = np.mean(all_returns[t])
                else:
                    baselines[t] = 0
            return baselines

        if use_baseline:
            test_once(compute_baselines)
            baselines = compute_baselines(all_returns)
        else:
            baselines = np.zeros(timestep_limit)

        # Roughly normalize the gradient
        grad = grad / (np.linalg.norm(grad) + 1e-8)

        if not natural:
            theta += learning_rate * grad
        else:

            def compute_fisher_matrix(theta, get_grad_logp_action,
                                      all_observations, all_actions):
                """
                :param theta: A matrix of size |A| * (|S|+1)
                :param get_grad_logp_action: A function, mapping from (theta, ob, action) to the gradient (a matrix of size |A| * (|S|+1) )
                :param all_observations: A list of vectors of size |S|
                :param all_actions: A list of vectors of size |A|
                :return: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1)), i.e. #columns and #rows are the number of entries in theta
                """
                d = len(theta.flatten())
                F = np.zeros((d, d))
                "*** YOUR CODE HERE ***"
                # We're approximating the Fisher matrix from sampled grad-log-probs,
                # so we take the mean of the per-sample estimates outer(grad_logp, grad_logp).
                for i in range(len(all_observations)):
                    ob = all_observations[i]
                    action = all_actions[i]
                    grad_logp = get_grad_logp_action(theta, ob, action)
                    F += np.outer(grad_logp, grad_logp)
                F /= len(all_observations)
                # Watch the test output carefully: when this works it will print
                #   Test for __main__.compute_fisher_matrix passed!
                # The "Not equal to tolerance rtol=1e-05, atol=0" error belongs to the next part.
                return F

            def compute_natural_gradient(F, grad, reg=1e-4):
                """
                :param F: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1))
                :param grad: A matrix of size |A| * (|S|+1)
                :param reg: A scalar
                :return: A matrix of size |A| * (|S|+1)
                """
                natural_grad = np.zeros_like(grad)
                "*** YOUR CODE HERE ***"
                I = np.eye(F.shape[0])
                # plain scalar multiplication here, since reg is a scalar and not a matrix
                F_1 = F + reg * I
                F_inv = np.linalg.inv(F_1)
                # grad is treated as a flattened vector, matching the flattened theta
                natural_grad = F_inv.dot(grad.flatten())
                # but we need to reshape the output back to the shape of grad
                return natural_grad.reshape(grad.shape)

            def compute_step_size(F, natural_grad, natural_step_size):
                """
                :param F: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1))
                :param natural_grad: A matrix of size |A| * (|S|+1)
                :param natural_step_size: A scalar
                :return: A scalar
                """
                step_size = 0.
                "*** YOUR CODE HERE ***"
                epsilon = natural_step_size
                g = natural_grad.reshape(-1, 1)
                alpha_squared = (2 * epsilon) / np.dot(np.dot(g.T, F), g)
                step_size = np.sqrt(alpha_squared)
                return step_size

            test_once(compute_fisher_matrix)
            test_once(compute_natural_gradient)
            test_once(compute_step_size)

            F = compute_fisher_matrix(
                theta=theta,
                get_grad_logp_action=get_grad_logp_action,
                all_observations=all_observations,
                all_actions=all_actions)
            natural_grad = compute_natural_gradient(F, grad)
            step_size = compute_step_size(F, natural_grad, natural_step_size)
            theta += step_size * natural_grad

        if env_id == 'CartPole-v0':
            logits = compute_logits(theta, np.array(all_observations))
            ent = np.mean(compute_entropy(logits))
            perp = np.exp(ent)
            print("Iteration: %d AverageReturn: %.2f Entropy: %.2f Perplexity: %.2f |theta|_2: %.2f"
                  % (itr, np.mean(episode_rewards), ent, perp, np.linalg.norm(theta)))
        else:
            print("Iteration: %d AverageReturn: %.2f |theta|_2: %.2f"
                  % (itr, np.mean(episode_rewards), np.linalg.norm(theta)))
def main(env_id, batch_size, discount, learning_rate, n_itrs, render,
         use_baseline, natural, natural_step_size):
    # Check gradient implementation
    rng = np.random.RandomState(42)

    if env_id == 'CartPole-v0':
        cartpole_test_grad_impl()
        env = gym.make('CartPole-v0')
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        get_action = cartpole_get_action
        get_grad_logp_action = cartpole_get_grad_logp_action
    elif env_id == 'Point-v0':
        point_test_grad_impl()
        from simplepg import point_env
        env = gym.make('Point-v0')
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        get_action = point_get_action
        get_grad_logp_action = point_get_grad_logp_action
    else:
        raise ValueError(
            "Unsupported environment: must be one of 'CartPole-v0', 'Point-v0'")

    env.seed(42)
    timestep_limit = env.spec.timestep_limit

    # Initialize parameters
    theta = rng.normal(scale=0.1, size=(action_dim, obs_dim + 1))

    # Store baselines for each time step.
    baselines = np.zeros(timestep_limit)

    # Policy training loop
    for itr in range(n_itrs):
        # Collect trajectory loop
        n_samples = 0
        grad = np.zeros_like(theta)
        episode_rewards = []

        # Store cumulative returns for each time step
        all_returns = [[] for _ in range(timestep_limit)]

        all_observations = []
        all_actions = []

        while n_samples < batch_size:
            observations = []
            actions = []
            rewards = []
            ob = env.reset()
            done = False
            # Only render the first trajectory
            render_episode = n_samples == 0
            # Collect a new trajectory
            while not done:
                action = get_action(theta, ob, rng=rng)
                next_ob, rew, done, _ = env.step(action)
                observations.append(ob)
                actions.append(action)
                rewards.append(rew)
                ob = next_ob
                n_samples += 1
                if render and render_episode:
                    env.render()

            # Go back in time to compute returns and accumulate gradient
            # Compute the gradient along this trajectory
            R = 0.
            for t in reversed(range(len(observations))):

                def compute_update(discount, R_tplus1, theta, s_t, a_t, r_t,
                                   b_t, get_grad_logp_action):
                    """
                    :param discount: A scalar
                    :param R_tplus1: A scalar
                    :param theta: A matrix of size |A| * (|S|+1)
                    :param s_t: A vector of size |S|
                    :param a_t: Either a vector of size |A| or an integer, depending on the environment
                    :param r_t: A scalar
                    :param b_t: A scalar
                    :param get_grad_logp_action: A function, mapping from (theta, ob, action) to the gradient (a matrix of size |A| * (|S|+1) )
                    :return: A tuple, consisting of a scalar and a matrix of size |A| * (|S|+1)
                    """
                    "*** YOUR CODE HERE ***"
                    R_t = discount * R_tplus1 + r_t
                    pg_theta = get_grad_logp_action(theta, s_t, a_t) * (R_t - b_t)
                    return R_t, pg_theta

                # Test the implementation, but only once
                test_once(compute_update)

                R, grad_t = compute_update(
                    discount=discount,
                    R_tplus1=R,
                    theta=theta,
                    s_t=observations[t],
                    a_t=actions[t],
                    r_t=rewards[t],
                    b_t=baselines[t],
                    get_grad_logp_action=get_grad_logp_action)
                all_returns[t].append(R)
                grad += grad_t

            episode_rewards.append(np.sum(rewards))
            all_observations.extend(observations)
            all_actions.extend(actions)

        def compute_baselines(all_returns):
            """
            :param all_returns: A list of size T, where the t-th entry is a list of numbers, denoting the returns collected at time step t across different episodes
            :return: A vector of size T
            """
            baselines = np.zeros(len(all_returns))
            for t in range(len(all_returns)):
                "*** YOUR CODE HERE ***"
                if len(all_returns[t]) > 0:
                    baselines[t] = np.mean(all_returns[t])
            return baselines

        if use_baseline:
            test_once(compute_baselines)
            baselines = compute_baselines(all_returns)
        else:
            baselines = np.zeros(timestep_limit)

        # Roughly normalize the gradient
        grad = grad / (np.linalg.norm(grad) + 1e-8)

        if not natural:
            theta += learning_rate * grad
        else:

            def compute_fisher_matrix(theta, get_grad_logp_action,
                                      all_observations, all_actions):
                """
                :param theta: A matrix of size |A| * (|S|+1)
                :param get_grad_logp_action: A function, mapping from (theta, ob, action) to the gradient (a matrix of size |A| * (|S|+1) )
                :param all_observations: A list of vectors of size |S|
                :param all_actions: A list of vectors of size |A|
                :return: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1)), i.e. #columns and #rows are the number of entries in theta
                """
                d = len(theta.flatten())
                F = np.zeros((d, d))
                "*** YOUR CODE HERE ***"
                for i in range(len(all_actions)):
                    grads = get_grad_logp_action(
                        theta, all_observations[i], all_actions[i]).flatten()
                    F += np.outer(grads, grads.T)
                F /= len(all_actions)
                return F

            def compute_natural_gradient(F, grad, reg=1e-4):
                """
                :param F: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1))
                :param grad: A matrix of size |A| * (|S|+1)
                :param reg: A scalar
                :return: A matrix of size |A| * (|S|+1)
                """
                "*** YOUR CODE HERE ***"
                F_inv = np.linalg.inv(F + reg * np.eye(*F.shape))
                natural_grad = F_inv.dot(grad.flatten()).reshape(grad.shape)
                return natural_grad

            def compute_step_size(F, natural_grad, natural_step_size):
                """
                :param F: A matrix of size (|A|*(|S|+1)) * (|A|*(|S|+1))
                :param natural_grad: A matrix of size |A| * (|S|+1)
                :param natural_step_size: A scalar
                :return: A scalar
                """
                "*** YOUR CODE HERE ***"
                natural_grad = natural_grad.flatten()
                step_size = np.sqrt(
                    2 * natural_step_size / natural_grad.T.dot(F).dot(natural_grad))
                return step_size

            test_once(compute_fisher_matrix)
            test_once(compute_natural_gradient)
            test_once(compute_step_size)

            F = compute_fisher_matrix(
                theta=theta,
                get_grad_logp_action=get_grad_logp_action,
                all_observations=all_observations,
                all_actions=all_actions)
            natural_grad = compute_natural_gradient(F, grad)
            step_size = compute_step_size(F, natural_grad, natural_step_size)
            theta += step_size * natural_grad

        if env_id == 'CartPole-v0':
            logits = compute_logits(theta, np.array(all_observations))
            ent = np.mean(compute_entropy(logits))
            perp = np.exp(ent)
            print("Iteration: %d AverageReturn: %.2f Entropy: %.2f Perplexity: %.2f |theta|_2: %.2f"
                  % (itr, np.mean(episode_rewards), ent, perp, np.linalg.norm(theta)))
        else:
            print("Iteration: %d AverageReturn: %.2f |theta|_2: %.2f"
                  % (itr, np.mean(episode_rewards), np.linalg.norm(theta)))
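The Fisher-matrix estimate used in the variants above is just the mean of per-sample outer products, which can also be written as a single matrix product. A small NumPy check with random vectors standing in for the per-sample grad-log-probs (toy data, not part of the lab code):

import numpy as np

rng = np.random.RandomState(0)
N, d = 50, 8
grads = rng.randn(N, d)                   # flattened grad-log-probs, one per sample

F_loop = np.zeros((d, d))
for g in grads:
    F_loop += np.outer(g, g)              # accumulate per-sample outer products
F_loop /= N

F_vec = grads.T.dot(grads) / N            # the same estimate, vectorized

print(np.allclose(F_loop, F_vec))         # expected: True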