def dump_tabular(self):
    """
    Write all of the diagnostics from the current iteration.

    Writes both to stdout, and to the output file.
    """
    if proc_id() == 0:
        vals = []
        key_lens = [len(key) for key in self.log_headers]
        max_key_len = max(15, max(key_lens))
        keystr = '%' + '%d' % max_key_len
        fmt = "| " + keystr + "s | %15s |"
        n_slashes = 22 + max_key_len
        print("-" * n_slashes)
        for key in self.log_headers:
            val = self.log_current_row.get(key, "")
            valstr = "%8.3g" % val if hasattr(val, "__float__") else val
            print(fmt % (key, valstr))
            vals.append(val)
        print("-" * n_slashes)
        if self.output_file is not None:
            if self.first_row:
                self.output_file.write("\t".join(self.log_headers) + "\n")
            self.output_file.write("\t".join(map(str, vals)) + "\n")
            self.output_file.flush()
    self.log_current_row.clear()
    self.first_row = False
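# Typical per-epoch usage (a minimal sketch; `logger` is the logger instance and
# `epoch` the loop variable, exactly as in the training loops further below):
#
#     logger.log_tabular('Epoch', epoch)
#     logger.log_tabular('EpRet', with_min_and_max=True)
#     logger.log_tabular('LossPi', average_only=True)
#     logger.dump_tabular()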
def save_state(self, state_dict, model, itr=None):
    """
    Saves the state of an experiment.

    To be clear: this is about saving *state*, not logging diagnostics.
    All diagnostic logging is separate from this function. This function
    will save whatever is in ``state_dict``---usually just a copy of the
    environment---and the most recent model parameters.

    Call with any frequency you prefer. If you only want to maintain a
    single state and overwrite it at each call with the most recent
    version, leave ``itr=None``. If you want to keep all of the states you
    save, provide unique (increasing) values for ``itr``.

    Args:
        state_dict (dict): Dictionary containing essential elements to
            describe the current state of training.

        model: The PyTorch model whose parameters should be saved.

        itr: An int, or None. Current iteration of training.
    """
    if proc_id() == 0:
        fname = 'vars.pkl' if itr is None else 'vars%d.pkl' % itr
        try:
            joblib.dump(state_dict, osp.join(self.output_dir, fname))
        except Exception:
            self.log('Warning: could not pickle state_dict.', color='red')
        self.torch_simple_save(model, itr)
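# Saving and restoring state (a minimal sketch; the file name follows the
# vars<itr>.pkl pattern produced above, and joblib is assumed available since
# it is used for the dump):
#
#     logger.save_state({'env': env}, net, itr=10)   # writes vars10.pkl
#     state = joblib.load(osp.join(logger.output_dir, 'vars10.pkl'))
#     env = state['env']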
def save_config(self, config):
    """
    Log an experiment configuration.

    Call this once at the top of your experiment, passing in all important
    config vars as a dict. This will serialize the config to JSON, while
    handling anything which can't be serialized in a graceful way (writing
    as informative a string as possible).

    Example use:

    .. code-block:: python

        logger = EpochLogger(**logger_kwargs)
        logger.save_config(locals())
    """
    config_json = convert_json(config)
    if self.exp_name is not None:
        config_json['exp_name'] = self.exp_name
    if proc_id() == 0:
        output = json.dumps(config_json,
                            separators=(',', ':\t'),
                            indent=4,
                            sort_keys=True)
        print(colorize('Saving config:\n', color='cyan', bold=True))
        print(output)
        with open(osp.join(self.output_dir, "config.json"), 'w') as out:
            out.write(output)
def torch_simple_save(self, model, itr=None):
    """
    Saves a trained model with ``torch.save``.
    """
    if proc_id() == 0:
        fpath = 'simple_save' + ('%d.pth' % itr if itr is not None else '.pth')
        fpath = osp.join(self.output_dir, fpath)
        torch.save(model, fpath)
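# Loading the saved model back (a minimal sketch; the '.pth' filename follows
# the pattern produced above, and torch.save/torch.load here serialize the
# whole module object):
#
#     net = torch.load(osp.join(logger.output_dir, 'simple_save.pth'))
#     net.eval()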
def __init__(self, output_dir=None, output_fname='progress.txt', exp_name=None):
    """
    Initialize a Logger.

    Args:
        output_dir (string): A directory for saving results to. If
            ``None``, defaults to a temp directory of the form
            ``/tmp/experiments/somerandomnumber``.

        output_fname (string): Name for the tab-separated-value file
            containing metrics logged throughout a training run. Defaults
            to ``progress.txt``.

        exp_name (string): Experiment name. If you run multiple training
            runs and give them all the same ``exp_name``, the plotter will
            know to group them. (Use case: if you run the same
            hyperparameter configuration with multiple random seeds, you
            should give them all the same ``exp_name``.)
    """
    if proc_id() == 0:
        self.output_dir = output_dir or "/tmp/experiments/%i" % int(time.time())
        if osp.exists(self.output_dir):
            print("Warning: Log dir %s already exists! Storing info there anyway."
                  % self.output_dir)
        else:
            os.makedirs(self.output_dir)
        self.output_file = open(osp.join(self.output_dir, output_fname), 'w')
        atexit.register(self.output_file.close)
        print(colorize("Logging data to %s" % self.output_file.name,
                       'green', bold=True))
    else:
        self.output_dir = None
        self.output_file = None
    self.first_row = True
    self.log_headers = []
    self.log_current_row = {}
    self.exp_name = exp_name
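# Constructing a logger directly (a minimal sketch; the directory name is
# hypothetical, and passing output_dir=None instead falls back to a
# /tmp/experiments/<timestamp> directory as described above):
#
#     logger = Logger(output_dir='/tmp/experiments/my_run', exp_name='my_run')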
def log(self, msg, color='green'):
    """Print a colorized message to stdout."""
    if proc_id() == 0:
        print(colorize(msg, color, bold=True))
def trpo(env_fn,
         actor_critic=core.Actor_Critic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=4000,
         epochs=50,
         gamma=0.99,
         delta=0.01,
         vf_lr=1e-3,
         train_v_iters=80,
         damping_coeff=0.1,
         cg_iters=10,
         backtrack_iters=10,
         backtrack_coeff=0.8,
         lam=0.97,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=10,
         algo='trpo'):
    """
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's computation graph:

            ============  ================  ========================================
            Symbol        Shape             Description
            ============  ================  ========================================
            ``pi``        (batch, act_dim)  | Samples actions from policy given
                                            | states.
            ``logp``      (batch,)          | Gives log probability, according to
                                            | the policy, of taking actions ``a_ph``
                                            | in states ``x_ph``.
            ``logp_pi``   (batch,)          | Gives log probability, according to
                                            | the policy, of the action sampled by
                                            | ``pi``.
            ``info``      N/A               | A dict of any intermediate quantities
                                            | (from calculating the policy or log
                                            | probabilities) which are needed for
                                            | analytically computing KL divergence.
                                            | (eg sufficient statistics of the
                                            | distributions)
            ``info_phs``  N/A               | A dict of placeholders for old values
                                            | of the entries in ``info``.
            ``d_kl``      ()                | A symbol for computing the mean KL
                                            | divergence between the current policy
                                            | (``pi``) and the old policy (as
                                            | specified by the inputs to
                                            | ``info_phs``) over the batch of
                                            | states given in ``x_ph``.
            ``v``         (batch,)          | Gives the value estimate for states
                                            | in ``x_ph``. (Critical: make sure
                                            | to flatten this!)
            ============  ================  ========================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to TRPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action
            pairs) for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        delta (float): KL-divergence limit for TRPO / NPG update.
            (Should be small for stability. Values like 0.01, 0.05.)

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        damping_coeff (float): Artifact for numerical stability, should be
            smallish. Adjusts Hessian-vector product calculation:

            .. math:: Hv \\rightarrow (\\alpha I + H)v

            where :math:`\\alpha` is the damping coefficient.
            Probably don't play with this hyperparameter.

        cg_iters (int): Number of iterations of conjugate gradient to perform.
            Increasing this will lead to a more accurate approximation
            to :math:`H^{-1} g`, and possibly slightly-improved performance,
            but at the cost of slowing things down.
            Also probably don't play with this hyperparameter.

        backtrack_iters (int): Maximum number of steps allowed in the
            backtracking line search. Since the line search usually doesn't
            backtrack, and usually only steps back once when it does, this
            hyperparameter doesn't often matter.

        backtrack_coeff (float): How far back to step during backtracking line
            search. (Always between 0 and 1, usually above 0.5.)

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        algo: Either 'trpo' or 'npg': this code supports both, since they
            are almost the same.
    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    if isinstance(env.action_space, Discrete):
        info_shapes = {'logp_all': [env.action_space.n]}
    else:
        info_shapes = {'mu': [env.action_space.shape[0]]}
    buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes,
                    gamma, lam)

    # Make core of policy network
    net = actor_critic(obs_dim[0], **ac_kwargs)
    print(net)

    # Loss function
    criterion_mse = nn.MSELoss()

    # Optimizer (only the critic gets a gradient-descent optimizer; the policy
    # is updated by the TRPO / NPG step below)
    optimizer_critic = optim.Adam(net.critic.parameters(), lr=vf_lr)

    # Sync params across processes
    sync_all_params(net.parameters())

    # Count variables
    # var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    # logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    def cg(Ax, b):
        """
        Conjugate gradient algorithm
        (see https://en.wikipedia.org/wiki/Conjugate_gradient_method)
        """
        x = np.zeros_like(b)
        # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0.
        # Change if doing warm start.
        r = b.copy()
        p = r.copy()
        r_dot_old = np.dot(r, r)
        for _ in range(cg_iters):
            z = Ax(p)
            alpha = r_dot_old / (np.dot(p, z) + EPS)
            x += alpha * p
            r -= alpha * z
            r_dot_new = np.dot(r, r)
            p = r + (r_dot_new / r_dot_old) * p
            r_dot_old = r_dot_new
        return x

    def update():
        net.train()
        inputs = [torch.from_numpy(x) for x in buf.get()]
        # Main outputs from computation graph, plus old pdist info (for KL)
        x_ph, a_ph, adv_ph, ret_ph, logp_old_ph = inputs[:5]
        _, logp, _, _, d_kl = net.apply_actor(x_ph, a_ph,
                                              old_logp_or_mu=inputs[-1])
        v = net.apply_critic(x_ph)

        # TRPO losses
        ratio = torch.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
        pi_l_old = -torch.mean(ratio * adv_ph)
        v_l_old = criterion_mse(v, ret_ph)

        # Prepare hessian func, gradient eval
        g = core.flat_grad(pi_l_old, net.actor.parameters(), retain_graph=True)
        g = mpi_avg(g.numpy())
        pi_l_old = mpi_avg(pi_l_old.item())

        def Hx(x):
            x = torch.from_numpy(x)
            hvp = core.hessian_vector_product(d_kl, net.actor, x)
            if damping_coeff > 0:
                hvp += damping_coeff * x
            return mpi_avg(hvp.detach().numpy())

        # Core calculations for TRPO or NPG
        x = cg(Hx, g)
        alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS))
        old_params = parameters_to_vector(net.actor.parameters())
        x = torch.from_numpy(x)

        def set_and_eval(step):
            vector_to_parameters(old_params - alpha * x * step,
                                 net.actor.parameters())
            _, logp, _, _, d_kl = net.apply_actor(x_ph, a_ph,
                                                  old_logp_or_mu=inputs[-1])
            ratio = torch.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
            pi_loss = -torch.mean(ratio * adv_ph)
            return mpi_avg(d_kl.item()), mpi_avg(pi_loss.item())

        if algo == 'npg':
            # NPG has no backtracking or hard KL constraint enforcement
            kl, pi_l_new = set_and_eval(step=1.)
        elif algo == 'trpo':
            # TRPO augments NPG with backtracking line search, hard KL constraint
            for j in range(backtrack_iters):
                kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
                if kl <= delta and pi_l_new <= pi_l_old:
                    logger.log('Accepting new params at step %d of line search.' % j)
                    logger.store(BacktrackIters=j)
                    break
                if j == backtrack_iters - 1:
                    logger.log('Line search failed! Keeping old params.')
                    logger.store(BacktrackIters=j)
                    kl, pi_l_new = set_and_eval(step=0.)

        # Value function learning
        for _ in range(train_v_iters):
            v = net.apply_critic(x_ph)
            v_loss = criterion_mse(v, ret_ph)
            optimizer_critic.zero_grad()
            v_loss.backward()
            average_gradients(optimizer_critic.param_groups)
            optimizer_critic.step()

        # Log changes from update
        with torch.no_grad():
            net.eval()
            v = net.apply_critic(x_ph)
            v_l_new = criterion_mse(v, ret_ph)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old.item(),
                     KL=kl,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old).item())

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            x_ph = torch.from_numpy(o[np.newaxis].astype(np.float32))
            with torch.no_grad():
                net.eval()
                a, _, logp_t, info_t, _ = net.apply_actor(x_ph)
                v_t = net.apply_critic(x_ph)

            # Save and log
            a = a.numpy()[0]
            v_t = v_t.data.numpy()
            logp_t = logp_t.data.numpy()
            ot = o.copy()
            buf.store(o, a, r, v_t, logp_t, info_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a)
            # buf.store(ot, a, r, v_t, logp_t, info_t)
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.'
                          % ep_len)
                # If trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else v_t
                # if d:
                #     last_val = 0
                # else:
                #     with torch.no_grad():
                #         net.eval()
                #         x_ph = torch.from_numpy(o[np.newaxis].astype(np.float32))
                #         v_t = net.apply_critic(x_ph)
                #         last_val = v_t.detach().numpy()
                buf.finish_path(last_val)
                if terminal:
                    # Only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, net, None)

        # Perform TRPO or NPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('KL', average_only=True)
        if algo == 'trpo':
            logger.log_tabular('BacktrackIters', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
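# Example call (a minimal usage sketch; assumes gym is installed and that
# core.Actor_Critic accepts the observation dimension as its first argument,
# as used above; the output_dir is hypothetical):
#
#     import gym
#     trpo(lambda: gym.make('CartPole-v0'),
#          actor_critic=core.Actor_Critic,
#          gamma=0.99, seed=0, steps_per_epoch=4000, epochs=50,
#          logger_kwargs=dict(output_dir='/tmp/experiments/trpo_run'))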
def vpg(env_fn,
        actor_critic=core.Actor_Critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        pi_lr=1e-3,
        vf_lr=1e-3,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=10):
    """
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to VPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action
            pairs) for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Make core of policy network
    net = actor_critic(obs_dim[0], **ac_kwargs)
    print(net)

    # Loss function
    criterion_mse = nn.MSELoss()

    # Optimizers
    optimizer_actor = optim.Adam(net.actor.parameters(), lr=pi_lr)
    optimizer_critic = optim.Adam(net.critic.parameters(), lr=vf_lr)

    # Sync params across processes
    sync_all_params(net.parameters())

    # Count variables
    # var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    # logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    def update():
        obs_buf, act_buf, adv_buf, ret_buf, logp_buf = buf.get()
        net.train()

        # Update actor
        x_ph = torch.from_numpy(obs_buf)
        a_ph = torch.from_numpy(act_buf)
        adv_ph = torch.from_numpy(adv_buf)
        _, logp, _ = net.apply_actor(x_ph, a_ph)
        pi_loss = -torch.mean(logp * adv_ph)
        optimizer_actor.zero_grad()
        pi_loss.backward()
        average_gradients(optimizer_actor.param_groups)
        optimizer_actor.step()
        pi_l_old = pi_loss
        # A sample estimate for entropy, also easy to compute
        approx_ent = torch.mean(-logp)

        # Value function learning
        ret_ph = torch.from_numpy(ret_buf)
        v_ph = net.apply_critic(x_ph)
        v_l_old = criterion_mse(v_ph, ret_ph)
        for _ in range(train_v_iters):
            v_ph = net.apply_critic(x_ph)
            v_loss = criterion_mse(v_ph, ret_ph)
            optimizer_critic.zero_grad()
            v_loss.backward()
            average_gradients(optimizer_critic.param_groups)
            optimizer_critic.step()

        # Log changes from update
        with torch.no_grad():
            net.eval()
            _, logp, _ = net.apply_actor(x_ph, a_ph)
            v_ph = net.apply_critic(x_ph)
            pi_l_new = -torch.mean(logp * adv_ph)
            v_l_new = criterion_mse(v_ph, ret_ph)

        # Info (useful to watch during learning)
        logp_old_ph = torch.from_numpy(logp_buf)
        # A sample estimate for KL-divergence, easy to compute
        approx_kl = torch.mean(logp_old_ph - logp)
        logger.store(LossPi=pi_l_old.item(),
                     LossV=v_l_old.item(),
                     KL=approx_kl.item(),
                     Entropy=approx_ent.item(),
                     DeltaLossPi=(pi_l_new - pi_l_old).item(),
                     DeltaLossV=(v_l_new - v_l_old).item())

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            x_ph = torch.from_numpy(o[np.newaxis].astype(np.float32))
            with torch.no_grad():
                net.eval()
                a, _, logp_t = net.apply_actor(x_ph)
                v_t = net.apply_critic(x_ph)

            # Save and log
            a = a.numpy()[0]
            v_t = v_t.data.numpy()
            logp_t = logp_t.data.numpy()
            ot = o.copy()
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.'
                          % ep_len)
                # If trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else v_t
                # if d:
                #     last_val = 0
                # else:
                #     with torch.no_grad():
                #         net.eval()
                #         x_ph = torch.from_numpy(o[np.newaxis].astype(np.float32))
                #         v_t = net.apply_critic(x_ph)
                #         last_val = v_t.detach().numpy()
                buf.finish_path(last_val)
                if terminal:
                    # Only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, net, None)

        # Perform VPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
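# Example call (a minimal usage sketch; assumes gym is installed and an
# environment compatible with core.Actor_Critic and VPGBuffer; the output_dir
# is hypothetical):
#
#     import gym
#     vpg(lambda: gym.make('CartPole-v0'),
#         actor_critic=core.Actor_Critic,
#         gamma=0.99, seed=0, steps_per_epoch=4000, epochs=50,
#         logger_kwargs=dict(output_dir='/tmp/experiments/vpg_run'))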