def dump_tabular(self):
    """
    Write all of the diagnostics from the current iteration.

    Writes both to stdout, and to the output file.
    """
    if proc_id() == 0:
        vals = []
        key_lens = [len(key) for key in self.log_headers]
        max_key_len = max(15, max(key_lens))
        keystr = '%' + '%d' % max_key_len
        fmt = "| " + keystr + "s | %15s |"
        n_slashes = 22 + max_key_len
        print("-" * n_slashes)
        for key in self.log_headers:
            val = self.log_current_row.get(key, "")
            valstr = "%8.3g" % val if hasattr(val, "__float__") else val
            print(fmt % (key, valstr))
            vals.append(val)
        print("-" * n_slashes)
        if self.output_file is not None:
            if self.first_row:
                self.output_file.write("\t".join(self.log_headers) + "\n")
            self.output_file.write("\t".join(map(str, vals)) + "\n")
            self.output_file.flush()
    self.log_current_row.clear()
    self.first_row = False
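# Usage sketch (an assumption, not part of this module): dump_tabular() is
# meant to be called once per epoch, after a value has been set for every
# header in the current row. The training loops below do this via
# logger.log_tabular(), which is not defined in this section.
#
#   logger = Logger(output_dir='/tmp/experiments/demo', exp_name='demo')
#   for epoch in range(3):
#       logger.log_tabular('Epoch', epoch)
#       logger.log_tabular('EpRet', 100.0 + epoch)
#       logger.dump_tabular()   # prints the table and appends a TSV row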
def save_config(self, config):
    """
    Log an experiment configuration.

    Call this once at the top of your experiment, passing in all important
    config vars as a dict. This will serialize the config to JSON, while
    handling anything which can't be serialized in a graceful way (writing
    as informative a string as possible).

    Example use:

    .. code-block:: python

        logger = EpochLogger(**logger_kwargs)
        logger.save_config(locals())
    """
    config_json = convert_json(config)
    if self.exp_name is not None:
        config_json['exp_name'] = self.exp_name
    if proc_id() == 0:
        output = json.dumps(config_json, separators=(',', ':\t'),
                            indent=4, sort_keys=True)
        print(colorize('Saving config:\n', color='cyan', bold=True))
        print(output)
        with open(osp.join(self.output_dir, "config.json"), 'w') as out:
            out.write(output)
def __init__(self, output_dir=None, output_fname='progress.txt',
             exp_name=None):
    """
    Initialize a Logger.

    Args:
        output_dir (string): A directory for saving results to. If
            ``None``, defaults to a temp directory of the form
            ``/tmp/experiments/somerandomnumber``.

        output_fname (string): Name for the tab-separated-value file
            containing metrics logged throughout a training run.
            Defaults to ``progress.txt``.

        exp_name (string): Experiment name. If you run multiple training
            runs and give them all the same ``exp_name``, the plotter will
            know to group them. (Use case: if you run the same
            hyperparameter configuration with multiple random seeds, you
            should give them all the same ``exp_name``.)
    """
    if proc_id() == 0:
        self.output_dir = output_dir or "/tmp/experiments/%i" % int(time.time())
        if osp.exists(self.output_dir):
            print("Warning: Log dir %s already exists! Storing info there anyway."
                  % self.output_dir)
        else:
            os.makedirs(self.output_dir)
        self.output_file = open(osp.join(self.output_dir, output_fname), 'w')
        atexit.register(self.output_file.close)
        print(colorize("Logging data to %s" % self.output_file.name,
                       'green', bold=True))
    else:
        self.output_dir = None
        self.output_file = None
    self.first_row = True
    self.log_headers = []
    self.log_current_row = {}
    self.exp_name = exp_name
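# Instantiation sketch (directory and experiment name are placeholders):
#
#   logger = Logger(output_dir='/tmp/experiments/demo',
#                   output_fname='progress.txt',
#                   exp_name='demo')
#   # With output_dir=None, a timestamped /tmp/experiments/<int> directory
#   # is created instead.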
def save_state(self, state_dict, model, itr=None):
    """
    Saves the state of an experiment.

    To be clear: this is about saving *state*, not logging diagnostics. All
    diagnostic logging is separate from this function. This function will
    save whatever is in ``state_dict``---usually just a copy of the
    environment---and the most recent copy of the model via ``model``.

    Call with any frequency you prefer. If you only want to maintain a
    single state and overwrite it at each call with the most recent
    version, leave ``itr=None``. If you want to keep all of the states you
    save, provide unique (increasing) values for ``itr``.

    Args:
        state_dict (dict): Dictionary containing essential elements to
            describe the current state of training.

        model (nn.Module): A model which contains the policy.

        itr: An int, or None. Current iteration of training.
    """
    if proc_id() == 0:
        fname = 'vars.pkl' if itr is None else 'vars%d.pkl' % itr
        try:
            joblib.dump(state_dict, osp.join(self.output_dir, fname))
        except Exception:
            self.log('Warning: could not pickle state_dict.', color='red')
        self._torch_save(model, itr)
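# Usage sketch for the two save modes described above (the environment and
# model names are illustrative only):
#
#   logger.save_state({'env': env}, actor_critic)          # overwrite vars.pkl each call
#   logger.save_state({'env': env}, actor_critic, itr=10)  # keep vars10.pkl, vars20.pkl, ...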
def ppo(env_fn, actor_critic=model.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, ent_coeff=0.0, epochs=50, clip=0.2, p_lr=3e-4,
        v_lr=1e-3, ppo_epochs=80, target_kl=0.01, logger_kwargs=dict(),
        save_freq=10):
    """
    Proximal Policy Optimization, implemented with GAE-Lambda advantage
    estimation.
    """
    # Loggers
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Update the seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    local_steps_per_epoch = int(steps_per_epoch / num_procs())

    env = env_fn()
    # Build the model from the constructor passed in as ``actor_critic``
    actor_critic = actor_critic(env.observation_space.shape,
                                action_space=env.action_space, **ac_kwargs)
    rb = ReplayBuffer(local_steps_per_epoch, env.observation_space.shape,
                      env.action_space.shape)

    # Number of parameters
    var_counts = tuple(
        sum(p.numel() for p in module.parameters() if p.requires_grad)
        for module in [actor_critic.p_net, actor_critic.v_net])
    logger.log('Number of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Optimizers
    p_optimizer = torch.optim.Adam(actor_critic.p_net.parameters(), lr=p_lr)
    v_optimizer = torch.optim.Adam(actor_critic.v_net.parameters(), lr=v_lr)
    sync_all_params(actor_critic.parameters())

    # Initializations
    ep_ret, ep_len, r, val, done = 0, 0, 0, 0, False
    start_time = time.time()

    def update():
        actor_critic.train()
        o, logp_old, a, _, rew2g, val, adv = rb.read()
        for i in range(ppo_epochs):
            _, logp, _ = actor_critic.p_net(o, a)
            ratio = (logp - logp_old).exp()  # Ratio of policies
            ent = (-logp).mean()             # Entropy
            kl = (logp_old - logp).mean()
            kl = mpi_avg(kl.item())
            if kl > 1.5 * target_kl:
                # Put an additional KL-divergence limit between consecutive policies
                logger.log("Early stopping at step %d due to reaching max KL-divergence" % i)
                break
            p_loss = -torch.min(ratio * adv,
                                torch.clamp(ratio, 1 - clip, 1 + clip) * adv).mean()
            p_loss = p_loss - ent * ent_coeff
            p_optimizer.zero_grad()
            p_loss.backward()
            average_gradients(p_optimizer.param_groups)
            p_optimizer.step()

            val = actor_critic.v_net(o)
            v_loss = (val - rew2g).pow(2).mean()
            v_optimizer.zero_grad()
            v_loss.backward()
            average_gradients(v_optimizer.param_groups)
            v_optimizer.step()

    # Agent in the Wild
    for epoch in range(epochs):
        obs = env.reset()
        actor_critic.eval()
        for t in range(local_steps_per_epoch):
            # env.render()
            a, _, logp, v = actor_critic(torch.Tensor(obs.reshape(1, -1)))
            rb.write(obs, logp, a, r, v)
            obs, r, done, _ = env.step(a.detach().numpy()[0])
            ep_ret += r
            ep_len += 1

            # Do not lose r, v at terminal states
            if done or t == local_steps_per_epoch - 1:
                v_d = r if done else actor_critic.v_net(
                    torch.Tensor(obs.reshape(1, -1))).item()
                rb.calc_adv(v_d)
                if done:
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                obs, r, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, actor_critic, None)

        update()

        # Logger Monitor
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
    env.close()
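# Standalone illustration of the clipped-surrogate objective computed inside
# update() above. This is a sketch with made-up numbers, not part of the
# algorithm's data path: it only shows how the ratio of new to old
# log-probabilities is clamped to [1 - clip, 1 + clip] before taking the
# pessimistic minimum against the unclipped term.
def _clipped_surrogate_demo():
    import torch
    clip = 0.2
    logp_old = torch.tensor([-1.0, -0.5, -2.0])  # log-probs under the old policy
    logp = torch.tensor([-0.8, -0.9, -1.5])      # log-probs under the current policy
    adv = torch.tensor([1.0, -2.0, 0.5])         # advantage estimates
    ratio = (logp - logp_old).exp()
    p_loss = -torch.min(ratio * adv,
                        torch.clamp(ratio, 1 - clip, 1 + clip) * adv).mean()
    return p_loss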
def dqn(env_fn, actor_critic=model.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=50, clip=0.2, p_lr=3e-4, v_lr=1e-3,
        ppo_epochs=80, target_kl=0.01, logger_kwargs=dict(), save_freq=10):
    """
    Deep Q-Network variant implemented with GAE-Lambda advantage estimation.

    Note: the ``update`` step below uses the same clipped-surrogate
    (PPO-style) objective as ``ppo`` above; the ``clip``, ``ppo_epochs``,
    and ``target_kl`` defaults mirror that function.
    """
    # Loggers
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Update the seed
    seed += 1000 * proc_id()
    torch.manual_seed(seed)

    local_steps_per_epoch = int(steps_per_epoch / num_procs())

    env = env_fn()
    # Build the model from the constructor passed in as ``actor_critic``
    actor_critic = actor_critic(env.observation_space.shape,
                                action_space=env.action_space, **ac_kwargs)
    rb = ReplayBuffer(local_steps_per_epoch, env.observation_space.shape,
                      env.action_space.shape)

    # Optimizers
    p_optimizer = torch.optim.Adam(actor_critic.p_net.parameters(), lr=p_lr)
    v_optimizer = torch.optim.Adam(actor_critic.v_net.parameters(), lr=v_lr)
    sync_all_params(actor_critic.parameters())

    # Initializations
    ep_ret, ep_len, r, val, done = 0, 0, 0, 0, False
    start_time = time.time()

    def update():
        o, logp_old, a, _, rew2g, val, adv = rb.read(on_policy=True)
        for i in range(ppo_epochs):
            _, logp, _, val = actor_critic(o, a)
            ratio = (logp - logp_old).exp()  # Ratio of policies
            kl = (logp_old - logp).mean()
            kl = mpi_avg(kl.item())
            if kl > 1.5 * target_kl:
                # Put an additional KL-divergence limit between consecutive policies
                break
            p_loss = -torch.min(ratio * adv,
                                torch.clamp(ratio, 1 - clip, 1 + clip) * adv).mean()
            p_optimizer.zero_grad()
            p_loss.backward()
            p_optimizer.step()

            v_loss = (val - rew2g).pow(2).mean()
            v_optimizer.zero_grad()
            v_loss.backward()
            v_optimizer.step()

    # Agent in the Wild
    for epoch in range(epochs):
        obs = env.reset()
        for t in range(local_steps_per_epoch):
            # env.render()
            a, _, logp, v = actor_critic(torch.Tensor(obs))
            rb.write(obs, logp, a, r, v)
            obs, r, done, _ = env.step(a.detach().numpy())
            ep_ret += r
            ep_len += 1

            # Do not lose r, v at terminal states
            if done or t == local_steps_per_epoch - 1:
                v_d = r if done else actor_critic.v_net(torch.Tensor(obs)).item()
                rb.calc_adv(v_d)
                if done:
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                obs, r, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, actor_critic, None)

        update()

        # Logger Monitor
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
    env.close()
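# Hypothetical launch sketch for either training loop above. It assumes gym is
# installed and that the module-level names used above (model, EpochLogger,
# ReplayBuffer, the MPI helpers) are importable; the environment id, output
# directory, and experiment name are placeholders, not values from this repo.
#
#   import gym
#   ppo(lambda: gym.make('Pendulum-v0'),
#       steps_per_epoch=4000, epochs=50,
#       logger_kwargs=dict(output_dir='/tmp/experiments/ppo_demo',
#                          exp_name='ppo_demo'))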
def log(self, msg, color='green'):
    """Print a colorized message to stdout."""
    if proc_id() == 0:
        print(colorize(msg, color, bold=True))
def _torch_save(self, model, itr=None):
    """Save the model to the output directory via ``torch.save``."""
    if proc_id() == 0:
        fname = 'torch_save.pt' if itr is None else 'torch_save%d.pt' % itr
        torch.save(model, osp.join(self.output_dir, fname))