def setup_pytorch_for_mpi():
    """
    Avoid slowdowns caused by each separate process's PyTorch using
    more than its fair share of CPU resources.
    """
    print('Process %d: Reporting original number of Torch threads as %d.' %
          (proc_id(), torch.get_num_threads()), flush=True)
    if torch.get_num_threads() == 1:
        return
    fair_num_threads = max(int(torch.get_num_threads() / num_procs()), 1)
    torch.set_num_threads(fair_num_threads)
    print('Process %d: Reporting new number of Torch threads as %d.' %
          (proc_id(), torch.get_num_threads()), flush=True)
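# Usage note (not from the original snippet): this is meant to be called once per
# MPI worker, right after the workers are forked and before any model is built,
# e.g. `mpi_fork(4)` followed by `setup_pytorch_for_mpi()` in a Spinning Up style
# launcher. `mpi_fork` is an assumption here; it is expected to live alongside
# `proc_id` and `num_procs` in the same MPI utilities module.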
def __init__(self, output_dir=None, output_fname='progress.txt', exp_name=None):
    """
    Initialize a Logger.

    Args:
        output_dir (string): A directory for saving results to. If
            ``None``, defaults to a temp directory of the form
            ``/tmp/experiments/somerandomnumber``.

        output_fname (string): Name for the tab-separated-value file
            containing metrics logged throughout a training run.
            Defaults to ``progress.txt``.

        exp_name (string): Experiment name. If you run multiple training
            runs and give them all the same ``exp_name``, the plotter
            will know to group them. (Use case: if you run the same
            hyperparameter configuration with multiple random seeds, you
            should give them all the same ``exp_name``.)
    """
    if proc_id() == 0:
        self.output_dir = output_dir or "/tmp/experiments/%i" % int(time.time())
        if osp.exists(self.output_dir):
            print("Warning: Log dir %s already exists! Storing info there anyway." % self.output_dir)
        else:
            os.makedirs(self.output_dir)
        self.output_file = open(osp.join(self.output_dir, output_fname), 'w')
        atexit.register(self.output_file.close)
        print(colorize("Logging data to %s" % self.output_file.name, 'green', bold=True))
    else:
        self.output_dir = None
        self.output_file = None
    self.first_row = True
    self.log_headers = []
    self.log_current_row = {}
    self.exp_name = exp_name
def save_state(self, state_dict, itr=None):
    """
    Saves the state of an experiment.

    To be clear: this is about saving *state*, not logging diagnostics.
    All diagnostic logging is separate from this function. This function
    will save whatever is in ``state_dict``---usually just a copy of the
    environment---and the most recent parameters for the model you
    previously set up saving for with ``setup_tf_saver``.

    Call with any frequency you prefer. If you only want to maintain a
    single state and overwrite it at each call with the most recent
    version, leave ``itr=None``. If you want to keep all of the states
    you save, provide unique (increasing) values for ``itr``.

    Args:
        state_dict (dict): Dictionary containing essential elements to
            describe the current state of training.

        itr: An int, or None. Current iteration of training.
    """
    if proc_id() == 0:
        fname = 'vars.pkl' if itr is None else 'vars%d.pkl' % itr
        try:
            joblib.dump(state_dict, osp.join(self.output_dir, fname))
        except:
            self.log('Warning: could not pickle state_dict.', color='red')
        if hasattr(self, 'tf_saver_elements'):
            self._tf_simple_save(itr)
        if hasattr(self, 'pytorch_saver_elements'):
            self._pytorch_simple_save(itr)
def dump_tabular(self):
    """
    Write all of the diagnostics from the current iteration.

    Writes both to stdout, and to the output file.
    """
    if proc_id() == 0:
        vals = []
        key_lens = [len(key) for key in self.log_headers]
        max_key_len = max(15, max(key_lens))
        keystr = '%' + '%d' % max_key_len
        fmt = "| " + keystr + "s | %15s |"
        n_slashes = 22 + max_key_len
        print("-" * n_slashes)
        for key in self.log_headers:
            val = self.log_current_row.get(key, "")
            valstr = "%8.3g" % val if hasattr(val, "__float__") else val
            print(fmt % (key, valstr))
            vals.append(val)
        print("-" * n_slashes, flush=True)
        if self.output_file is not None:
            if self.first_row:
                self.output_file.write("\t".join(self.log_headers) + "\n")
            self.output_file.write("\t".join(map(str, vals)) + "\n")
            self.output_file.flush()
    self.log_current_row.clear()
    self.first_row = False
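# Typical per-epoch logging cycle (a usage sketch, not part of the snippet above;
# it assumes the EpochLogger-style `store`/`log_tabular` API used by the training
# loops further down):
#
#     logger.store(EpRet=ep_ret, EpLen=ep_len)      # accumulate values during rollout
#     logger.log_tabular('Epoch', epoch)            # set a value for this row
#     logger.log_tabular('EpRet', with_min_and_max=True)  # reduce everything stored under 'EpRet'
#     logger.dump_tabular()                         # print the table, append a row to progress.txt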
def log(self, msg, color='green'):
    """Print a colorized message to stdout."""
    if self.quiet:
        return
    if proc_id() == 0:
        print(colorize(msg, color, bold=True))
def _pytorch_simple_save(self, itr=None, pytorch_save=None):
    """
    Saves the PyTorch model (or models).
    """
    if proc_id() == 0:
        assert hasattr(self, 'pytorch_saver_elements'), \
            "First have to setup saving with self.setup_pytorch_saver"
        fpath = PYTORCH_SAVE_DIR
        fpath = osp.join(self.output_dir, fpath)
        fname = 'model' + ('%d' % itr if itr is not None else '') + '.pt'
        fname = osp.join(fpath, fname)
        os.makedirs(fpath, exist_ok=True)
        if pytorch_save is not None:
            torch.save(pytorch_save, fname)
        else:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                # We are using a non-recommended way of saving PyTorch models,
                # by pickling whole objects (which are dependent on the exact
                # directory structure at the time of saving) as opposed to
                # just saving network weights. This works sufficiently well
                # for the purposes of Spinning Up, but you may want to do
                # something different for your personal PyTorch project.
                # We use a catch_warnings() context to avoid the warnings about
                # not being able to save the source code.
                torch.save(self.pytorch_saver_elements, fname)
def _save_snapshots(self, best_category='', is_pytorch=True):
    if proc_id() != 0 or not self.num_snapshots_to_keep:
        return
    now = time.time()
    if best_category:
        save_snapshot = True
        snapshots = self.best_model_snapshots[best_category]
        snapshots_dir = join(self.output_dir, f'best_{best_category}')
    else:
        snapshots = self.timed_snapshots
        snapshots_dir = join(self.output_dir, 'snapshots')
        if not self.timed_snapshots:
            save_snapshot = True
        else:
            prev_time, _prev_dir = self.timed_snapshots[-1]
            save_snapshot = prev_time < (now - self.snapshot_save_freq_mins * 60)
    if save_snapshot:
        if len(snapshots) == snapshots.maxlen:
            # Roll snapshots
            _old_time, old_dir = snapshots.popleft()
            shutil.rmtree(old_dir)
        new_dir = join(snapshots_dir, get_date_str())
        os.makedirs(new_dir)
        if is_pytorch:
            dirs = [PYTORCH_SAVE_DIR]
        else:
            dirs = [TF_MODEL_ONLY_DIR, TF_SIMPLE_SAVE_DIR]
        for dir_ in dirs:
            curr_dir = join(self.output_dir, dir_)
            shutil.copytree(curr_dir, join(new_dir, dir_))
        snapshots.append((now, new_dir))
def save_config(self, config):
    """
    Log an experiment configuration.

    Call this once at the top of your experiment, passing in all important
    config vars as a dict. This will serialize the config to JSON, while
    handling anything which can't be serialized in a graceful way (writing
    as informative a string as possible).

    Example use:

    .. code-block:: python

        logger = EpochLogger(**logger_kwargs)
        logger.save_config(locals())
    """
    config_json = convert_json(config)
    if self.exp_name is not None:
        config_json['exp_name'] = self.exp_name
    if proc_id() == 0:
        output = json.dumps(config_json, separators=(',', ':\t'), indent=4, sort_keys=True)
        print(colorize('Saving config:\n', color='cyan', bold=True))
        print(output)
        with open(osp.join(self.output_dir, "config.json"), 'w') as out:
            out.write(output)
def __init__(self,
             env_maker: Callable,
             ac_maker=core.MLPActorCritic,
             ac_kwargs={},
             seed: int = 0,
             epochs: int = 50,
             steps_per_epoch: int = 4000,
             gamma: float = 0.99,
             actor_lr: float = 3e-4,
             critic_lr: float = 1e-3,
             num_iter_train_critic: int = 80,
             lam: float = 0.97,
             max_episode_len: int = 1000,
             logger_kwargs=dict(),
             save_freq: int = 10):
    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    self.logger = EpochLogger(**logger_kwargs)
    self.logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    self.epochs = epochs
    self.steps_per_epoch = steps_per_epoch
    self.num_iter_train_critic = num_iter_train_critic
    self.max_episode_len = max_episode_len
    self.save_freq = save_freq

    # make env
    self.env = env_maker()
    self.obs_dim = self.env.observation_space.shape
    self.act_dim = self.env.action_space.shape

    # make actor-critic
    self.ac = ac_maker(self.env.observation_space, self.env.action_space, **ac_kwargs)

    # make buffer
    self.local_steps_per_epoch = int(steps_per_epoch / num_procs())
    self.buffer = Buffer(self.obs_dim, self.act_dim, self.local_steps_per_epoch, gamma, lam)

    # make optimizers
    self.actor_optimizer = Adam(self.ac.actor.parameters(), lr=actor_lr)
    self.critic_optimizer = Adam(self.ac.critic.parameters(), lr=critic_lr)

    # Sync params across processes
    sync_params(self.ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [self.ac.actor, self.ac.critic])
    self.logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up model saving
    self.logger.setup_pytorch_saver(self.ac)
def _torch_simple_save(self, model, itr=None):
    """
    Saves a trained PyTorch model with ``torch.save`` to a
    ``simple_save``-style path in the output directory.
    """
    if proc_id() == 0:
        fpath = 'simple_save' + ('%d' % itr if itr is not None else '')
        fpath = osp.join(self.output_dir, fpath)
        torch.save(model, fpath)
def sync_params(module):
    """ Sync all parameters of module across all MPI processes. """
    if num_procs() == 1:
        return
    for p in module.parameters():
        p_data = p.data.cpu()
        p_numpy = p_data.numpy()
        broadcast(p_numpy)
        if p.device.type != 'cpu' and proc_id() != 0:
            p.data.copy_(p_data)  # copy parameters back to GPU
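# The training loops further down also call `mpi_avg_grads(...)` after each
# backward pass. That helper is not included in this collection; the following is
# a minimal sketch of what it might look like, assuming the same `mpi_avg` and
# `num_procs` utilities used above and CPU-resident gradients.

def mpi_avg_grads(module):
    """Average the gradient buffers of ``module`` across all MPI processes."""
    if num_procs() == 1:
        return
    for p in module.parameters():
        p_grad_numpy = p.grad.numpy()    # numpy view of the gradient tensor
        avg_p_grad = mpi_avg(p.grad)     # element-wise mean over processes
        p_grad_numpy[:] = avg_p_grad[:]  # write the averaged values back in place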
def save_env(self, epoch):
    self.train_returns.append(np.mean(self.logger.epoch_dict['EpRet']))
    if epoch > 0 and epoch % self.save_freq == 0 and proc_id() == 0:
        # returns, lengths = run_policy(self.env, self.ac)
        # avg_return = np.mean(returns)
        # self.test_returns.append(avg_return)
        # self.test_lengths.append(np.mean(lengths))
        #
        # if avg_return > self.max_return:
        self.logger.save_state({'env': self.env}, epoch)
        generate_train_graph(self.train_returns, self.train_graph_path)
        # self.obs = self.env.reset()
    if epoch == self.epochs - 1 and proc_id() == 0:
        self.logger.save_state({'env': self.env}, epoch)
def _tf_simple_save(self, itr=None):
    """
    Uses simple_save to save a trained model, plus info to make it easy
    to associate tensors to variables after restore.
    """
    if proc_id() == 0:
        assert hasattr(self, 'tf_saver_elements'), \
            "First have to setup saving with self.setup_tf_saver"
        fpath = 'simple_save' + ('%d' % itr if itr is not None else '')
        fpath = osp.join(self.output_dir, fpath)
        if osp.exists(fpath):
            # simple_save refuses to be useful if fpath already exists,
            # so just delete fpath if it's there.
            shutil.rmtree(fpath)
        tf.saved_model.simple_save(export_dir=fpath, **self.tf_saver_elements)
        joblib.dump(self.tf_saver_info, osp.join(fpath, 'model_info.pkl'))
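# The `tf_saver_elements` / `tf_saver_info` attributes used above are populated by
# `setup_tf_saver`, which is not included in this collection. A minimal sketch,
# consistent with how the TF training code below calls it
# (`logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})`),
# might look like this:

def setup_tf_saver(self, sess, inputs, outputs):
    """Remember the session and named input/output tensors for later simple_save calls."""
    self.tf_saver_elements = dict(session=sess, inputs=inputs, outputs=outputs)
    self.tf_saver_info = {'inputs': {k: v.name for k, v in inputs.items()},
                          'outputs': {k: v.name for k, v in outputs.items()}}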
def update(self):
    inputs = {k: v for k, v in zip(self.all_phs, self.buf.get())}

    # Training
    for i in range(self.train_pi_iters):
        _, kl = self.sess.run([self.train_pi, self.approx_kl], feed_dict=inputs)
        kl = mpi_avg(kl)
        if kl > 1.5 * self.target_kl:
            print('process %d: Early stopping at step %d due to reaching max kl.'
                  % (proc_id(), i))
            break

    for _ in range(self.train_v_iters):
        self.sess.run(self.train_v, feed_dict=inputs)
def _pytorch_simple_save(self, itr=None):
    """
    Saves the PyTorch model (or models).
    """
    if proc_id() == 0:
        assert hasattr(self, 'pytorch_saver_elements'), \
            "First have to setup saving with self.setup_pytorch_saver"
        fpath = 'pyt_save'
        fpath = osp.join(self.output_dir, fpath)
        fname = 'model' + ('%d' % itr if itr is not None else '') + '.pt'
        fname = osp.join(fpath, fname)
        os.makedirs(fpath, exist_ok=True)
        state_dicts = {}
        for key, value in self.pytorch_saver_elements.items():
            if hasattr(value, 'state_dict'):
                state_dicts[key] = value.state_dict()
            else:
                state_dicts[key] = value
        state_dicts['itr'] = itr
        torch.save(state_dicts, fname)
def _pytorch_simple_save(self, itr=None):
    """
    Saves the PyTorch model (or models).
    """
    if proc_id() == 0:
        folder = pathlib.Path(self.output_dir) / 'pyt_save'
        folder.mkdir(exist_ok=True, parents=True)
        for name, obj in self.pytorch_saver_elements.items():
            filename = name + suffix_for_pyt_save(itr)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                # We are using a non-recommended way of saving PyTorch models,
                # by pickling whole objects (which are dependent on the exact
                # directory structure at the time of saving) as opposed to
                # just saving network weights. This works sufficiently well
                # for the purposes of Spinning Up, but you may want to do
                # something different for your personal PyTorch project.
                # We use a catch_warnings() context to avoid the warnings about
                # not being able to save the source code.
                torch.save(obj, folder / filename)
def pytorch_simple_save(prefix, output_dir, itr, saver_elements=None):
    """
    Saves the PyTorch model (or models).
    """
    if proc_id() == 0:
        fpath = 'pyt_save'
        fpath = osp.join(output_dir, fpath)
        fname = prefix + '_model' + ('%d' % itr if itr is not None else '') + '.pt'
        fname = osp.join(fpath, fname)
        os.makedirs(fpath, exist_ok=True)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # We are using a non-recommended way of saving PyTorch models,
            # by pickling whole objects (which are dependent on the exact
            # directory structure at the time of saving) as opposed to
            # just saving network weights. This works sufficiently well
            # for the purposes of Spinning Up, but you may want to do
            # something different for your personal PyTorch project.
            # We use a catch_warnings() context to avoid the warnings about
            # not being able to save the source code.
            torch.save(saver_elements, fname)
def _pytorch_simple_save(self, itr=None):
    """
    Saves the PyTorch model (or models).
    """
    if proc_id() == 0:
        assert hasattr(self, 'pytorch_saver_elements'), \
            "First have to setup saving with self.setup_pytorch_saver"
        dir_path = osp.join(self.output_dir, 'pyt_save')
        os.makedirs(dir_path, exist_ok=True)
        file_name_no_ext = f"model{'' if itr is None else itr}"
        file_name = f"{file_name_no_ext}.pt"
        file_path = osp.join(dir_path, file_name)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # We are using a non-recommended way of saving PyTorch models,
            # by pickling whole objects (which are dependent on the exact
            # directory structure at the time of saving) as opposed to
            # just saving network weights. This works sufficiently well
            # for the purposes of Spinning Up, but you may want to do
            # something different for your personal PyTorch project.
            # We use a catch_warnings() context to avoid the warnings about
            # not being able to save the source code.
            torch.save(self.pytorch_saver_elements, file_path)
        self.neptune_run[f"model/{file_name_no_ext}"].upload(file_path)
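# Loading sketch (not part of the snippets above): the whole-object savers write
# a pickled actor-critic to e.g. `<output_dir>/pyt_save/model10.pt`, so a plain
# `torch.load` is enough to restore it, provided the module defining the
# actor-critic class is importable. `load_saved_ac` is a hypothetical helper.

import os.path as osp
import torch

def load_saved_ac(output_dir, itr=None):
    """Load an actor-critic pickled by one of the whole-object savers above."""
    fname = 'model' + ('%d' % itr if itr is not None else '') + '.pt'
    ac = torch.load(osp.join(output_dir, 'pyt_save', fname))
    ac.eval()  # switch to inference mode for acting in the env
    return ac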
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2,
        pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80,
        lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(),
        save_freq=10):
    """
    Proximal Policy Optimization (by clipping),
    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with a
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v``
            module. The ``step`` method should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing
                                           | the log probability, according to
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical:
                                           | make sure to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
            you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`.

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """
    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp']

        # Policy loss
        pi, logp = ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)    # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)    # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
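# Launch sketch (not part of the original snippet): running ppo across several MPI
# processes. `mpi_fork`, `gym`, and the `'CartPole-v1'` environment id are
# assumptions used only for illustration; they are not defined in this collection.

if __name__ == '__main__':
    import gym
    from spinup.utils.mpi_tools import mpi_fork

    mpi_fork(4)  # re-launch this script under mpirun with 4 processes
    ppo(lambda: gym.make('CartPole-v1'),
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(hidden_sizes=(64, 64)),
        steps_per_epoch=4000, epochs=50, seed=0,
        logger_kwargs=dict(exp_name='ppo_cartpole'))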
def sppo(args, env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=200, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) ########### if args.alpha == 'auto': target_entropy = 0.35 log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=tf.log(0.2)) alpha = tf.exp(log_alpha) else: alpha = args.alpha ########### # Main outputs from computation graph mu, pi, logp, logp_pi, v, q, h = actor_critic(alpha, x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi, h] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) ###### if args.alpha == 'auto': alpha_loss = tf.reduce_mean( -log_alpha * tf.stop_gradient(-h + target_entropy) ) # tf.clip_by_value(-h + target_entropy, 0.0, 1000.0 ) alpha_optimizer = MpiAdamOptimizer(learning_rate=1e-5) train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) ###### # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) # For PPO # min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) # ### Scheme1: SPPO NO.2: add entropy # adv_logp = adv_ph - tf.stop_gradient(alpha) * tf.stop_gradient(logp) # min_adv = tf.where(adv_logp>0, (1+clip_ratio)*adv_logp, (1-clip_ratio)*adv_logp) # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_logp, min_adv)) # ### Scheme3: SPPO NO.3: add entropy # adv_logp = adv_ph - tf.stop_gradient(alpha) * logp_old_ph # min_adv = tf.where(adv_logp>0, (1+clip_ratio)*adv_logp, (1-clip_ratio)*adv_logp) # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_logp, min_adv)) ### Scheme2: SPPO NO.2: add entropy min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean( tf.minimum(ratio * adv_ph, min_adv) + tf.stop_gradient(alpha) * h) v_loss = tf.reduce_mean((ret_ph - v)**2) #+(ret_ph - q)**2)/2.0 # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( h) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer( learning_rate=args.pi_lr).minimize(pi_loss + 0.1 * v_loss) # train_v = MpiAdamOptimizer(learning_rate=args.vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = 
sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): if args.alpha == 'auto': sess.run(train_alpha_op, feed_dict=inputs) _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break logger.store(StopIter=i) # for _ in range(train_v_iters): # sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old), Alpha=sess.run(alpha) if args.alpha == 'auto' else alpha) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t, h_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # q_t = sess.run(q, feed_dict={x_ph: o.reshape(1,-1), a_ph: a}) # SPPO NO.1: add entropy # rh = r - args.alpha * logp_t if args.alpha == 'auto': rh = r + sess.run(alpha) * h_t else: rh = r + alpha * h_t # exact entropy # save and log buf.store(o, a, rh, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 # d = False if ep_len == max_ep_len else d terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # # Save model # if (epoch % save_freq == 0) or (epoch == epochs-1): # logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Alpha', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
def vpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Vanilla Policy Gradient (with GAE-Lambda for advantage estimation) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. 
setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape # obs_dim = env.observation_space.n act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing VPG policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp'] # Policy loss pi, logp = ac.pi(obs, act) loss_pi = -(logp * adv).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() pi_info = dict(kl=approx_kl, ent=ent) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() # Get loss and info values before update pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with a single step of gradient descent pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) bayes_kl_loss = 0. if isinstance(ac.v, BayesMLPCritic): bayes_kl_loss = ac.v.compute_kl() total_loss_v = loss_v + bayes_kl_loss / data['obs'].shape[0] total_loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent = pi_info['kl'], pi_info_old['ent'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old), BayesKL=bayes_kl_loss) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 epoch_reward = [] # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) 
o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t==local_steps_per_epoch-1 if terminal or epoch_ended: if epoch_ended and not(terminal): print('Warning: trajectory cut off by epoch at %d steps.'%ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished epoch_reward.append(ep_ret) logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Perform VPG update! update() if epoch % 10 == 0: # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('BayesKL', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular() return epoch_reward
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 maxRev = float("-inf") #negative infinity in the beginning #maxRevActionSeq=[] maxRevTSTT = 0 maxRevRevenue = 0 maxRevThroughput = 0 maxRevJAH = 0 maxRevRemVeh = 0 maxRevJAH2 = 0 maxRevRMSE_MLvio = 0 maxRevPerTimeVio = 0 maxRevHOTDensity = pd.DataFrame() maxRevGPDensity = pd.DataFrame() maxtdJAHMax = 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) #we need to scale the sampled values of action from (-1,1) to our choices of toll coz they were sampled from tanh activation mu numpyFromA = np.array(a[0]) numpyFromA = ((numpyFromA + 1.0) * (env.state.tollMax - env.state.tollMin) / 2.0) + env.state.tollMin a[0] = np.ndarray.tolist(numpyFromA) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) #get other stats and store them too otherStats = env.getAllOtherStats() if np.any(np.isnan(np.array(otherStats))): sys.exit("Nan found in statistics! Error") logger.store(EpTSTT=otherStats[0], EpRevenue=otherStats[1], EpThroughput=otherStats[2], EpJAH=otherStats[3], EpRemVeh=otherStats[4], EpJAH2=otherStats[5], EpMLViolRMSE=otherStats[6], EpPerTimeVio=otherStats[7], EptdJAHMax=otherStats[8]) #determine max rev profile if ep_ret > maxRev: maxRev = ep_ret maxRevActionSeq = env.state.tollProfile maxRevTSTT = otherStats[0] maxRevRevenue = otherStats[1] maxRevThroughput = otherStats[2] maxRevJAH = otherStats[3] maxRevRemVeh = otherStats[4] maxRevJAH2 = otherStats[5] maxRevRMSE_MLvio = otherStats[6] maxRevPerTimeVio = otherStats[7] maxRevHOTDensity = env.getHOTDensityData() maxRevGPDensity = env.getGPDensityData() maxtdJAHMax = otherStats[8] o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! 
update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpTSTT', average_only=True) logger.log_tabular('EpRevenue', average_only=True) logger.log_tabular('EpThroughput', average_only=True) logger.log_tabular('EpJAH', average_only=True) logger.log_tabular('EpRemVeh', average_only=True) logger.log_tabular('EpJAH2', average_only=True) logger.log_tabular('EpMLViolRMSE', average_only=True) logger.log_tabular('EpPerTimeVio', average_only=True) logger.log_tabular('EptdJAHMax', average_only=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() print("Max cumulative reward obtained= %f " % maxRev) print( "Corresponding revenue($)= %f, TSTT(hrs)= %f, Throughput(veh)=%f, JAHstat= %f, remaining vehicles= %f, JAHstat2=%f, RMSEML_vio=%f, percentTimeViolated(%%)=%f, tdJAHMax= %f" % (maxRevRevenue, maxRevTSTT, maxRevThroughput, maxRevJAH, maxRevRemVeh, maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio, maxtdJAHMax)) outputVector = [ maxRev, maxRevRevenue, maxRevTSTT, maxRevThroughput, maxRevJAH, maxRevRemVeh, maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio, maxtdJAHMax ] #print("\n===Max rev action sequence is\n",maxRevActionSeq) exportTollProfile(maxRevActionSeq, logger_kwargs, outputVector) exportDensityData(maxRevHOTDensity, maxRevGPDensity, logger_kwargs)
def vpg(env_fn, actor_critic=core.ActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A reference to ActorCritic class which after instantiation takes an input ``x``, and action, ``a``, and returns: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a`` | in states ``x``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x``. (Critical: make sure to | flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # https://pytorch.org/docs/master/notes/randomness.html#cudnn torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Actor Critic model instance actor_critic = actor_critic(obs_dim, **ac_kwargs) actor_critic.to(device) # load to cpu/gpu # Count variables var_counts = tuple(core.count_vars(model) for model in [actor_critic.policy, actor_critic.value]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # Optimizers train_pi = optim.Adam(actor_critic.policy.parameters(), lr=pi_lr) train_v = optim.Adam(actor_critic.value.parameters(), lr=vf_lr) # Sync params across processes # sync_all_params() # TODO figure out the way to do use MPI for pytorch def update(): actor_critic.train() obs, act, adv, ret, logp_old = map(lambda x: Tensor(x).to(device), buf.get()) _ , logp, _, val = actor_critic(obs, act) ent = (-logp).mean() # VPG objectives pi_loss = -(logp * adv).mean() v_l_old = ((ret - val)**2).mean() # Policy gradient step train_pi.zero_grad() pi_loss.backward() train_pi.step() # Value function learning for _ in range(train_v_iters): val = actor_critic.value(obs) v_loss = (ret - val).pow(2).mean() train_v.zero_grad() v_loss.backward() train_v.step() actor_critic.eval() # Log changes from update _, logp, _, val = actor_critic(obs, act) pi_l_new = -(logp * adv).mean() v_l_new = ((ret - val)**2).mean() kl = (logp_old - logp).mean() logger.store(LossPi=pi_loss, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(pi_l_new - pi_loss), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, logp_t, logp_pi_t, v_t = actor_critic(Tensor(o.reshape(1,-1)).to(device)) # save and log buf.store(o, a.cpu().numpy(), r, v_t.item(), logp_pi_t.cpu().detach().numpy()) logger.store(VVals=v_t) o, r, d, _ = env.step(a.cpu().numpy()) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t==local_steps_per_epoch-1): if not(terminal): print('Warning: trajectory cut off by epoch at %d steps.'%ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else actor_critic(Tensor(o.reshape(1,-1)).to(device))[-1].item() buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, actor_critic, None) # Perform VPG update! 
update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
def vpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-2, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # VPG objectives pi_loss = -tf.reduce_mean(logp * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Policy gradient step sess.run(train_pi, feed_dict=inputs) # Value function learning for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform VPG update! 
update() # Log info about epoch #logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', average_only=True) #logger.log_tabular('EpLen', average_only=True) #logger.log_tabular('VVals', with_min_and_max=True) #logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) #logger.log_tabular('LossPi', average_only=True) #logger.log_tabular('LossV', average_only=True) #logger.log_tabular('DeltaLossPi', average_only=True) #logger.log_tabular('DeltaLossV', average_only=True) #logger.log_tabular('Entropy', average_only=True) #logger.log_tabular('KL', average_only=True) #logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
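# The TensorFlow version above keeps workers in step through MpiAdamOptimizer and
# sync_all_params(). The PyTorch variant earlier leaves parameter syncing as a TODO;
# one possible approach (a sketch under the assumption that mpi4py is available, not
# the project's actual MPI utilities) is to average gradients across processes
# between loss.backward() and optimizer.step():
from mpi4py import MPI
import numpy as np
import torch

def mpi_avg_grads(module):
    """Average the gradients of `module` across all MPI processes, in place."""
    comm = MPI.COMM_WORLD
    nprocs = comm.Get_size()
    if nprocs == 1:
        return
    for p in module.parameters():
        if p.grad is None:
            continue
        g = p.grad.cpu().numpy()            # local gradient
        buf = np.zeros_like(g)
        comm.Allreduce(g, buf, op=MPI.SUM)  # sum gradients over processes
        p.grad.copy_(torch.as_tensor(buf / nprocs, dtype=p.grad.dtype))

# Usage sketch (inside update(), for the policy step):
#   pi_loss.backward()
#   mpi_avg_grads(actor_critic.policy)
#   train_pi.step()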
def run(self):
    # Prepare for interaction with environment
    total_steps = self.steps_per_epoch * self.epochs
    start_time = time.time()
    o, ep_ret, ep_len = self.env.reset(), 0, 0
    eps = 1
    t = self.epoch * self.steps_per_epoch if self.last_save_path is not None else 0

    # Main loop: collect experience in env and update/log each epoch
    self.actor.eval()
    while t < total_steps:
        text = "Code: %s, Epoch: %s, Episode: %s, Ep_ret: %s, ep_len: %s. [%s/%s]" % \
            (self.env.current_env.code, self.epoch, eps, ep_ret, ep_len, t + 1, total_steps)
        self.logger.log_stdout(text)

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t >= self.start_steps:
            with torch.no_grad():
                if self.state_of_art_model and o.ndim == 2:
                    obs = torch.FloatTensor(o).view([1, 1, *self.obs_dim]).to(self.device)
                else:
                    obs = torch.FloatTensor(o).view([1, *self.obs_dim]).to(self.device)
                a, _, _ = self.get_action(obs)
                a = a.cpu().item()
        else:
            a = np.random.randint(0, self.act_dim)

        # Step the env
        o2, r, d, _ = self.env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == self.max_ep_len else d  # force False once the episode hits the time limit, otherwise keep d

        # Store experience to replay buffer
        if d == 2 or d == 1:  # done values that indicate a controlled reset
            done = 1
        else:
            done = 0
        self.replay_buffer.store(o, a, r, o2, done)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d == 1 or (ep_len == self.max_ep_len):  # ep_len == max_ep_len is the shortest episode length for a successful run
            o, ep_ret, ep_len = self.env.reset(isRandomStart=False), 0, 0
            eps += 1
        elif d == 2:  # reached the end of the data index
            self.env.reset(isRandomStart=False,
                           total=self.env.current_env.total)  # move on to the next contract, but carry over the previous total assets
        elif d == 3:  # hit the drawdown limit (ignored for now)
            pass

        # Update handling
        if self.replay_buffer.size > self.update_after and t % self.update_times_every_step == 0:
            self.actor.train()
            for j in range(self.update_times_every_step):
                batch = self.replay_buffer.sample_batch(self.batch_size)
                self.update(data=batch)
            self.actor.eval()
            # logger.save_epoch_Ret_optimizer_model(save_dict)
            # last_best_Return_per_local = Return_per_local

        # End of epoch handling
        if (t + 1) % self.steps_per_epoch == 0 and self.replay_buffer.size > self.update_after:
            if (t + 1) % self.update_times_every_step == 0:  # only at multiples of update_times_every_step
                self.epoch = (t + 1) // self.steps_per_epoch

                # Save model
                if proc_id() == 0 and self.epoch % self.save_freq == 0:
                    save_dict = {
                        'epoch': self.epoch,
                        'actor': self.actor.state_dict(),
                        'critic1': self.critic1.state_dict(),
                        'critic2': self.critic2.state_dict(),
                        'pi_optimizer': self.pi_optimizer.state_dict(),
                        'q1_optimizer': self.q1_optimizer.state_dict(),
                        'q2_optimizer': self.q2_optimizer.state_dict(),
                        'critic1_targ': self.critic1_targ.state_dict(),
                        'critic2_targ': self.critic2_targ.state_dict(),
                    }
                    self.logger.save_epoch_Ret_optimizer_model(save_dict)
                self.actor.eval()

                # Test the performance of the deterministic version of the agent.
self.test_agent() # Log info about epoch self.logger.log_tabular('Epoch', self.epoch) # self.logger.log_tabular('EpRet', with_min_and_max=True) self.logger.log_tabular('TestEpRet', with_min_and_max=False) # self.logger.log_tabular('EpLen', average_only=True) self.logger.log_tabular('TestEpLen', average_only=True) self.logger.log_tabular('TotalEnvInteracts', t) self.logger.log_tabular('Q1Vals', with_min_and_max=True) self.logger.log_tabular('Q2Vals', with_min_and_max=True) self.logger.log_tabular('LogPi', with_min_and_max=True) self.logger.log_tabular('LossPi', average_only=True) self.logger.log_tabular('LossQ', average_only=True) self.logger.log_tabular('Time', time.time() - start_time) # if epoch > 1: # (time.time() - start_time)/epo self.logger.dump_tabular() t += 1
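# run() above assumes a replay buffer that exposes store(obs, act, rew, next_obs, done)
# and sample_batch(batch_size). A minimal NumPy ring-buffer sketch with that interface
# (illustrative only; the actual buffer may store richer data and different dtypes):
import numpy as np

class SimpleReplayBuffer:
    def __init__(self, obs_dim, size):
        self.obs = np.zeros((size, obs_dim), dtype=np.float32)
        self.obs2 = np.zeros((size, obs_dim), dtype=np.float32)
        self.act = np.zeros(size, dtype=np.int64)
        self.rew = np.zeros(size, dtype=np.float32)
        self.done = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        self.obs[self.ptr] = obs
        self.obs2[self.ptr] = next_obs
        self.act[self.ptr] = act
        self.rew[self.ptr] = rew
        self.done[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size      # overwrite the oldest entries
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        return dict(obs=self.obs[idxs], obs2=self.obs2[idxs],
                    act=self.act[idxs], rew=self.rew[idxs], done=self.done[idxs])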
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dims = env.action_space #[ choice.shape for choice in env.action_space.values() ] #act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholder(None), core.placeholder( None), {} for k in env.action_space: logp_old_ph[k] = core.placeholder(None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dims, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio, min_adv, pi_loss = {}, {}, {} for k in env.action_space: ratio[k] = tf.exp(logp[k] - logp_old_ph[k]) # pi(a|s) / pi_old(a|s) min_adv[k] = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss[k] = -tf.reduce_mean(tf.minimum(ratio[k] * adv_ph, min_adv[k])) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl, approx_ent, clipped, clipfrac = {}, {}, {}, {} for k in env.action_space: approx_kl[k] = tf.reduce_mean( logp_old_ph[k] - logp[k]) # a sample estimate for KL-divergence, easy to compute approx_ent[k] = tf.reduce_mean( -logp[k]) # a sample estimate for entropy, also easy to compute clipped[k] = tf.logical_or(ratio[k] > (1 + clip_ratio), ratio[k] < (1 - clip_ratio)) clipfrac[k] = tf.reduce_mean(tf.cast(clipped[k], tf.float32)) pi_loss_sum = tf.reduce_sum(list(pi_loss.values())) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss_sum) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving save_outputs = {'v': v} for k in env.action_space: save_outputs['pi_' + k] = pi[k] logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs=save_outputs) def update(): inputs = {} for k, v in zip(all_phs, buf.get()): if type(k) is not dict: inputs[k] = v else: for k_, v_ in zip(k.values(), v.values()): inputs[k_] = v_ pi_l_old, v_l_old, ent = sess.run([pi_loss_sum, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) for k in kl: kl[k] = mpi_avg(kl[k]) if max(list(kl.values())) > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss_sum, v_loss, approx_kl, clipfrac], feed_dict=inputs) sum_dict = lambda x: x if type(x) is not dict else np.sum( list(x.values())) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=sum_dict(kl), Entropy=sum_dict(ent), ClipFrac=sum_dict(cf), DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) o2, r, d, _ = env.step(**a) env.render() #force_realtime=True) ep_ret += r #print ("frame_return: %.4f sofar_EpRet: %.4f" % (r, ep_ret)) ep_len += 1 # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) # Update obs (critical!) o = o2 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = 0 if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) print("EpRet:", ep_ret) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
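# The PPO objective above builds the clipped surrogate as min(ratio * adv, min_adv),
# where min_adv = (1 +/- clip_ratio) * adv depending on the sign of the advantage.
# A small NumPy sketch showing that this is the same quantity as the more common
# min(ratio * adv, clip(ratio, 1 - eps, 1 + eps) * adv) formulation:
import numpy as np

def ppo_clip_objective(ratio, adv, clip_ratio=0.2):
    min_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
    return np.minimum(ratio * adv, min_adv).mean()

def ppo_clip_objective_alt(ratio, adv, clip_ratio=0.2):
    clipped_ratio = np.clip(ratio, 1 - clip_ratio, 1 + clip_ratio)
    return np.minimum(ratio * adv, clipped_ratio * adv).mean()

ratio = np.exp(0.3 * np.random.randn(1000))
adv = np.random.randn(1000)
assert np.isclose(ppo_clip_objective(ratio, adv), ppo_clip_objective_alt(ratio, adv))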
def trpo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, delta=0.01, vf_lr=1e-3, train_v_iters=80, damping_coeff=0.1, cg_iters=10, backtrack_iters=10, backtrack_coeff=0.8, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10, algo='trpo'): """ Trust Region Policy Optimization (with support for Natural Policy Gradient) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: ============ ================ ======================================== Symbol Shape Description ============ ================ ======================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``info`` N/A | A dict of any intermediate quantities | (from calculating the policy or log | probabilities) which are needed for | analytically computing KL divergence. | (eg sufficient statistics of the | distributions) ``info_phs`` N/A | A dict of placeholders for old values | of the entries in ``info``. ``d_kl`` () | A symbol for computing the mean KL | divergence between the current policy | (``pi``) and the old policy (as | specified by the inputs to | ``info_phs``) over the batch of | states given in ``x_ph``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) ============ ================ ======================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TRPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) delta (float): KL-divergence limit for TRPO / NPG update. (Should be small for stability. Values like 0.01, 0.05.) vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. damping_coeff (float): Artifact for numerical stability, should be smallish. Adjusts Hessian-vector product calculation: .. math:: Hv \\rightarrow (\\alpha I + H)v where :math:`\\alpha` is the damping coefficient. Probably don't play with this hyperparameter. cg_iters (int): Number of iterations of conjugate gradient to perform. Increasing this will lead to a more accurate approximation to :math:`H^{-1} g`, and possibly slightly-improved performance, but at the cost of slowing things down. Also probably don't play with this hyperparameter. backtrack_iters (int): Maximum number of steps allowed in the backtracking line search. Since the line search usually doesn't backtrack, and usually only steps back once when it does, this hyperparameter doesn't often matter. backtrack_coeff (float): How far back to step during backtracking line search. (Always between 0 and 1, usually above 0.5.) lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) 
max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. algo: Either 'trpo' or 'npg': this code supports both, since they are almost the same. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph, plus placeholders for old pdist (for KL) pi, logp, logp_pi, info, info_phs, d_kl, v = actor_critic( x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph ] + core.values_as_sorted_list(info_phs) # Every step, get: action, value, logprob, & info for pdist (for computing kl div) get_action_ops = [pi, v, logp_pi] + core.values_as_sorted_list(info) # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) info_shapes = {k: v.shape.as_list()[1:] for k, v in info_phs.items()} buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # TRPO losses ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) pi_loss = -tf.reduce_mean(ratio * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Optimizer for value function train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) # Symbols needed for CG solver pi_params = core.get_vars('pi') gradient = core.flat_grad(pi_loss, pi_params) v_ph, hvp = core.hessian_vector_product(d_kl, pi_params) if damping_coeff > 0: hvp += damping_coeff * v_ph # Symbols for getting and setting params get_pi_params = core.flat_concat(pi_params) set_pi_params = core.assign_params_from_flat(v_ph, pi_params) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def cg(Ax, b): """ Conjugate gradient algorithm (see https://en.wikipedia.org/wiki/Conjugate_gradient_method) """ x = np.zeros_like(b) r = b.copy( ) # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start. 
p = r.copy() r_dot_old = np.dot(r, r) for _ in range(cg_iters): z = Ax(p) alpha = r_dot_old / (np.dot(p, z) + EPS) x += alpha * p r -= alpha * z r_dot_new = np.dot(r, r) p = r + (r_dot_new / r_dot_old) * p r_dot_old = r_dot_new return x def update(): # Prepare hessian func, gradient eval inputs = {k: v for k, v in zip(all_phs, buf.get())} Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x})) g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss], feed_dict=inputs) g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old) # Core calculations for TRPO or NPG x = cg(Hx, g) alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS)) old_params = sess.run(get_pi_params) def set_and_eval(step): sess.run(set_pi_params, feed_dict={v_ph: old_params - alpha * x * step}) return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs)) if algo == 'npg': # npg has no backtracking or hard kl constraint enforcement kl, pi_l_new = set_and_eval(step=1.) elif algo == 'trpo': # trpo augments npg with backtracking line search, hard kl for j in range(backtrack_iters): kl, pi_l_new = set_and_eval(step=backtrack_coeff**j) if kl <= delta and pi_l_new <= pi_l_old: logger.log( 'Accepting new params at step %d of line search.' % j) logger.store(BacktrackIters=j) break if j == backtrack_iters - 1: logger.log('Line search failed! Keeping old params.') logger.store(BacktrackIters=j) kl, pi_l_new = set_and_eval(step=0.) # Value function updates for _ in range(train_v_iters): sess.run(train_vf, feed_dict=inputs) v_l_new = sess.run(v_loss, feed_dict=inputs) # Log changes from update logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): agent_outs = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[ 1], agent_outs[2], agent_outs[3:] o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v_t, logp_t, info_t) logger.store(VVals=v_t) # Update obs (critical!) o = o2 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = 0 if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform TRPO or NPG update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('KL', average_only=True) if algo == 'trpo': logger.log_tabular('BacktrackIters', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
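# The cg() helper inside trpo() solves Hx = g when H is only available through
# Hessian-vector products (here, hvp on the KL divergence). A small NumPy check of
# the same routine on an explicit symmetric positive-definite matrix, just to show
# what the solver is doing (EPS plays the same role as above):
import numpy as np

EPS = 1e-8

def conjugate_gradient(Ax, b, iters=25):
    x = np.zeros_like(b)
    r = b.copy()                      # residual b - Ax(x), with x = 0
    p = r.copy()
    r_dot_old = np.dot(r, r)
    for _ in range(iters):
        z = Ax(p)
        alpha = r_dot_old / (np.dot(p, z) + EPS)
        x += alpha * p
        r -= alpha * z
        r_dot_new = np.dot(r, r)
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return x

A = np.random.randn(5, 5)
H = A @ A.T + 5 * np.eye(5)           # symmetric positive definite, like the damped Fisher
g = np.random.randn(5)
x = conjugate_gradient(lambda v: H @ v, g)
assert np.allclose(H @ x, g, atol=1e-4)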
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, explorer=None, eps=.03, pretrain_epochs=0): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_epochs = epochs + pretrain_epochs # Main loop: collect experience in env and update/log each epoch for epoch in range(total_epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # explore if you are in a pretrain epoch or if eps-greedy pre = epoch < pretrain_epochs during = random.random() < eps if pre or during: if explorer is None: raise ValueError('Trying to explore but explorer is None') state = env.env.state_vector() a = explorer.sample_action(state) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
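# This ppo() variant only requires that `explorer` expose sample_action(state) and be
# provided whenever pretrain_epochs > 0 or eps > 0. A hypothetical minimal explorer
# satisfying that interface (the real explorer presumably implements a smarter
# exploration strategy; the class name and behavior here are assumptions):
import numpy as np

class UniformExplorer:
    """Samples actions uniformly from a Box action space, ignoring the state."""
    def __init__(self, action_space):
        self.low = action_space.low
        self.high = action_space.high

    def sample_action(self, state):
        # Keep a leading batch dimension so env.step(a[0]) in the loop above still works.
        return np.random.uniform(self.low, self.high, size=(1,) + self.low.shape)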
def sigail(env_fn, traj_dir, actor_critic=core.mlp_actor_critic_add, ac_kwargs=dict(), d_hidden_size=64, seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=40, train_v_iters=40, lam=0.97, max_ep_len=4000, beta=1e-4, target_kl=0.01, logger_kwargs=dict(), save_freq=100, r_env_ratio=0, d_itr=20, reward_type='negative', trj_num=20, buf_size=1000, si_update_ratio=0.02, js_smooth=5, buf_update_type='random', pretrain_bc_itr=0): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape D = Discriminator(env, hidden_size=d_hidden_size, reward_type=reward_type) #!add Discriminator object D_js_m = JS_div_machine(env, hidden_size=d_hidden_size) e_obs = np.zeros((buf_size, obs_dim[0])) e_act = np.zeros((buf_size, act_dim[0])) Sibuffer = SIBuffer(obs_dim, act_dim, e_obs, e_act, trj_num=trj_num, max_size=buf_size, js_smooth_num=js_smooth) #!sibuf trj_full = False assert e_obs.shape[1:] == obs_dim # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, pi_std, entropy, v = actor_critic( x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) #buf_gail = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)#add buffer with TRgail rewards # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum( ratio * adv_ph, min_adv)) - beta * entropy #add entropy v_loss = tf.reduce_mean((ret_ph - v)**2) #ret_phには累積報酬のバッファが入る # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() BC = BehavioralCloning(sess, pi, logp, x_ph, a_ph) sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Sync params across processes # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get()) } #all_phsは各バッファーに対応するプレースホルダー辞書 pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training#ここも変える必要あり? おそらく変えなくて良い for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: #更新時のklが想定の1.5倍大きいとログをだしてtrainループを着る logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): #vの更新 sess.run(train_v, feed_dict=inputs) # Log changes from update(新しいロスの計算) pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) std, std_ent = sess.run([pi_std, entropy], feed_dict=inputs) logger.store( LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=std_ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), #更新での改善量 DeltaLossV=(v_l_new - v_l_old), Std=std) start_time = time.time() o, r, d, ep_ret_task, ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0 if pretrain_bc_itr > 0: BC.learn(Sibuffer.expert_obs, Sibuffer.expert_act, max_itr=pretrain_bc_itr) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ''' if t <150: env.render() time.sleep(0.03) ''' ep_ret_task += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): ''' if not(terminal): print('Warning: trajectory cut off by epoch at %d steps.'%ep_len) ''' #!add discriminator train '''#終端も加えるならアリッチャあり o_reshape = o.reshape(core.combined_shape(1,obs_dim)) a_reshape = a.reshape(core.combined_shape(1,act_dim)) agent_obs = np.append(buf.obs_buf[buf.path_slice()],o_reshape,axis = 0)#!o を(obspace,)→(1,obspace)に変換してからアペンド agent_act = np.append(buf.act_buf[buf.path_slice()],a_reshape,axis = 0)#終端での状態行動対も加えてDを学習 ''' agent_obs = buf.obs_buf[buf.path_slice()] agent_act = buf.act_buf[buf.path_slice()] #D.train(sess,e_obs,e_act ,agent_obs,agent_act) #↓buf.r_gail_buf[slice(buf.path_start_idx+1, buf.ptr+2)] = D.get_reward_buf(sess,agent_obs, agent_act).ravel()#状態行動対の結果としての報酬をbufferに追加(報酬は一個ずれる) if trj_full: gail_r = 1 else: gail_r = 0 rew_gail = gail_r * D.get_reward( sess, agent_obs, agent_act).ravel() #状態行動対の結果としての報酬をbufferに追加(報酬は一個ずれる) ep_ret_gail += rew_gail.sum() #!before gail_ratio ep_ret_sum = r_env_ratio * ep_ret_task + ep_ret_gail rew_gail_head = rew_gail[:-1] last_val_gail = rew_gail[-1] buf.rew_buf[slice( buf.path_start_idx + 1, buf.ptr)] = rew_gail_head + r_env_ratio * buf.rew_buf[ slice(buf.path_start_idx + 1, buf.ptr)] #!add GAIL reward 最後の報酬は含まれないため長さが1短い if d: # if trajectory didn't reach terminal state, bootstrap value target last_val = r_env_ratio * r + last_val_gail else: last_val = sess.run(v, feed_dict={x_ph: o.reshape(1, -1) }) #v_last=...だったけどこれで良さげ buf.finish_path( last_val) #これの前にbuf.finish_add_r_vがなされていることを確認すべし if terminal: #only store trajectory to SIBUffer if trajectory finished if trj_full: Sibuffer.store( agent_obs, agent_act, sum_reward=ep_ret_task) #!store trajectory else: Sibuffer.store( agent_obs, agent_act, sum_reward=ep_ret_task) #!store trajectory logger.store(EpRet=ep_ret_task, EpRet_Sum=ep_ret_sum, EpRet_Gail=ep_ret_gail, EpLen=ep_len) o, r, d, ep_ret_task, ep_ret_sum, ep_ret_gail, ep_len = env.reset( ), 0, False, 0, 0, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, epoch) # Perform PPO update! 
if not (trj_full): M_obs_buf = Sibuffer.get_obs_trj() trj_full = (M_obs_buf.shape[0] >= buf_size) if trj_full: #replaybufferがr_thresholdよりも大きいとき Sibuffer.update_main_buf(ratio_update=si_update_ratio, update_type=buf_update_type) M_obs_buf = Sibuffer.get_obs_trj() M_act_buf = Sibuffer.get_act_trj() d_batch_size = len(agent_obs) for _t in range(d_itr): e_obs_batch, e_act_batch = Sibuffer.get_random_batch( d_batch_size) D.train(sess, e_obs_batch, e_act_batch, agent_obs, agent_act) D_js_m.train(sess, M_obs_buf, M_act_buf, e_obs, e_act) #バッファとエキスパートの距離を見るためにtrain js_d = D.get_js_div(sess, Sibuffer.main_obs_buf, Sibuffer.main_act_buf, agent_obs, agent_act) js_d_m = D_js_m.get_js_div(sess, M_obs_buf, M_act_buf, e_obs, e_act) else: js_d, js_d_m = 0.5, 0.5 update() Sibuffer.store_js(js_d) logger.store(JS=js_d, JS_M=js_d_m, JS_Ratio=Sibuffer.js_ratio_with_random) # Log info about epoch #if epoch%10 == 0:#logger print each 10 epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpRet_Sum', average_only=True) logger.log_tabular('EpRet_Gail', average_only=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.log_tabular('Std', average_only=True) logger.log_tabular('buffer_r', Sibuffer.buffer_r_average) logger.log_tabular('JS', average_only=True) logger.log_tabular('JS_M', average_only=True) logger.log_tabular('JS_Ratio', average_only=True) logger.dump_tabular()
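# sigail() treats D.get_reward(sess, obs, act) as a per-step reward from the
# discriminator. The Discriminator class itself is not shown here; a common
# GAIL-style construction (an assumption, not necessarily this project's) trains
# D(s, a) to classify expert vs. agent pairs and derives rewards from its output:
import numpy as np

def gail_reward(d_prob, reward_type='negative', eps=1e-8):
    """Per-step reward from discriminator probabilities D(s, a) in (0, 1).

    'negative' : log D, always <= 0 (tends to discourage long episodes)
    'positive' : -log(1 - D), always >= 0 (tends to encourage survival)
    """
    d_prob = np.clip(d_prob, eps, 1 - eps)
    if reward_type == 'negative':
        return np.log(d_prob)
    return -np.log(1.0 - d_prob)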
def a2c(env_fn, agent: Agent, seed=0, num_cpu=1, device=torch.device("cpu"), epochs=1000, steps_per_epoch=100, gamma=0.99, use_gae=True, tau=0.95, max_grad_norm=0.5, polyak=0.995, learning_rate=1e-3, value_loss_coef=0.5, policy_loss_coef=1, entropy_loss_coef=0.1, grid_layer_weight_reg_loss_coef=1e-4, save_every=100, log_every=10, logger_kwargs=dict(), test_every=100, num_test_episodes=5, deterministic=False, save_freq=1, solved_score=None, render=False, ): use_MPI = num_cpu > 1 if use_MPI: # Special function to avoid certain slowdowns from PyTorch + MPI combo. mpi_pytorch.setup_pytorch_for_mpi() else: torch.set_num_threads(torch.get_num_threads()) # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) config = locals() del config['env_fn'] del config['agent'] del config['logger'] logger.save_config(config) test_logger_kwargs = deepcopy(logger_kwargs) test_logger_kwargs['output_dir'] = pathlib.Path(test_logger_kwargs['output_dir']) / 'evaluation' test_logger = EpochLogger(**test_logger_kwargs) # Random seed if use_MPI: seed += 10000 * mpi_tools.proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() assert env.max_episode_steps > 0 obs_shape = env.observation_space.shape act_dim = env.action_space.n # training model and target model target_agent = deepcopy(agent) if use_MPI: # Sync params across processes mpi_pytorch.sync_params(agent) mpi_pytorch.sync_params(target_agent) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in target_agent.parameters(): p.requires_grad = False # Utilize GPU agent.to(device) target_agent.to(device) # Set up optimizers for policy and q-function optimizer = Adam(agent.parameters(), lr=learning_rate) # Set up model saving logger.setup_pytorch_saver(agent, name='model') def update(episode_buffer): # Update if episode_buffer.dones[-1]: next_value = 0.0 else: last_obs = episode_buffer.next_observations[-1] previous_reward = episode_buffer.rewards[-1] last_obs_tensor = torch.tensor(last_obs, dtype=torch.float32).unsqueeze(0) previous_reward_tensor = torch.tensor([previous_reward], dtype=torch.float32).unsqueeze(0) context = agent.get_context() next_value = target_agent.predict_value(obs_tensor=last_obs_tensor, previous_reward_tensor=previous_reward_tensor, goal_grid_code_tensor=goal_grid_code_tensor, context=context).cpu().item() # Super critical!! optimizer.zero_grad() # Compute value and policy losses loss, info = agent.compute_loss(rewards=np.array(episode_buffer.rewards), dones=np.array(episode_buffer.dones), next_value=next_value, discount_factor=gamma, use_gae=use_gae, tau=tau, value_loss_coef=value_loss_coef, policy_loss_coef=policy_loss_coef, entropy_reg_coef=entropy_loss_coef, grid_layer_wreg_loss_coef=grid_layer_weight_reg_loss_coef) loss.backward() if use_MPI: mpi_pytorch.mpi_avg_grads(agent) # Optimize if max_grad_norm is not None: torch.nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm) optimizer.step() # Log losses and info logger.store(**info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(agent.parameters(), target_agent.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) if use_MPI: mpi_pytorch.sync_params(target_agent) # Prepare for interaction with environment start_time = time.time() # Main loop: collect experience in env and update/log each epoch total_steps = 0 # Reset env obs = env.reset() reward = 0 goal_grid_code_tensor = None # Reset episode stats episode_return = 0 episode_length = 0 for epoch in range(1, epochs + 1): agent.reset_for_training() epoch_history = EpisodeHistory() for t in range(steps_per_epoch): total_steps += 1 # Get action from the model obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0) previous_reward_tensor = torch.tensor([reward], dtype=torch.float32).unsqueeze(0) action = agent.step(obs_tensor, previous_reward_tensor, goal_grid_code_tensor).squeeze(0) # Step the env obs2, reward, done, _ = env.step(action.detach().cpu().item()) if render and mpi_tools.proc_id() == 0: env.render('human', view='top') time.sleep(1e-3) episode_return += reward episode_length += 1 # Store transition to history epoch_history.store(observation=None, action=None, reward=reward, done=done, next_observation=obs2) # Super critical, easy to overlook step: make sure to update # most recent observation! obs = obs2 # End of trajectory handling if done: if reward > 0: goal_grid_code_tensor = agent.current_grid_code.detach() break update(epoch_history) # if done if epoch_history.dones[-1]: logger.store(EpRet=episode_return, EpLen=episode_length) # Reset env obs = env.reset() agent.reset() # Reset episode stats episode_return = 0 episode_length = 0 # End of epoch handling if epoch % log_every == 0: total_interactions = mpi_tools.mpi_sum(total_steps) if use_MPI else total_steps # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('Value', average_only=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossEntropy', average_only=True) logger.log_tabular('LossGridL2', average_only=True) logger.log_tabular('LossPIM', average_only=True) logger.log_tabular('TotalEnvInteracts', total_interactions) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # Test agent solved = False if epoch % test_every == 0: video_dir = pathlib.Path(logger.output_dir) / 'test_videos' / f'epoch-{epoch:d}' test_env_fn = lambda: Monitor(env_fn(), directory=video_dir) # Test the performance of the deterministic version of the agent. context = agent.get_context() agent.eval() episode_info = evaluate_agent(env_fn=test_env_fn, agent=agent, deterministic=deterministic, num_episodes=num_test_episodes, render=False, logger=test_logger) agent.train() agent.set_context(context) if solved_score is not None: solved = all(r >= solved_score for (t, r) in episode_info) # Save model if (epoch % save_every == 0) or (epoch == epochs) or solved: logger.save_state({'env': env}) # Check environment is solved if solved: plog = lambda msg: logger.log(msg, color='green') plog("=" * 40) plog(f"ENVIRONMENT SOLVED!") plog("=" * 40) plog(f' TotalEnvInteracts {total_steps}') plog(f' Time {time.time() - start_time}') plog(f' Epoch {epoch}') break torch.save(agent, str(logger.output_dir / 'agent.pt')) env.close()
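# a2c() above delegates loss computation to agent.compute_loss(...). A compact sketch
# of the standard A2C loss such a method is assumed to combine (value, policy, and
# entropy terms; the coefficient names match the function arguments, while the tensor
# names below are illustrative, not the agent's actual internals):
import torch

def a2c_loss(log_probs, values, returns, entropies,
             value_loss_coef=0.5, policy_loss_coef=1.0, entropy_reg_coef=0.1):
    """log_probs, values, entropies: 1-D tensors from the rollout; returns: targets."""
    advantages = returns - values
    policy_loss = -(log_probs * advantages.detach()).mean()   # policy gradient term
    value_loss = advantages.pow(2).mean()                     # value regression term
    entropy_loss = -entropies.mean()                          # negative entropy (regularizer)
    loss = (policy_loss_coef * policy_loss
            + value_loss_coef * value_loss
            + entropy_reg_coef * entropy_loss)
    return loss, dict(LossPi=policy_loss.item(), LossV=value_loss.item(),
                      LossEntropy=entropy_loss.item())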