def save_state(self, state_dict, itr=None):
    """
    Saves the state of an experiment.

    To be clear: this is about saving *state*, not logging diagnostics.
    All diagnostic logging is separate from this function. This function
    will save whatever is in ``state_dict``---usually just a copy of the
    environment---and the most recent parameters for any model you
    previously set up saving for (``tf_saver_elements`` and/or
    ``pytorch_saver_elements`` present on the logger).

    Call with any frequency you prefer. If you only want to maintain a
    single state and overwrite it at each call with the most recent
    version, leave ``itr=None``. If you want to keep all of the states you
    save, provide unique (increasing) values for 'itr'.

    Only the process for which ``proc_id()`` returns 0 writes anything.

    Args:
        state_dict (dict): Dictionary containing essential elements to
            describe the current state of training.

        itr: An int, or None. Current iteration of training.
    """
    if proc_id() == 0:
        fname = 'vars.pkl' if itr is None else 'vars%d.pkl' % itr
        try:
            joblib.dump(state_dict, osp.join(self.output_dir, fname))
        except Exception:
            # Best effort: an unpicklable state must not abort training.
            # ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt / SystemExit propagate.
            self.log('Warning: could not pickle state_dict.', color='red')
        if hasattr(self, 'tf_saver_elements'):
            self._tf_simple_save(itr)
        # Mirror the TensorFlow hook for loggers that registered a PyTorch
        # model; without this the registered model is never written.
        if hasattr(self, 'pytorch_saver_elements'):
            self._pytorch_simple_save(itr)
def dump_tabular(self):
    """
    Emit the diagnostics collected for the current iteration.

    On the process with ``proc_id() == 0`` the values are printed as a
    two-column table on stdout and, when an output file is open, appended
    to it as one tab-separated row (preceded by the header row on the first
    call). On every process the current row is then cleared and
    ``first_row`` is set to ``False``.
    """
    if proc_id() == 0:
        header_widths = [len(name) for name in self.log_headers]
        name_width = max(15, max(header_widths))
        row_fmt = "| " + ('%' + '%d' % name_width) + "s | %15s |"
        rule = "-" * (22 + name_width)

        row_values = []
        print(rule)
        for name in self.log_headers:
            value = self.log_current_row.get(name, "")
            # Numeric-like values get compact formatting; anything else is
            # shown as-is.
            if hasattr(value, "__float__"):
                shown = "%8.3g" % value
            else:
                shown = value
            print(row_fmt % (name, shown))
            row_values.append(value)
        print(rule)

        sink = self.output_file
        if sink is not None:
            if self.first_row:
                sink.write("\t".join(self.log_headers) + "\n")
            sink.write("\t".join(map(str, row_values)) + "\n")
            sink.flush()

    self.log_current_row.clear()
    self.first_row = False
def __init__(self, output_dir=None, output_fname='progress.txt', exp_name=None):
    """
    Initialize a Logger.

    Args:
        output_dir (string): Directory results are written to. When falsy,
            a directory of the form ``/tmp/experiments/<unix-timestamp>``
            is used instead.

        output_fname (string): Name of the tab-separated-value file that
            receives the metrics logged during a training run. Defaults to
            ``progress.txt``.

        exp_name (string): Experiment name. If you run multiple training
            runs and give them all the same ``exp_name``, the plotter
            will know to group them. (Use case: if you run the same
            hyperparameter configuration with multiple random seeds, you
            should give them all the same ``exp_name``.)
    """
    if proc_id() != 0:
        # Only process 0 owns an output directory and file.
        self.output_dir = None
        self.output_file = None
    else:
        log_dir = output_dir or "/tmp/experiments/%i" % int(time.time())
        self.output_dir = log_dir
        if not osp.exists(log_dir):
            os.makedirs(log_dir)
        else:
            print("Warning: Log dir %s already exists! Storing info there anyway." % log_dir)
        progress_path = osp.join(log_dir, output_fname)
        self.output_file = open(progress_path, 'w')
        # The file stays open for the lifetime of the run; close it at exit.
        atexit.register(self.output_file.close)
        print(colorize("Logging data to %s" % self.output_file.name, 'green', bold=True))

    self.first_row = True
    self.log_headers = []
    self.log_current_row = {}
    self.exp_name = exp_name
def save_config(self, config):
    """
    Log an experiment configuration.

    Call this once at the top of your experiment, passing in all important
    config vars as a dict. The config is passed through ``convert_json``,
    tagged with ``exp_name`` when the logger has one, and---on the process
    with ``proc_id() == 0``---printed and written to ``config.json`` in the
    output directory.

    Example use:

    .. code-block:: python

        logger = EpochLogger(**logger_kwargs)
        logger.save_config(locals())
    """
    config_json = convert_json(config)
    if self.exp_name is not None:
        config_json['exp_name'] = self.exp_name

    if proc_id() != 0:
        return

    serialized = json.dumps(
        config_json,
        separators=(',', ':\t'),
        indent=4,
        sort_keys=True,
    )
    print(colorize('Saving config:\n', color='cyan', bold=True))
    print(serialized)
    config_path = osp.join(self.output_dir, "config.json")
    with open(config_path, 'w') as out:
        out.write(serialized)
def __init__(self, actor: nns.MLPGaussianActor, critic: nns.MLPCritic, lr_a, lr_c, train_a_itrs, train_c_itrs):
    """
    Store the actor/critic networks, build one Adam optimizer for each, and
    prepare a timestamped log directory.

    Args:
        actor: Policy network; its parameters are optimized with ``lr_a``.
        critic: Value network; its parameters are optimized with ``lr_c``.
        lr_a: Learning rate for the actor optimizer.
        lr_c: Learning rate for the critic optimizer.
        train_a_itrs: Stored as ``self.train_a_itrs``.
        train_c_itrs: Stored as ``self.train_c_itrs``.
    """
    self.actor = actor
    self.critic = critic

    self.optimizer_a = torch.optim.Adam(self.actor.parameters(), lr=lr_a)
    self.optimizer_c = torch.optim.Adam(self.critic.parameters(), lr=lr_c)

    self.train_a_itrs = train_a_itrs
    self.train_c_itrs = train_c_itrs

    self.proc_id = mpi_tools.proc_id()

    now = time.localtime(time.time())
    stamp = time.strftime('%Y-%m-%d_%H-%M-%S', now)
    self.log_dir = './log/PPO/' + stamp

    # Only process 0 gets a SummaryWriter.
    if self.proc_id == 0:
        self.writer = SummaryWriter(log_dir=self.log_dir)
def _tf_simple_save(self, itr=None):
    """
    Export the registered TensorFlow model with ``simple_save`` and store
    ``self.tf_saver_info`` next to it as ``model_info.pkl``, which makes it
    easy to associate tensors with variables after restore.

    The export goes to ``simple_save`` (or ``simple_save<itr>`` when ``itr``
    is given) inside the output directory. Does nothing on processes other
    than the one with ``proc_id() == 0``.
    """
    if proc_id() != 0:
        return

    assert hasattr(self, 'tf_saver_elements'), \
        "First have to setup saving with self.setup_tf_saver"

    suffix = '' if itr is None else '%d' % itr
    export_dir = osp.join(self.output_dir, 'simple_save' + suffix)

    # simple_save refuses to write into an existing directory, so remove
    # any previous export first.
    if osp.exists(export_dir):
        shutil.rmtree(export_dir)

    tf.saved_model.simple_save(export_dir=export_dir, **self.tf_saver_elements)
    joblib.dump(self.tf_saver_info, osp.join(export_dir, 'model_info.pkl'))
def _pytorch_simple_save(self, itr=None):
    """
    Saves the PyTorch model (or models).

    ``self.pytorch_saver_elements`` is written with ``torch.save`` to
    ``pyt_save/model.pt`` (or ``pyt_save/model<itr>.pt`` when ``itr`` is
    given) inside the output directory. Does nothing on processes other
    than the one with ``proc_id() == 0``.
    """
    if proc_id() != 0:
        return

    assert hasattr(self, 'pytorch_saver_elements'), \
        "First have to setup saving with self.setup_pytorch_saver"

    save_dir = osp.join(self.output_dir, 'pyt_save')
    suffix = '' if itr is None else '%d' % itr
    model_path = osp.join(save_dir, 'model' + suffix + '.pt')
    os.makedirs(save_dir, exist_ok=True)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # Whole objects are pickled here rather than just the network
        # weights. That is a non-recommended way of saving PyTorch models
        # (the pickle depends on the exact directory structure at save
        # time), but it works sufficiently well for this code base. The
        # catch_warnings() context hides the warnings about not being able
        # to save the source code.
        torch.save(self.pytorch_saver_elements, model_path)
def log(self, msg, color='green'):
    """Print ``msg`` to stdout in bold ``color``; only on process 0."""
    if proc_id() != 0:
        return
    rendered = colorize(msg, color, bold=True)
    print(rendered)
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2,
        pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80,
        lam=0.97, max_ep_len=None, target_kl=0.01, logger_kwargs=dict(),
        save_freq=10, TensorBoard=True, save_nn=True, save_every=1000,
        load_latest=False, load_custom=False, LoadPath=None, RTA_type=None):
    """
    Proximal Policy Optimization (by clipping),

    with early stopping based on approximate KL

    This function also reads the module-level names ``env_name``, ``PATH``,
    ``current_time`` and ``args`` (``args.cpu``); they must be defined
    before it is called.

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with a
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v``
            module. The ``step`` method should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing
                                           | the log probability, according to
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical:
                                           | make sure to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
            you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\\epsilon`.

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        TensorBoard (bool): True plots to TensorBoard, False does not

        save_nn (bool): True saves neural network data, False does not

        save_every (int): How often to save neural network

        load_latest (bool): Load last saved neural network data before training

        load_custom (bool): Load custom neural network data file before training

        LoadPath (str): Path for custom neural network data file

        RTA_type (str): RTA framework, either 'CBF', 'SVL', 'ASIF', or 'SBSF'.
            ``None`` (the default) or ``'off'`` disables RTA.

    Raises:
        ValueError: If ``RTA_type`` is not one of the values listed above.
        RuntimeError: If an epoch finishes without a single completed
            episode on this process (increase ``steps_per_epoch``).
    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    # (must stay ahead of any new local so that locals() holds only the
    # arguments and the logger)
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # The default RTA_type is None, which used to fall through to an
    # undefined ``RTA``; treat it the same as the explicit 'off' sentinel.
    rta_enabled = RTA_type not in (None, 'off')

    # Instantiate environment
    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Random seed for each cpu
    seed += 1 * proc_id()
    env.seed(seed)

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)

    # Load model if True
    if load_latest:
        models = glob.glob(f"{PATH}/models/PPO/*")
        LoadPath = max(models, key=os.path.getctime)
        ac.load_state_dict(torch.load(LoadPath))
    elif load_custom:
        ac.load_state_dict(torch.load(LoadPath))

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp']

        # Policy loss
        pi, logp = ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)  # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Import RTA
    if RTA_type == 'CBF':
        from CBF_for_speed_limit import RTA
    elif RTA_type == 'SVL':
        from Simple_velocity_limit import RTA
    elif RTA_type == 'ASIF':
        from IASIF import RTA
    elif RTA_type == 'SBSF':
        from ISimplex import RTA
    elif rta_enabled:
        # Fail with a clear message instead of a NameError on ``RTA`` below.
        raise ValueError(
            "Unknown RTA_type %r; expected 'CBF', 'SVL', 'ASIF', 'SBSF', "
            "'off' or None" % (RTA_type,))

    # Call RTA, define action conversion
    if rta_enabled:
        env.RTA_reward = RTA_type
        rta = RTA(env)

        def RTA_act(obs, act):
            # Clip the desired action to the actuator limits, hand it to the
            # RTA, and flag on the env whether the RTA changed it.
            act = np.clip(act, -env.force_magnitude, env.force_magnitude)
            x0 = [obs[0], obs[1], 0, obs[2], obs[3], 0]
            u_des = np.array([[act[0]], [act[1]], [0]])
            u = rta.main(x0, u_des)
            new_act = [u[0, 0], u[1, 0]]
            if np.sqrt((act[0] - new_act[0])**2 + (act[1] - new_act[1])**2) < 0.0001:
                env.RTA_on = False
            else:
                env.RTA_on = True
            return new_act

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0
    total_episodes = 0
    RTA_percent = 0

    # Create TensorBoard file if True
    if TensorBoard and proc_id() == 0:
        if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0':
            Name = f"{PATH}/runs/Spacecraft-docking-" + current_time
        elif env_name == 'dubins-aircraft-v0' or env_name == 'dubins-aircraft-continuous-v0':
            Name = f"{PATH}/runs/Dubins-aircraft-" + current_time
        writer = SummaryWriter(Name)

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        batch_ret = []  # Track episode returns
        batch_len = []  # Track episode lengths
        batch_RTA_percent = []  # Track percentage of time RTA is on
        env.success = 0  # Track episode success rate
        env.failure = 0  # Track episode failure rate
        env.crash = 0  # Track episode crash rate
        env.overtime = 0  # Track episode over max time/control rate
        episodes = 0  # Track episodes
        delta_v = []  # Track episode total delta v
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))
            if rta_enabled:  # If RTA is on, get RTA action
                RTA_a = RTA_act(o, a)
                if env.RTA_on:
                    RTA_percent += 1
                next_o, r, d, _ = env.step(RTA_a)
            else:  # If RTA is off, pass through desired action
                next_o, r, d, _ = env.step(a)
                # NOTE(review): nesting of the docking-specific bookkeeping
                # (here and in the terminal/averaging/TensorBoard sections
                # below) was reconstructed from whitespace-collapsed
                # source -- confirm against the original layout.
                if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0':
                    over_max_vel, _, _ = env.check_velocity(a[0], a[1])
                    if over_max_vel:
                        RTA_percent += 1

            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                    batch_ret.append(ep_ret)
                    batch_len.append(ep_len)
                    episodes += 1
                    if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0':
                        delta_v.append(env.control_input / env.mass_deputy)
                        batch_RTA_percent.append(RTA_percent / ep_len * 100)
                        RTA_percent = 0
                o, ep_ret, ep_len = env.reset(), 0, 0

        total_episodes += episodes
        # Track success, failure, crash, overtime rates
        if episodes != 0:
            success_rate = env.success / episodes
            failure_rate = env.failure / episodes
            crash_rate = env.crash / episodes
            overtime_rate = env.overtime / episodes
        else:
            # Raising a bare string is itself a TypeError; raise a real
            # exception so the intended message reaches the user.
            raise RuntimeError(
                "No completed episodes logging will break [increase steps per epoch]"
            )

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()

        # Average data over all cpus
        avg_batch_ret = mpi_avg(np.mean(batch_ret))
        avg_batch_len = mpi_avg(np.mean(batch_len))
        avg_success_rate = mpi_avg(success_rate)
        avg_failure_rate = mpi_avg(failure_rate)
        avg_crash_rate = mpi_avg(crash_rate)
        avg_overtime_rate = mpi_avg(overtime_rate)
        if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0':
            avg_delta_v = mpi_avg(np.mean(delta_v))
            avg_RTA_percent = mpi_avg(np.mean(batch_RTA_percent))

        if proc_id() == 0:  # Only on one cpu
            # Plot to TensorBoard if True, only on one cpu
            if TensorBoard:
                writer.add_scalar('Return', avg_batch_ret, epoch)
                writer.add_scalar('Episode-Length', avg_batch_len * env.tau, epoch)
                writer.add_scalar('Success-Rate', avg_success_rate * 100, epoch)
                writer.add_scalar('Failure-Rate', avg_failure_rate * 100, epoch)
                writer.add_scalar('Crash-Rate', avg_crash_rate * 100, epoch)
                writer.add_scalar('Overtime-Rate', avg_overtime_rate * 100, epoch)
                if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0':
                    writer.add_scalar('Delta-V', avg_delta_v, epoch)
                    writer.add_scalar('RTA-on-percent', avg_RTA_percent, epoch)

            # Save neural network if true, can change to desired location
            if save_nn and epoch % save_every == 0 and epoch != 0:
                os.makedirs(f"{PATH}/models/PPO", exist_ok=True)
                if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0':
                    Name2 = f"{PATH}/models/PPO/Spacecraft-docking-" + current_time + f"-epoch{epoch}.dat"
                elif env_name == 'dubins-aircraft-v0' or env_name == 'dubins-aircraft-continuous-v0':
                    Name2 = f"{PATH}/models/PPO/Dubins-aircraft-" + current_time + f"-epoch{epoch}.dat"
                torch.save(ac.state_dict(), Name2)

    # Average episodes per hour, episode per epoch
    ep_hr = mpi_avg(total_episodes) * args.cpu / (time.time() - start_time) * 3600
    ep_Ep = mpi_avg(total_episodes) * args.cpu / (epoch + 1)

    # Plot on one cpu
    if proc_id() == 0:
        # Save neural network
        if save_nn:
            os.makedirs(f"{PATH}/models/PPO", exist_ok=True)
            if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0':
                Name2 = f"{PATH}/models/PPO/Spacecraft-docking-" + current_time + "-final.dat"
            elif env_name == 'dubins-aircraft-v0' or env_name == 'dubins-aircraft-continuous-v0':
                Name2 = f"{PATH}/models/PPO/Dubins-aircraft-" + current_time + "-final.dat"
            torch.save(ac.state_dict(), Name2)

        # Print statistics on episodes
        print(
            f"Episodes per hour: {ep_hr:.0f}, Episodes per epoch: {ep_Ep:.0f}, Epochs per hour: {(epoch+1)/(time.time()-start_time)*3600:.0f}"
        )
# Path of the custom checkpoint (``<this file's directory>/saved_models/<name>.dat``);
# it is handed to ppo() as LoadPath.
LoadPath = f"{os.path.join(os.path.dirname(__file__))}/saved_models/{args.custom_model}.dat"
# ppo() refers to ``env_name`` without taking it as a parameter, so it has to
# be bound here before the call.
env_name = args.env
# NOTE(review): args.NoTB / args.NoSave are passed straight through as the
# TensorBoard / save_nn enable flags -- presumably they are store_false
# options; confirm against the argparse setup.
ppo(lambda: gym.make(args.env),
    actor_critic=core.MLPActorCritic,
    ac_kwargs=dict(hidden_sizes=[args.hid] * args.l),
    gamma=args.gamma,
    seed=args.seed,
    steps_per_epoch=args.steps,
    epochs=args.epochs,
    logger_kwargs=logger_kwargs,
    TensorBoard=args.NoTB,
    save_nn=args.NoSave,
    save_every=args.SaveEvery,
    load_latest=args.LoadLatest,
    load_custom=args.LoadCustom,
    LoadPath=LoadPath,
    RTA_type=args.RTA)
# Show experiment duration
if proc_id() == 0:
    print(f"Run Time: {time.time()-starttime:0.4} seconds")
# The two bare string literals below are usage notes kept as no-op statements.
''' ** To start TensorBoard, run the following command in your terminal with your specific path to aerospacerl:** tensorboard --logdir aerospacerl/RL/runs '''
''' Example of how to run PPO in terminal from home directory, using SVL RTA for 10 epochs: python aerospacerl/RL/PPO.py --RTA SVL --epochs 10 '''
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        trials_per_epoch=2500, steps_per_trial=100, epochs=50, gamma=0.99,
        clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=1000,
        train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01,
        logger_kwargs=dict(), save_freq=10):
    """
    PPO (clipped objective with an entropy bonus) for a recurrent
    actor-critic, trained on trials of tasks sampled from the environment.

    Besides its arguments this function reads the module-level constant
    ``NUM_GRU_UNITS``.

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API. This function
            additionally calls ``env.sample_tasks``, ``env.reset_task_simple``
            and ``env.action_space.n`` on it.

        actor_critic: A function called as
            ``actor_critic(x_ph, a_ph, rew_ph, pi_state_ph, v_state_ph,
            NUM_GRU_UNITS, action_space=env.action_space)`` which returns
            the main outputs from the agent's Tensorflow computation graph,
            in this order:

            ================  ================  =================================
            Symbol            Shape             Description
            ================  ================  =================================
            ``pi``            (batch, act_dim)  | Samples actions from policy given
                                                | states.
            ``logp``          (batch,)          | Gives log probability, according to
                                                | the policy, of taking actions ``a_ph``
                                                | in states ``x_ph``.
            ``logp_pi``       (batch,)          | Gives log probability, according to
                                                | the policy, of the action sampled by
                                                | ``pi``.
            ``v``             (batch,)          | Gives the value estimate for states
                                                | in ``x_ph``. (Critical: make sure
                                                | to flatten this!)
            ``new_pi_state``  --                | Updated recurrent state, fed back
                                                | through ``pi_state_ph``.
            ``new_v_state``   --                | Updated recurrent state, fed back
                                                | through ``v_state_ph``.
            ================  ================  =================================

            (Shapes of the first four rows are carried over from the
            non-recurrent version of this docstring -- TODO confirm for the
            recurrent graph.)

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO. NOTE(review): this function only
            adds ``action_space`` to a copy of it; the dict is not forwarded
            to ``actor_critic``.

        seed (int): Seed for random number generators.

        trials_per_epoch (int): Number of trials (sampled tasks) run in
            each epoch.

        steps_per_trial (int): Number of environment steps taken in each
            trial. ``trials_per_epoch * steps_per_trial`` is the number of
            steps of interaction per epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    Raises:
        AssertionError: If ``env.step`` fails for the sampled action (the
            original error is attached as the cause).
    """

    # Must stay ahead of any new local so that locals() holds only the
    # arguments and the logger.
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture.
    # Work on a copy: the default ``ac_kwargs`` is one dict shared by every
    # call, and a caller-supplied dict should not be modified either.
    ac_kwargs = dict(ac_kwargs)
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph = tf.placeholder(dtype=tf.float32, shape=(None, None, 1), name='x_ph')
    a_ph = tf.placeholder(dtype=tf.int32, shape=(None, None), name='a_ph')
    adv_ph = tf.placeholder(dtype=tf.float32, shape=(None, None), name='adv_ph')
    ret_ph = tf.placeholder(dtype=tf.float32, shape=(None, None), name='ret_ph')
    logp_old_ph = tf.placeholder(dtype=tf.float32, shape=(None, None), name='logp_old_ph')
    rew_ph = tf.placeholder(dtype=tf.float32, shape=(None, None, 1), name='rew_ph')
    pi_state_ph = tf.placeholder(dtype=tf.float32, shape=(None, NUM_GRU_UNITS), name='pi_state_ph')
    v_state_ph = tf.placeholder(dtype=tf.float32, shape=(None, NUM_GRU_UNITS), name='v_state_ph')

    # Main outputs from computation graph
    pi, logp, logp_pi, v, new_pi_state, new_v_state = actor_critic(
        x_ph, a_ph, rew_ph, pi_state_ph, v_state_ph, NUM_GRU_UNITS,
        action_space=env.action_space)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, rew_ph]

    # Every step, get: action, value, logprob and the new recurrent states
    get_action_ops = [pi, v, logp_pi, new_pi_state, new_v_state]

    # Experience buffer
    steps_per_epoch = trials_per_epoch * steps_per_trial
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    # a sample estimate for KL-divergence, easy to compute
    approx_kl = tf.reduce_mean(logp_old_ph - logp)
    # a sample estimate for entropy, also easy to compute
    approx_ent = tf.reduce_mean(-logp)
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers (the policy objective carries a 0.01 * approx_ent term)
    train_pi = MpiAdamOptimizer(
        learning_rate=pi_lr).minimize(pi_loss - 0.01 * approx_ent)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        import datetime

        inputs = dict(zip(all_phs, buf.get()))
        # Recurrent states start from zeros, one row per trial in the epoch.
        inputs[pi_state_ph] = np.zeros((trials_per_epoch, NUM_GRU_UNITS))
        inputs[v_state_ph] = np.zeros((trials_per_epoch, NUM_GRU_UNITS))
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)
        print(pi_l_old, v_l_old)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        print(f'finish one batch training at {datetime.datetime.now()}')
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for trial in range(trials_per_epoch):
            print(f'trial: {trial}')
            # Previous action / reward fed to the network; zeros at the
            # start of every trial.
            old_a = np.array([0]).reshape(1, 1)
            old_r = np.array([0]).reshape((1, 1, 1))
            means = env.sample_tasks(1)[0]
            # Per-trial action histogram, pre-filled so every action shows
            # up in the printout.
            action_dict = defaultdict(int)
            for i in range(env.action_space.n):
                action_dict[i] = 0
            env.reset_task_simple(means)
            task_avg = 0.0
            pi_state_t = np.zeros((1, NUM_GRU_UNITS))
            v_state_t = np.zeros((1, NUM_GRU_UNITS))
            for step in range(steps_per_trial):
                a, v_t, logp_t, pi_state_t, v_state_t = sess.run(
                    get_action_ops,
                    feed_dict={
                        x_ph: o.reshape(1, 1, -1),
                        a_ph: old_a,
                        rew_ph: old_r,
                        pi_state_ph: pi_state_t,
                        v_state_ph: v_state_t
                    })
                # save and log
                buf.store(o, a, r, v_t, logp_t)
                logger.store(VVals=v_t)
                try:
                    o, r, d, _ = env.step(a[0][0])
                except Exception as err:
                    # Show the offending action, keep the AssertionError
                    # type, and attach the real failure as the cause.
                    # KeyboardInterrupt / SystemExit are no longer
                    # converted.
                    print(a)
                    raise AssertionError from err
                action_dict[a[0][0]] += 1
                old_a = np.array(a).reshape(1, 1)
                old_r = np.array([r]).reshape(1, 1, 1)
                ep_ret += r
                task_avg += r
                ep_len += 1

                terminal = d or (ep_len == max_ep_len)
                # NOTE(review): ``step`` only ranges over steps_per_trial, so
                # the comparison with local_steps_per_epoch - 1 is rarely
                # (if ever) true; the bootstrap ``sess.run(v, ...)`` below
                # also feeds neither rew_ph nor the state placeholders.
                # Left unchanged -- confirm the intended path-finishing rule.
                if terminal or (step == local_steps_per_epoch - 1):
                    if not (terminal):
                        print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                    # if trajectory didn't reach terminal state, bootstrap value target
                    last_val = r if d else sess.run(
                        v, feed_dict={x_ph: o.reshape(1, -1)})
                    buf.finish_path(last_val)
                    if terminal:
                        # only save EpRet / EpLen if trajectory finished
                        logger.store(EpRet=ep_ret, EpLen=ep_len)
                    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            print(f'avg in trial {trial}: {task_avg / steps_per_trial}')
            print(f'Means in trial {trial}: {means}')
            print(action_dict)

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
def ppo(env_fn,
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=2000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    r"""
    Proximal Policy Optimization (by clipping),

    with early stopping based on approximate KL

    This variant additionally embeds each observation with a reachability
    network, compares it against an episodic memory to compute a curiosity
    bonus, and (when the module-level ``RENDER`` flag is set) draws the
    stored memory locations on the environment map. Pressing ``s`` / ``b``
    in the OpenCV window toggles the module-level ``RENDER`` / ``BONUS``
    flags. The observation and action spaces are hard-coded to
    ``Box(0, 1, (3, 64, 64))`` and ``Discrete(3)``. No PPO update is run
    for epochs 0 through 4; the buffer is simply drained.

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with a
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v``
            module. The ``step`` method should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing
                                           | the log probability, according to
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical:
                                           | make sure to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
            you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`.

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """
    global RENDER, BONUS

    # Reachability Trainer
    r_network = R_Network().to(device)
    trainer = R_Network_Trainer(r_network=r_network, exp_name="random1")
    episodic_memory = EpisodicMemory(embedding_shape=[EMBEDDING_DIM])

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    # Spaces are fixed here rather than read from the environment.
    observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(3, 64, 64))
    action_space = gym.spaces.Discrete(3)
    obs_dim = observation_space.shape
    act_dim = action_space.shape

    # Create actor-critic module
    ac = actor_critic(observation_space, action_space, **ac_kwargs)

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[
            'logp']

        # Policy loss
        pi, logp = ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        """Run one PPO policy/value update on the data drained from ``buf``."""
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            # Entropy bonus
            # NOTE(review): pi_info['ent'] is a Python float (``.item()``),
            # so this term shifts the loss value but contributes no gradient;
            # it also adds rather than subtracts entropy. Confirm whether a
            # differentiable entropy bonus was intended.
            loss_pi += pi_info['ent'] * 0.0021
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)  # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with environment
    start_time = time.time()
    o, _ = env.reset()
    env.render()
    # Scale pixels to [0, 1] and move channels first (HWC -> CHW)
    o = o.astype(np.float32) / 255.
    o = o.transpose(2, 0, 1)
    ep_ret, ep_len = 0, 0
    # Map coordinates of the states held in episodic memory, by memory index
    indices = []

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            state = torch.as_tensor(o[np.newaxis, ...], dtype=torch.float32)
            a, v, logp = ac.step(state)

            next_o, r, d, info = env.step(a)
            next_o = next_o.astype(np.float32) / 255.

            # The environment's own done flag is replaced by a length check.
            d = ep_len == max_ep_len
            trainer.store_new_state([next_o], [r], [d], [None])

            r_network.eval()
            with torch.no_grad():
                # Reuse the batch tensor built above instead of rebuilding it
                # from a list of ndarrays on every step.
                state_embedding = r_network.embed_observation(
                    state.to(device)).cpu().numpy()[0]
                aggregated, _, _ = similarity_to_memory(
                    state_embedding, episodic_memory, r_network)
            curiosity_bonus = 0.03 * (0.5 - aggregated)
            if BONUS:
                print(f'{curiosity_bonus:.3f}')
            if curiosity_bonus > 0 or len(episodic_memory) == 0:
                idx = episodic_memory.store_new_state(state_embedding)
                x = int(env.map_scale * info['pose']['x'])
                y = int(env.map_scale * info['pose']['y'])
                if idx == len(indices):
                    indices.append((x, y))
                else:
                    indices[idx] = (x, y)
            r_network.train()

            next_o = next_o.transpose(2, 0, 1)
            # The bonus is counted in the episode return only; the buffer
            # below stores the raw environment reward.
            ep_ret += r + curiosity_bonus
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Keyboard toggles: 's' flips RENDER, 'b' flips BONUS
            k = cv2.waitKey(1)
            if k == ord('s'):
                RENDER = 1 - RENDER
            elif k == ord('b'):
                BONUS = 1 - BONUS
            if RENDER:
                env.info['map'] = cv2.flip(env.info['map'], 0)
                for index in indices:
                    cv2.circle(env.info['map'], index, 3, (0, 0, 255), -1)
                env.info['map'] = cv2.flip(env.info['map'], 0)
                env.render()

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    state = torch.as_tensor(o[np.newaxis, ...],
                                            dtype=torch.float32)
                    _, v, _ = ac.step(state)
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                    print(ep_ret, ep_len, len(episodic_memory))
                ep_ret, ep_len = 0, 0
                o, _ = env.reset()
                o = o.astype(np.float32) / 255.
                o = o.transpose(2, 0, 1)
                episodic_memory.reset()
                indices = []

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        if epoch > 4:
            update()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('TotalEnvInteracts',
                               (epoch + 1) * steps_per_epoch)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('DeltaLossPi', average_only=True)
            logger.log_tabular('DeltaLossV', average_only=True)
            logger.log_tabular('Entropy', average_only=True)
            logger.log_tabular('KL', average_only=True)
            logger.log_tabular('ClipFrac', average_only=True)
            logger.log_tabular('StopIter', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
        else:
            # No update for the first epochs; drain the buffer so it can be
            # refilled next epoch.
            buf.get()
parser.add_argument('--save_freq', type=int, default=500) parser.add_argument('--eval', default=False, action='store_true') parser.add_argument('--eval_path', type=str, default='') parser.add_argument('--play_speed', type=int, default=1) parser.add_argument('--num_procs', type=int, default=1) args = parser.parse_args() if args.eval: env = envs.env_by_name(args.env) # env.eval(path='./log/PPO/2020-10-28_20-49-51/CartPole-v1-Epoch99') env.eval(path=args.eval_path, play_speed=args.play_speed) sys.exit() mpi_tools.mpi_fork(args.num_procs) proc_id = mpi_tools.proc_id() mpi_tools.setup_pytorch_for_mpi() # Random seed seed = 0 seed += 10000 * proc_id torch.manual_seed(seed) np.random.seed(seed) env = envs.env_by_name(args.env) sample_size = int(args.sample_size / mpi_tools.num_procs()) print('sample_size ', sample_size, flush=True) buffer = PPOBuffer(obs_dim=env.obs_dim, act_dim=env.act_dim,