def dump_tabular(self):
    u"""
    Write all of the diagnostics from the current iteration.

    Writes both to stdout, and to the output file.
    """
    if proc_id() == 0:
        vals = []
        key_lens = [len(key) for key in self.log_headers]
        max_key_len = max(15, max(key_lens))
        keystr = u'%' + u'%d' % max_key_len
        fmt = u"| " + keystr + u"s | %15s |"
        n_slashes = 22 + max_key_len
        print u"-" * n_slashes
        for key in self.log_headers:
            val = self.log_current_row.get(key, u"")
            valstr = u"%8.3g" % val if hasattr(val, u"__float__") else val
            print fmt % (key, valstr)
            vals.append(val)
        print u"-" * n_slashes
        if self.output_file is not None:
            if self.first_row:
                self.output_file.write(u"\t".join(self.log_headers) + u"\n")
            self.output_file.write(u"\t".join(imap(unicode, vals)) + u"\n")
            self.output_file.flush()
    self.log_current_row.clear()
    self.first_row = False
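
# A minimal sketch of the logging cycle (hedged: the key names below are
# illustrative, and assume the usual set-then-dump pattern of this logger):
#
#     logger.log_tabular(u'Epoch', epoch)
#     logger.log_tabular(u'EpRet', with_min_and_max=True)
#     logger.dump_tabular()  # prints the table and appends one row to progress.txt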
def save_state(self, state_dict, itr=None):
    u"""
    Saves the state of an experiment.

    To be clear: this is about saving *state*, not logging diagnostics.
    All diagnostic logging is separate from this function. This function
    will save whatever is in ``state_dict``---usually just a copy of the
    environment---and the most recent parameters for the model you
    previously set up saving for with ``setup_tf_saver``.

    Call with any frequency you prefer. If you only want to maintain a
    single state and overwrite it at each call with the most recent
    version, leave ``itr=None``. If you want to keep all of the states
    you save, provide unique (increasing) values for ``itr``.

    Args:
        state_dict (dict): Dictionary containing essential elements to
            describe the current state of training.

        itr: An int, or None. Current iteration of training.
    """
    if proc_id() == 0:
        fname = u'vars.pkl' if itr is None else u'vars%d.pkl' % itr
        try:
            joblib.dump(state_dict, osp.join(self.output_dir, fname))
        except:
            self.log(u'Warning: could not pickle state_dict.', color=u'red')
        if hasattr(self, u'tf_saver_elements'):
            self._tf_simple_save(itr)
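
# A minimal usage sketch (hedged: what goes in ``state_dict`` is up to the
# experiment; ``env`` and ``epoch`` are illustrative names):
#
#     logger.save_state({u'env': env})             # overwrites vars.pkl each call
#     logger.save_state({u'env': env}, itr=epoch)  # keeps vars0.pkl, vars1.pkl, ...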
def save_config(self, config):
    u"""
    Log an experiment configuration.

    Call this once at the top of your experiment, passing in all important
    config vars as a dict. This will serialize the config to JSON, while
    handling anything which can't be serialized in a graceful way (writing
    as informative a string as possible).

    Example use:

    .. code-block:: python

        logger = EpochLogger(**logger_kwargs)
        logger.save_config(locals())
    """
    config_json = convert_json(config)
    if self.exp_name is not None:
        config_json[u'exp_name'] = self.exp_name
    if proc_id() == 0:
        output = json.dumps(config_json,
                            separators=(u',', u':\t'),
                            indent=4,
                            sort_keys=True)
        print colorize(u'Saving config:\n', color=u'cyan', bold=True)
        print output
        with open(osp.join(self.output_dir, u"config.json"), u'w') as out:
            out.write(output)
def _tf_simple_save(self, itr=None):
    u"""
    Uses simple_save to save a trained model, plus info to make it easy
    to associate tensors to variables after restore.
    """
    if proc_id() == 0:
        assert hasattr(self, u'tf_saver_elements'), \
            u"First have to setup saving with self.setup_tf_saver"
        fpath = u'simple_save' + (u'%d' % itr if itr is not None else u'')
        fpath = osp.join(self.output_dir, fpath)
        if osp.exists(fpath):
            # simple_save refuses to be useful if fpath already exists,
            # so just delete fpath if it's there.
            shutil.rmtree(fpath)
        tf.saved_model.simple_save(export_dir=fpath, **self.tf_saver_elements)
        joblib.dump(self.tf_saver_info, osp.join(fpath, u'model_info.pkl'))
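
# A hedged sketch of restoring such an export later (the contents of
# ``model_info.pkl`` depend on what ``setup_tf_saver`` recorded):
#
#     sess = tf.Session(graph=tf.Graph())
#     with sess.graph.as_default():
#         tf.saved_model.loader.load(sess,
#                                    [tf.saved_model.tag_constants.SERVING],
#                                    fpath)
#     info = joblib.load(osp.join(fpath, u'model_info.pkl'))
#     # look up the saved input/output tensors by the names recorded in ``info``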
def __init__(self, output_dir=None, output_fname=u'progress.txt',
             exp_name=None):
    u"""
    Initialize a Logger.

    Args:
        output_dir (string): A directory for saving results to. If
            ``None``, defaults to a temp directory of the form
            ``/tmp/experiments/somerandomnumber``.

        output_fname (string): Name for the tab-separated-value file
            containing metrics logged throughout a training run. Defaults
            to ``progress.txt``.

        exp_name (string): Experiment name. If you run multiple training
            runs and give them all the same ``exp_name``, the plotter will
            know to group them. (Use case: if you run the same
            hyperparameter configuration with multiple random seeds, you
            should give them all the same ``exp_name``.)
    """
    if proc_id() == 0:
        self.output_dir = output_dir or u"/tmp/experiments/%i" % int(time.time())
        if osp.exists(self.output_dir):
            print u"Warning: Log dir %s already exists! Storing info there anyway." % self.output_dir
        else:
            os.makedirs(self.output_dir)
        self.output_file = open(osp.join(self.output_dir, output_fname), u'w')
        atexit.register(self.output_file.close)
        print colorize(u"Logging data to %s" % self.output_file.name,
                       u'green', bold=True)
    else:
        self.output_dir = None
        self.output_file = None
    self.first_row = True
    self.log_headers = []
    self.log_current_row = {}
    self.exp_name = exp_name
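
# A minimal construction sketch (hedged: the directory and experiment name are
# illustrative, and assume this method belongs to the ``Logger`` class):
#
#     logger = Logger(output_dir=u'/tmp/experiments/ppo_test',
#                     exp_name=u'ppo_test')
#     logger.save_config(locals())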
def ppo(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    u"""
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph
            (a compatible sketch is given at the end of this docstring):

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action
            pairs) for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy
            objective. Roughly: how far can the new policy go from the old
            policy while still profiting (improving the objective function)?
            The new policy can still go farther than the clip_ratio says,
            but it doesn't help on the objective anymore. (Usually small,
            0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to
            take on policy loss per epoch. (Early stopping may cause
            optimizer to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is
            appropriate between new and old policies after an update. This
            will get used for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
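
    As one illustration, an ``actor_critic`` compatible with the interface
    above could look roughly like the following. This is only a hedged
    sketch, not the implementation of ``core.mlp_actor_critic``;
    ``gaussian_likelihood``, the layer sizes, and the Gaussian policy are
    illustrative choices. The ``pi``/``v`` variable scopes match what this
    function expects when counting and publishing parameters.

    .. code-block:: python

        def gaussian_likelihood(x, mu, log_std):
            # log prob of x under a diagonal Gaussian N(mu, exp(log_std)**2)
            pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + 1e-8))**2
                              + 2 * log_std + np.log(2 * np.pi))
            return tf.reduce_sum(pre_sum, axis=1)

        def my_actor_critic(x_ph, a_ph, action_space=None):
            act_dim = a_ph.shape.as_list()[-1]
            with tf.variable_scope(u'pi'):
                h = tf.layers.dense(x_ph, 64, activation=tf.tanh)
                mu = tf.layers.dense(h, act_dim)
                log_std = tf.get_variable(
                    u'log_std',
                    initializer=-0.5 * np.ones(act_dim, dtype=np.float32))
                pi = mu + tf.random_normal(tf.shape(mu)) * tf.exp(log_std)
                logp = gaussian_likelihood(a_ph, mu, log_std)
                logp_pi = gaussian_likelihood(pi, mu, log_std)
            with tf.variable_scope(u'v'):
                hv = tf.layers.dense(x_ph, 64, activation=tf.tanh)
                v = tf.squeeze(tf.layers.dense(hv, 1), axis=1)  # flatten to (batch,)
            return pi, logp, logp_pi, v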
""" #ros stuff name = rospy.get_name() + "/ppo_rl_agent" params_topic = rospy.get_param("~topics/params") params_pub = rospy.Publisher(params_topic, LearnedParameters) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs[u'action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: value and logprob (we'll get action from the env) get_action_ops = [v, logp] # logp instead of logp_pi since we know a # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in [u'pi', u'v']) logger.log(u'\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={u'x': x_ph}, outputs={ u'pi': pi, u'v': v }) def update(): inputs = dict((k, v) for k, v in izip(all_phs, buf.get())) pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in xrange(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( u'Early stopping at step %d due to reaching max kl.' 
                           % i)
                break
        logger.store(StopIter=i)
        for _ in xrange(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

        # Publish ROS parameters for the policy network
        params_msg = LearnedParameters()
        params = [
            sess.run(v).flatten() for v in tf.trainable_variables()
            if u"pi" in v.name
        ]
        num_params_in_msg = sum([len(p) for p in params])
        assert num_params_in_msg == core.count_vars(u'pi')
        for p in params:
            msg = Parameters()
            if isinstance(p, np.ndarray):
                msg.params = list(p)
            else:
                msg.params = [p]
            params_msg.params.append(msg)
        params_pub.publish(params_msg)

    # RUN THIS THING!
    start_time = time.time()
    # o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    env.reset()
    ep_ret = 0
    ep_len = 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in xrange(epochs):
        for t in xrange(local_steps_per_epoch):
            # The environment (not this agent) applies the action, so step()
            # returns the action it took along with the transition.
            o, r, a, d, _ = env.step()
            ep_ret += r
            ep_len += 1

            # Get value and log prob for the observed state-action pair
            v_t, logp_t = sess.run(get_action_ops,
                                   feed_dict={
                                       x_ph: o.reshape(1, -1),
                                       a_ph: a.reshape(1, -1)
                                   })

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not terminal:
                    print u'Warning: trajectory cut off by epoch at %d steps.' % ep_len
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                # NOTE: check this call
                # o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                env.reset()
                ep_ret = 0
                ep_len = 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({u'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular(u'Epoch', epoch)
        logger.log_tabular(u'EpRet', with_min_and_max=True)
        logger.log_tabular(u'EpLen', average_only=True)
        logger.log_tabular(u'VVals', with_min_and_max=True)
        logger.log_tabular(u'TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular(u'LossPi', average_only=True)
        logger.log_tabular(u'LossV', average_only=True)
        logger.log_tabular(u'DeltaLossPi', average_only=True)
        logger.log_tabular(u'DeltaLossV', average_only=True)
        logger.log_tabular(u'Entropy', average_only=True)
        logger.log_tabular(u'KL', average_only=True)
        logger.log_tabular(u'ClipFrac', average_only=True)
        logger.log_tabular(u'StopIter', average_only=True)
        logger.log_tabular(u'Time', time.time() - start_time)
        logger.dump_tabular()
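
# A hedged invocation sketch. Unlike the standard Gym API, the loop above
# assumes ``env.step()`` takes no argument and returns
# ``(obs, reward, action, done, info)``: the action is chosen externally (by
# the ROS side) and only observed here. ``make_ros_env`` and the keyword
# settings are illustrative, not part of this module:
#
#     rospy.init_node(u'ppo_rl_agent')
#     ppo(make_ros_env,
#         actor_critic=core.mlp_actor_critic,
#         steps_per_epoch=4000,
#         epochs=50,
#         logger_kwargs=dict(output_dir=u'/tmp/experiments/ppo_test',
#                            exp_name=u'ppo_test'))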
def log(self, msg, color=u'green'):
    u"""Print a colorized message to stdout."""
    if proc_id() == 0:
        print colorize(msg, color, bold=True)
def vpgpolynomial(env_fn,
                  actor_critic=core.polynomial_actor_critic,
                  ac_kwargs=dict(),
                  seed=0,
                  steps_per_epoch=4000,
                  epochs=50,
                  gamma=0.9,
                  pi_lr=2e-5,
                  vf_lr=1e-3,
                  train_v_iters=80,
                  lam=0.97,
                  max_ep_len=1000,
                  logger_kwargs=dict(),
                  save_freq=10,
                  l1_scaling=0.001):
    u"""
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to VPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action
            pairs) for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        l1_scaling (float): Coefficient on the L1 penalty applied to the
            policy parameters, encouraging sparse polynomial coefficients.
""" #ros stuff name = rospy.get_name() + "/ppo_rl_agent" params_topic = rospy.get_param("~topics/params") params_pub = rospy.Publisher(params_topic, LearnedParameters) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs[u'action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [v, logp] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in [u'pi', u'v']) logger.log(u'\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # VPG objectives var = [v for v in tf.trainable_variables() if u"pi" in v.name][0] norm_loss = l1_scaling * tf.norm(var, 1) pi_loss = -tf.reduce_mean(logp * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute # Optimizers pi_optim = MpiAdamOptimizer(learning_rate=pi_lr) train_pi = pi_optim.minimize(pi_loss) train_pi_norm = pi_optim.minimize(norm_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={u'x': x_ph}, outputs={ u'pi': pi, u'v': v }) def update(): inputs = dict((k, v) for k, v in izip(all_phs, buf.get())) pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # print "grads" # print sess.run(pi_optim.compute_gradients(pi_loss, tf.trainable_variables(u'pi')), # feed_dict=inputs) # Policy gradient step sess.run(train_pi, feed_dict=inputs) sess.run(train_pi_norm, feed_dict=inputs) #polynomial penalizing number of terms # with tf.variable_scope('pi'): # grads_and_vars = pi_optim.compute_gradients(tf.norm(tf.trainable_variables(),ord=1)) # pi_optim.apply_gradient(grads_and_vars) # Value function learning for _ in xrange(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, pi_l_norm = sess.run( [pi_loss, v_loss, approx_kl, norm_loss], feed_dict=inputs) logger.store(LossNorm=pi_l_norm, LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) # Publish ros parameters params_msg = LearnedParameters() params = [ sess.run(v).flatten() for v in tf.trainable_variables() if u"pi" in v.name ] num_params_in_msg = sum([len(p) for p in params]) assert (num_params_in_msg == core.count_vars(u'pi')) for p in params: msg = Parameters() if isinstance(p, np.ndarray): msg.params = list(p) else: msg.params = [p] params_msg.params.append(msg) params_pub.publish(params_msg) start_time 
    start_time = time.time()
    # o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    env.reset()
    ep_ret = 0
    ep_len = 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in xrange(epochs):
        for t in xrange(local_steps_per_epoch):
            o, r, a, d, _ = env.step()
            ep_ret += r
            ep_len += 1

            v_t, logp_t = sess.run(get_action_ops,
                                   feed_dict={
                                       x_ph: o.reshape(1, -1),
                                       a_ph: a.reshape(1, -1)
                                   })

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not terminal:
                    print u'Warning: trajectory cut off by epoch at %d steps.' % ep_len
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                _, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({u'env': env}, None)

        # Perform VPG update!
        update()

        # Log info about epoch
        logger.log_tabular(u'Epoch', epoch)
        logger.log_tabular(u'EpRet', with_min_and_max=True)
        logger.log_tabular(u'EpLen', average_only=True)
        logger.log_tabular(u'VVals', with_min_and_max=True)
        logger.log_tabular(u'TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular(u'LossPi', average_only=True)
        logger.log_tabular(u'LossV', average_only=True)
        logger.log_tabular(u'DeltaLossPi', average_only=True)
        logger.log_tabular(u'DeltaLossV', average_only=True)
        logger.log_tabular(u'Entropy', average_only=True)
        logger.log_tabular(u'KL', average_only=True)
        logger.log_tabular(u'Time', time.time() - start_time)
        logger.dump_tabular()
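
# A hedged design note on the L1 penalty in ``vpgpolynomial``: the penalty is
# minimized in its own optimizer step (``train_pi_norm``) after the policy
# gradient step. A closely related single-step alternative (not what the code
# above does, and not exactly equivalent because Adam keeps separate statistics
# per op) would fold the penalty into the policy loss:
#
#     reg_pi_loss = pi_loss + l1_scaling * tf.norm(var, 1)
#     train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(reg_pi_loss)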