def dump_tabular(self):
    """
    Write all of the diagnostics from the current iteration.

    Writes both to stdout and to the output file.
    """
    if proc_id() == 0:
        vals = []
        key_lens = [len(key) for key in self.log_headers]
        max_key_len = max(15, max(key_lens))
        keystr = '%' + '%d' % max_key_len
        fmt = "| " + keystr + "s | %15s |"
        n_slashes = 22 + max_key_len
        print("-" * n_slashes)
        for key in self.log_headers:
            val = self.log_current_row.get(key, "")
            valstr = "%8.3g" % val if hasattr(val, "__float__") else val
            print(fmt % (key, valstr))
            vals.append(val)
        print("-" * n_slashes, flush=True)
        if self.output_file is not None:
            if self.first_row:
                self.output_file.write("\t".join(self.log_headers) + "\n")
            self.output_file.write("\t".join(map(str, vals)) + "\n")
            self.output_file.flush()
    self.log_current_row.clear()
    self.first_row = False
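# Hedged usage sketch (not part of the original file): a minimal diagnostics
# loop built on dump_tabular. The directory and metric keys are illustrative;
# log_tabular (used later in run_polopt_agent) fills log_current_row.
#
#     logger = Logger(output_dir='/tmp/experiments/demo')
#     for epoch in range(3):
#         logger.log_tabular('Epoch', epoch)
#         logger.log_tabular('AverageReturn', 0.0)
#         logger.dump_tabular()  # prints a table and appends a TSV row
#
# Note that the set of keys should stay fixed after the first dump, since the
# header row of progress.txt is written only while `first_row` is True.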
def save_state(self, state_dict, itr=None):
    """
    Saves the state of an experiment.

    To be clear: this is about saving *state*, not logging diagnostics.
    All diagnostic logging is separate from this function. This function
    will save whatever is in ``state_dict``---usually just a copy of the
    environment---and the most recent parameters for the model you
    previously set up saving for with ``setup_tf_saver``.

    Call with any frequency you prefer. If you only want to maintain a
    single state and overwrite it at each call with the most recent
    version, leave ``itr=None``. If you want to keep all of the states
    you save, provide unique (increasing) values for ``itr``.

    Args:
        state_dict (dict): Dictionary containing essential elements to
            describe the current state of training.

        itr: An int, or None. Current iteration of training.
    """
    if proc_id() == 0:
        fname = 'vars.pkl' if itr is None else 'vars%d.pkl' % itr
        try:
            joblib.dump(state_dict, osp.join(self.output_dir, fname))
        except Exception:
            self.log('Warning: could not pickle state_dict.', color='red')
        if hasattr(self, 'tf_saver_elements'):
            self._tf_simple_save(itr)
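# Hedged usage sketch: saving environment state once per epoch, either
# overwriting a single snapshot or keeping one file per iteration.
#
#     logger.save_state({'env': env})             # always rewrites vars.pkl
#     logger.save_state({'env': env}, itr=epoch)  # writes vars<epoch>.pkl
#
# If setup_tf_saver was called earlier, each call also triggers
# _tf_simple_save to export the current model parameters.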
def save_config(self, config):
    """
    Log an experiment configuration.

    Call this once at the top of your experiment, passing in all important
    config vars as a dict. This will serialize the config to JSON, while
    handling anything which can't be serialized in a graceful way (writing
    as informative a string as possible).

    Example use:

    .. code-block:: python

        logger = EpochLogger(**logger_kwargs)
        logger.save_config(locals())
    """
    config_json = convert_json(config)
    if self.exp_name is not None:
        config_json['exp_name'] = self.exp_name
    if proc_id() == 0:
        output = json.dumps(config_json, separators=(',', ':\t'),
                            indent=4, sort_keys=True)
        print(colorize('Saving config:\n', color='cyan', bold=True))
        print(output)
        with open(osp.join(self.output_dir, "config.json"), 'w') as out:
            out.write(output)
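# Illustrative sketch of the output (an assumption about convert_json's exact
# behavior, which lives elsewhere in this repo): for a config like
# dict(seed=0, steps_per_epoch=4000, env_fn=<function>), the saved
# config.json would look roughly like:
#
#     {
#         "env_fn": "<function env_fn at 0x...>",
#         "exp_name": "cpo_test",
#         "seed": 0,
#         "steps_per_epoch": 4000
#     }
#
# Non-serializable entries are replaced by informative strings rather than
# causing json.dumps to fail.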
def _tf_simple_save(self, itr=None):
    """
    Uses simple_save to save a trained model, plus info to make it easy
    to associate tensors with variables after restore.
    """
    if proc_id() == 0:
        assert hasattr(self, 'tf_saver_elements'), \
            "First have to setup saving with self.setup_tf_saver"
        fpath = 'simple_save' + ('%d' % itr if itr is not None else '')
        fpath = osp.join(self.output_dir, fpath)
        if osp.exists(fpath):
            # simple_save refuses to be useful if fpath already exists,
            # so just delete fpath if it's there.
            shutil.rmtree(fpath)
        tf.saved_model.simple_save(export_dir=fpath, **self.tf_saver_elements)
        joblib.dump(self.tf_saver_info, osp.join(fpath, 'model_info.pkl'))
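# Hedged sketch of the restore side, assuming TF1's SavedModel loader API
# (variable names are illustrative). The model_info.pkl written above should
# carry the tensor names for the inputs/outputs given to setup_tf_saver:
#
#     sess = tf.Session()
#     tf.saved_model.loader.load(sess,
#                                [tf.saved_model.tag_constants.SERVING],
#                                fpath)
#     info = joblib.load(osp.join(fpath, 'model_info.pkl'))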
def __init__(self, output_dir=None, output_fname='progress.txt',
             exp_name=None):
    """
    Initialize a Logger.

    Args:
        output_dir (string): A directory for saving results to. If
            ``None``, defaults to a temp directory of the form
            ``/tmp/experiments/somerandomnumber``.

        output_fname (string): Name for the tab-separated-value file
            containing metrics logged throughout a training run.
            Defaults to ``progress.txt``.

        exp_name (string): Experiment name. If you run multiple training
            runs and give them all the same ``exp_name``, the plotter
            will know to group them. (Use case: if you run the same
            hyperparameter configuration with multiple random seeds, you
            should give them all the same ``exp_name``.)
    """
    if proc_id() == 0:
        self.output_dir = output_dir or "/tmp/experiments/%i" % int(time.time())
        if osp.exists(self.output_dir):
            print("Warning: Log dir %s already exists! Storing info there anyway."
                  % self.output_dir)
        else:
            os.makedirs(self.output_dir)
        self.output_file = open(osp.join(self.output_dir, output_fname), 'w')
        atexit.register(self.output_file.close)
        print(colorize("Logging data to %s" % self.output_file.name,
                       'green', bold=True))
    else:
        self.output_dir = None
        self.output_file = None
    self.first_row = True
    self.log_headers = []
    self.log_current_row = {}
    self.exp_name = exp_name
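# Hedged usage sketch: constructing a Logger directly. The path and
# experiment name are illustrative.
#
#     logger = Logger(output_dir='/tmp/experiments/cpo_run',
#                     output_fname='progress.txt',
#                     exp_name='cpo_seed0')
#
# Under MPI, only process 0 opens the output file and directory; the other
# processes get output_dir=None and output_file=None, so only process 0
# actually writes anything to disk.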
def run_polopt_agent(env_fn,
                     agent=CPOAgent(),
                     actor_critic=mlp_actor_critic,
                     ac_kwargs=dict(),
                     seed=0,
                     render=True,
                     # Experience collection:
                     steps_per_epoch=4000,
                     epochs=50,
                     max_ep_len=1000,
                     # Discount factors:
                     gamma=0.99,
                     lam=0.97,
                     cost_gamma=0.99,
                     cost_lam=0.97,
                     # Policy learning:
                     ent_reg=0.,
                     # Cost constraints / penalties:
                     cost_lim=25,
                     cost_lim_end=25,
                     penalty_init=1.,
                     penalty_lr=5e-2,
                     # KL divergence:
                     target_kl=0.01,
                     # Value learning:
                     vf_lr=1e-3,
                     vf_iters=80,
                     # Logging:
                     logger=None,
                     logger_kwargs=dict(),
                     save_freq=1,
                     # Adaptive parameter noise:
                     param_noise=None,
                     normalize_observations=True,
                     observation_range=(-5., 5.),
                     ):

    #=========================================================================#
    #  Prepare logger, seed, and environment in this process                  #
    #=========================================================================#

    logger = EpochLogger(**logger_kwargs) if logger is None else logger
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()

    agent.set_logger(logger)

    #=========================================================================#
    #  Create computation graph for actor and critic (not training routine)   #
    #=========================================================================#

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph from environment spaces
    x_ph, a_ph = placeholders_from_spaces(env.observation_space, env.action_space)

    # Inputs to computation graph for batch data
    adv_ph, cadv_ph, ret_ph, cret_ph, logp_old_ph = placeholders(*(None for _ in range(5)))

    # Inputs to computation graph for special purposes
    surr_cost_rescale_ph = tf.placeholder(tf.float32, shape=())
    cur_cost_ph = tf.placeholder(tf.float32, shape=())

    # Observation normalization
    if normalize_observations:
        with tf.variable_scope('obs_rms'):
            obs_rms = RunningMeanStd(shape=env.observation_space.shape)
        x_ph = tf.clip_by_value(normalize(x_ph, obs_rms),
                                observation_range[0], observation_range[1])
    else:
        obs_rms = None

    # Outputs from actor critic
    ac_outs = actor_critic(x_ph, a_ph, **ac_kwargs)
    pi, logp, logp_pi, pi_info, pi_info_phs, d_kl, ent, v, vc = ac_outs

    # Organize placeholders for zipping with data from buffer on updates
    buf_phs = [x_ph, a_ph, adv_ph, cadv_ph, ret_ph, cret_ph, logp_old_ph]
    buf_phs += values_as_sorted_list(pi_info_phs)

    # Organize symbols we have to compute at each step of acting in env
    get_action_ops = dict(pi=pi,
                          v=v,
                          logp_pi=logp_pi,
                          pi_info=pi_info)

    # If agent is reward penalized, it doesn't use a separate value function
    # for costs and we don't need to include it in get_action_ops; otherwise we do.
    if not agent.reward_penalized:
        get_action_ops['vc'] = vc

    # Count variables
    var_counts = tuple(count_vars(scope) for scope in ['pi', 'vf', 'vc'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d, \t vc: %d\n' % var_counts)

    # Make a sample estimate for entropy to use as sanity check
    approx_ent = tf.reduce_mean(-logp)

    #=========================================================================#
    #  Create replay buffer                                                   #
    #=========================================================================#

    # Obs/act shapes
    obs_shape = env.observation_space.shape
    act_shape = env.action_space.shape

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    pi_info_shapes = {k: v.shape.as_list()[1:] for k, v in pi_info_phs.items()}
    buf = CPOBuffer(local_steps_per_epoch,
                    obs_shape,
                    act_shape,
                    pi_info_shapes,
                    gamma,
                    lam,
                    cost_gamma,
                    cost_lam)

    #=========================================================================#
    #  Create computation graph for policy learning                           #
    #=========================================================================#

    # Likelihood ratio
    ratio = tf.exp(logp - logp_old_ph)

    # Surrogate advantage / clipped surrogate advantage
    if agent.clipped_adv:
        min_adv = tf.where(adv_ph > 0,
                           (1 + agent.clip_ratio) * adv_ph,
                           (1 - agent.clip_ratio) * adv_ph)
        surr_adv = tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    else:
        surr_adv = tf.reduce_mean(ratio * adv_ph)

    # Surrogate cost (advantage)
    surr_cost = tf.reduce_mean(ratio * cadv_ph)

    # Create policy objective function, including entropy regularization
    pi_objective = surr_adv + ent_reg * ent

    # Loss function for pi is negative of pi_objective
    pi_loss = -pi_objective

    # Optimizer-specific symbols
    if agent.trust_region:

        # Symbols needed for CG solver for any trust region method
        pi_params = get_vars('pi')
        flat_g = tro.flat_grad(pi_loss, pi_params)
        v_ph, hvp = tro.hessian_vector_product(d_kl, pi_params)
        if agent.damping_coeff > 0:
            hvp += agent.damping_coeff * v_ph

        # Symbols needed for CG solver for CPO only
        flat_b = tro.flat_grad(surr_cost, pi_params)

        # Symbols for getting and setting params
        get_pi_params = tro.flat_concat(pi_params)
        set_pi_params = tro.assign_params_from_flat(v_ph, pi_params)

        training_package = dict(flat_g=flat_g,
                                flat_b=flat_b,
                                v_ph=v_ph,
                                hvp=hvp,
                                get_pi_params=get_pi_params,
                                set_pi_params=set_pi_params)

    elif agent.first_order:

        # Optimizer for first-order policy optimization
        train_pi = MpiAdamOptimizer(learning_rate=agent.pi_lr).minimize(pi_loss)

        # Prepare training package for agent
        training_package = dict(train_pi=train_pi)

    else:
        raise NotImplementedError

    # Provide training package to agent
    training_package.update(dict(pi_loss=pi_loss,
                                 surr_cost=surr_cost,
                                 d_kl=d_kl,
                                 target_kl=target_kl,
                                 cost_lim=cost_lim))
    agent.prepare_update(training_package)
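    # Hedged sketch (not from the original source) of how an agent typically
    # consumes the trust-region symbols packaged above: conjugate gradient
    # solves H x = g using only Hessian-vector products, then the step is
    # rescaled to satisfy the KL constraint. `Hx` stands for a function that
    # feeds a vector to `v_ph` and evaluates `hvp`.
    #
    #     def cg(Hx, g, iters=10):
    #         x = np.zeros_like(g)
    #         r = g.copy()          # residual: g - H @ x, with x = 0
    #         p = r.copy()
    #         r_dot_old = np.dot(r, r)
    #         for _ in range(iters):
    #             z = Hx(p)
    #             alpha = r_dot_old / (np.dot(p, z) + 1e-8)
    #             x += alpha * p
    #             r -= alpha * z
    #             r_dot_new = np.dot(r, r)
    #             p = r + (r_dot_new / r_dot_old) * p
    #             r_dot_old = r_dot_new
    #         return x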
    #=========================================================================#
    #  Create computation graph for value learning                            #
    #=========================================================================#

    # Value losses
    v_loss = tf.reduce_mean((ret_ph - v)**2)
    vc_loss = tf.reduce_mean((cret_ph - vc)**2)

    # If agent uses penalty directly in reward function, don't train a separate
    # value function for predicting cost returns. (Only use one vf for r - p*c.)
    if agent.reward_penalized:
        total_value_loss = v_loss
    else:
        total_value_loss = v_loss + vc_loss

    # Optimizer for value learning
    train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(total_value_loss)

    #=========================================================================#
    #  Create session, sync across procs, and set up saver                    #
    #=========================================================================#

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v, 'vc': vc})

    #=========================================================================#
    #  Provide session to agent                                               #
    #=========================================================================#
    agent.prepare_session(sess)

    #=========================================================================#
    #  Create function for running update (called at end of each epoch)       #
    #=========================================================================#

    def update():

        cur_cost = logger.get_stats('EpCost')[0]

        # Anneal the cost limit linearly from cost_lim to cost_lim_end over
        # training; rand_cost can add noise to the limit, and zero disables it.
        rand_cost = 0
        cur_cost_lim = cost_lim - epoch * (cost_lim - cost_lim_end) / epochs \
                       + randint(0, rand_cost)
        c = cur_cost - cur_cost_lim
        if c > 0 and agent.cares_about_cost:
            logger.log('Warning! Safety constraint is already violated.', 'red')

        #=====================================================================#
        #  Prepare feed dict                                                  #
        #=====================================================================#

        inputs = {k: v for k, v in zip(buf_phs, buf.get())}
        inputs[surr_cost_rescale_ph] = logger.get_stats('EpLen')[0]
        inputs[cur_cost_ph] = cur_cost

        #=====================================================================#
        #  Make some measurements before updating                             #
        #=====================================================================#

        measures = dict(LossPi=pi_loss,
                        SurrCost=surr_cost,
                        LossV=v_loss,
                        Entropy=ent)
        if not agent.reward_penalized:
            measures['LossVC'] = vc_loss

        pre_update_measures = sess.run(measures, feed_dict=inputs)
        logger.store(**pre_update_measures)

        #=====================================================================#
        #  Update cost limit (@mo creation)                                   #
        #=====================================================================#

        # Provide the annealed cost limit to the agent before updating
        training_package["cost_lim"] = cur_cost_lim
        agent.prepare_update(training_package)

        #=====================================================================#
        #  Update policy                                                      #
        #=====================================================================#
        agent.update_pi(inputs)

        #=====================================================================#
        #  Update value function                                              #
        #=====================================================================#
        for _ in range(vf_iters):
            sess.run(train_vf, feed_dict=inputs)

        #=====================================================================#
        #  Make some measurements after updating                              #
        #=====================================================================#

        del measures['Entropy']
        measures['KL'] = d_kl

        post_update_measures = sess.run(measures, feed_dict=inputs)
        deltas = dict()
        for k in post_update_measures:
            if k in pre_update_measures:
                deltas['Delta' + k] = post_update_measures[k] - pre_update_measures[k]
        logger.store(KL=post_update_measures['KL'], **deltas)

    #=========================================================================#
    #  Run main environment interaction loop                                  #
    #=========================================================================#

    start_time = time.time()
    o, r, d, c, ep_ret, ep_cost, ep_len = env.reset(), 0, False, 0, 0, 0, 0
    cur_penalty = 0
    cum_cost = 0
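    # Hedged note on the cost-limit schedule used in update() above: the limit
    # moves linearly from cost_lim at epoch 0 toward cost_lim_end at the final
    # epoch. A standalone sketch (parameter values illustrative):
    #
    #     def annealed_limit(epoch, epochs, cost_lim=50, cost_lim_end=25):
    #         return cost_lim - epoch * (cost_lim - cost_lim_end) / epochs
    #
    #     # annealed_limit(0, 50)   -> 50.0
    #     # annealed_limit(25, 50)  -> 37.5
    #
    # With the defaults here (cost_lim == cost_lim_end == 25) the schedule is
    # constant.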
    for epoch in range(epochs):

        for t in range(local_steps_per_epoch):

            # Possibly render
            if render and proc_id() == 0 and t < 1000:
                env.render()

            # Get outputs from policy
            get_action_outs = sess.run(get_action_ops,
                                       feed_dict={x_ph: o[np.newaxis]})
            a = get_action_outs['pi']
            v_t = get_action_outs['v']
            vc_t = get_action_outs.get('vc', 0)  # Agent may not use cost value func
            logp_t = get_action_outs['logp_pi']
            pi_info_t = get_action_outs['pi_info']

            # Step in environment
            o2, r, d, info = env.step(a)

            # Include penalty on cost
            c = info.get('cost', 0)

            # Track cumulative cost over training
            cum_cost += c

            # Save and log
            buf.store(o, a, r, v_t, c, vc_t, logp_t, pi_info_t)
            logger.store(VVals=v_t, CostVVals=vc_t)

            o = o2
            ep_ret += r
            ep_cost += c
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):

                # If trajectory didn't reach terminal state, bootstrap value target(s)
                if d and not (ep_len == max_ep_len):
                    # Note: we do not count env time out as true terminal state
                    last_val, last_cval = 0, 0
                else:
                    feed_dict = {x_ph: o[np.newaxis]}
                    if agent.reward_penalized:
                        last_val = sess.run(v, feed_dict=feed_dict)
                        last_cval = 0
                    else:
                        last_val, last_cval = sess.run([v, vc], feed_dict=feed_dict)
                buf.finish_path(last_val, last_cval)

                # Only save EpRet / EpLen if trajectory finished
                if terminal:
                    logger.store(EpRet=ep_ret, EpLen=ep_len, EpCost=ep_cost)
                else:
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)

                # Reset environment
                o, r, d, c, ep_ret, ep_len, ep_cost = env.reset(), 0, False, 0, 0, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        #=====================================================================#
        #  Run RL update                                                      #
        #=====================================================================#
        update()

        #=====================================================================#
        #  Cumulative cost calculations                                       #
        #=====================================================================#
        cumulative_cost = mpi_sum(cum_cost)
        cost_rate = cumulative_cost / ((epoch + 1) * steps_per_epoch)

        #=====================================================================#
        #  Log performance and stats                                          #
        #=====================================================================#

        logger.log_tabular('Epoch', epoch)

        # Performance stats
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpCost', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('CumulativeCost', cumulative_cost)
        logger.log_tabular('CostRate', cost_rate)

        # Value function values
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('CostVVals', with_min_and_max=True)

        # Pi loss and change
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)

        # Surr cost and change
        logger.log_tabular('SurrCost', average_only=True)
        logger.log_tabular('DeltaSurrCost', average_only=True)

        # V loss and change
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)

        # Vc loss and change, if applicable (reward_penalized agents don't use vc)
        if not agent.reward_penalized:
            logger.log_tabular('LossVC', average_only=True)
            logger.log_tabular('DeltaLossVC', average_only=True)

        if agent.use_penalty or agent.save_penalty:
            logger.log_tabular('Penalty', average_only=True)
            logger.log_tabular('DeltaPenalty', average_only=True)
        else:
            logger.log_tabular('Penalty', 0)
            logger.log_tabular('DeltaPenalty', 0)

        # Anything from the agent?
        agent.log()

        # Policy stats
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)

        # Time and steps elapsed
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('Time', time.time() - start_time)

        # Show results!
        logger.dump_tabular()
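# Hedged usage sketch: launching a run. The environment id and keyword values
# are illustrative assumptions (any Gym-style env_fn works), not fixed by
# this function.
#
#     import gym
#     run_polopt_agent(lambda: gym.make('Safexp-PointGoal1-v0'),
#                      agent=CPOAgent(),
#                      steps_per_epoch=4000,
#                      epochs=50,
#                      cost_lim=25,
#                      logger_kwargs=dict(output_dir='/tmp/experiments/cpo',
#                                         exp_name='cpo_pointgoal'))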
def log(self, msg, color='green'):
    """Print a colorized message to stdout."""
    if proc_id() == 0:
        print(colorize(msg, color, bold=True))
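# Hedged usage sketch (message text illustrative):
#
#     logger.log('Starting epoch %d' % epoch)           # default green, bold
#     logger.log('Constraint violated!', color='red')   # warning style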