def main(args):
    """
    Start training the model.

    :param args: (argparse.Namespace) the parsed training arguments
    """
    with tf_util.make_session(num_cpu=1):
        set_global_seeds(args.seed)
        env = gym.make(args.env_id)

        def policy_fn(name, ob_space, ac_space, reuse=False, placeholders=None, sess=None):
            return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                        reuse=reuse, sess=sess,
                                        hid_size=args.policy_hidden_size, num_hid_layers=2,
                                        placeholders=placeholders)

        env = bench.Monitor(env, logger.get_dir() and
                            os.path.join(logger.get_dir(), "monitor.json"))
        env.seed(args.seed)
        gym.logger.setLevel(logging.WARN)

        # Build the (long) task name and derive the checkpoint/log directories from it.
        task_name = get_task_name(args)
        args.checkpoint_dir = os.path.join(args.checkpoint_dir, task_name)
        args.log_dir = os.path.join(args.log_dir, task_name)

        if args.task == 'train':
            dataset = MujocoDset(expert_path=args.expert_path,
                                 traj_limitation=args.traj_limitation)
            # Create the discriminator network.
            reward_giver = TransitionClassifier(env, args.adversary_hidden_size,
                                                entcoeff=args.adversary_entcoeff)
            # Train the policy network.
            # Note that the policy network is constructed by policy_fn above.
            train(env, args.seed, policy_fn, reward_giver, dataset, args.algo,
                  args.g_step, args.d_step, args.policy_entcoeff, args.num_timesteps,
                  args.save_per_iter, args.checkpoint_dir, args.pretrained,
                  args.bc_max_iter, task_name)
        elif args.task == 'evaluate':
            # Used to evaluate a trained model. (TODO: revisit this path in more detail later.)
            runner(env, policy_fn, args.load_model_path,
                   timesteps_per_batch=1024, number_trajs=10,
                   stochastic_policy=args.stochastic_policy, save=args.save_sample)
        else:
            raise NotImplementedError
        env.close()
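For completeness, here is a minimal argparse sketch of how this entry point could be invoked. This is not the project's original argument parser; the flag names simply mirror the attributes that main() reads, and all defaults are placeholders.

# Hypothetical argument parser: flag names mirror the attributes read by main();
# defaults are placeholders, not the project's original values.
import argparse

def build_argument_parser():
    parser = argparse.ArgumentParser(description="Train or evaluate a GAIL policy")
    parser.add_argument('--env_id', type=str, default='Hopper-v2')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--task', type=str, choices=['train', 'evaluate'], default='train')
    parser.add_argument('--algo', type=str, default='trpo')
    parser.add_argument('--expert_path', type=str, default='expert_data.npz')
    parser.add_argument('--traj_limitation', type=int, default=-1)
    parser.add_argument('--policy_hidden_size', type=int, default=100)
    parser.add_argument('--adversary_hidden_size', type=int, default=100)
    parser.add_argument('--policy_entcoeff', type=float, default=0.0)
    parser.add_argument('--adversary_entcoeff', type=float, default=1e-3)
    parser.add_argument('--g_step', type=int, default=3)
    parser.add_argument('--d_step', type=int, default=1)
    parser.add_argument('--num_timesteps', type=int, default=int(5e6))
    parser.add_argument('--save_per_iter', type=int, default=100)
    parser.add_argument('--checkpoint_dir', type=str, default='checkpoint')
    parser.add_argument('--log_dir', type=str, default='log')
    parser.add_argument('--pretrained', action='store_true')
    parser.add_argument('--bc_max_iter', type=int, default=10000)
    parser.add_argument('--load_model_path', type=str, default=None)
    parser.add_argument('--stochastic_policy', action='store_true')
    parser.add_argument('--save_sample', action='store_true')
    return parser

if __name__ == '__main__':
    main(build_argument_parser().parse_args())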
class TRPO(ActorCriticRLModel): """ Trust Region Policy Optimization (https://arxiv.org/abs/1502.05477) :param policy: (ActorCriticPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, CnnLstmPolicy, ...) :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) :param gamma: (float) the discount value :param timesteps_per_batch: (int) the number of timesteps to run per batch (horizon) :param max_kl: (float) the kullback leiber loss threshold :param cg_iters: (int) the number of iterations for the conjugate gradient calculation :param lam: (float) GAE factor :param entcoeff: (float) the weight for the entropy loss :param cg_damping: (float) the compute gradient dampening factor :param vf_stepsize: (float) the value function stepsize :param vf_iters: (int) the value function's number iterations for learning :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation :param full_tensorboard_log: (bool) enable additional logging when using tensorboard WARNING: this logging can take a lot of space quickly """ def __init__(self, policy, env, gamma=0.99, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, lam=0.98, entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False): super(TRPO, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=False, _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs) self.using_gail = False self.timesteps_per_batch = timesteps_per_batch self.cg_iters = cg_iters self.cg_damping = cg_damping self.gamma = gamma self.lam = lam self.max_kl = max_kl self.vf_iters = vf_iters self.vf_stepsize = vf_stepsize self.entcoeff = entcoeff self.tensorboard_log = tensorboard_log self.full_tensorboard_log = full_tensorboard_log # GAIL Params self.hidden_size_adversary = 100 self.adversary_entcoeff = 1e-3 self.expert_dataset = None self.g_step = 1 self.d_step = 1 self.d_stepsize = 3e-4 self.graph = None self.sess = None self.policy_pi = None self.loss_names = None self.assign_old_eq_new = None self.compute_losses = None self.compute_lossandgrad = None self.compute_fvp = None self.compute_vflossandgrad = None self.d_adam = None self.vfadam = None self.get_flat = None self.set_from_flat = None self.timed = None self.allmean = None self.nworkers = None self.rank = None self.reward_giver = None self.step = None self.proba_step = None self.initial_state = None self.params = None self.summary = None self.episode_reward = None if _init_setup_model: self.setup_model() def _get_pretrain_placeholders(self): policy = self.policy_pi action_ph = policy.pdtype.sample_placeholder([None]) if isinstance(self.action_space, gym.spaces.Discrete): return policy.obs_ph, action_ph, policy.policy return policy.obs_ph, action_ph, policy.deterministic_action def setup_model(self): # prevent import loops from stable_baselines.gail.adversary import TransitionClassifier with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \ "an instance of common.policies.ActorCriticPolicy." 
self.nworkers = MPI.COMM_WORLD.Get_size() self.rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.single_threaded_session(graph=self.graph) if self.using_gail: self.reward_giver = TransitionClassifier( self.observation_space, self.action_space, self.hidden_size_adversary, entcoeff=self.adversary_entcoeff) # Construct network for new policy self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for old policy with tf.variable_scope("oldpi", reuse=False): old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) with tf.variable_scope("loss", reuse=False): atarg = tf.placeholder(dtype=tf.float32, shape=[ None ]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return observation = self.policy_pi.obs_ph action = self.policy_pi.pdtype.sample_placeholder([None]) kloldnew = old_policy.proba_distribution.kl( self.policy_pi.proba_distribution) ent = self.policy_pi.proba_distribution.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = self.entcoeff * meanent vferr = tf.reduce_mean( tf.square(self.policy_pi.value_fn[:, 0] - ret)) # advantage * pnew / pold ratio = tf.exp( self.policy_pi.proba_distribution.logp(action) - old_policy.proba_distribution.logp(action)) surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] self.loss_names = [ "optimgain", "meankl", "entloss", "surrgain", "entropy" ] dist = meankl all_var_list = tf_util.get_trainable_vars("model") var_list = [ v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name ] vf_var_list = [ v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name ] self.get_flat = tf_util.GetFlat(var_list, sess=self.sess) self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: var_size = tf_util.intprod(shape) tangents.append( tf.reshape(flat_tangent[start:start + var_size], shape)) start += var_size gvp = tf.add_n([ tf.reduce_sum(grad * tangent) for (grad, tangent) in zipsame(klgrads, tangents) ]) # pylint: disable=E1111 fvp = tf_util.flatgrad(gvp, var_list) tf.summary.scalar('entropy_loss', meanent) tf.summary.scalar('policy_gradient_loss', optimgain) tf.summary.scalar('value_function_loss', surrgain) tf.summary.scalar('approximate_kullback-leiber', meankl) tf.summary.scalar( 'loss', optimgain + meankl + entbonus + surrgain + meanent) self.assign_old_eq_new = \ tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(tf_util.get_globals_vars("oldpi"), tf_util.get_globals_vars("model"))]) self.compute_losses = tf_util.function( [observation, old_policy.obs_ph, action, atarg], losses) self.compute_fvp = tf_util.function([ flat_tangent, observation, old_policy.obs_ph, action, atarg ], fvp) self.compute_vflossandgrad = tf_util.function( [observation, old_policy.obs_ph, ret], tf_util.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if self.rank == 0 and self.verbose >= 1: print(colorize(msg, color='magenta')) start_time = time.time() yield print( 
colorize("done in {:.3f} seconds".format( (time.time() - start_time)), color='magenta')) else: yield def allmean(arr): assert isinstance(arr, np.ndarray) out = np.empty_like(arr) MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM) out /= self.nworkers return out tf_util.initialize(sess=self.sess) th_init = self.get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) self.set_from_flat(th_init) with tf.variable_scope("Adam_mpi", reuse=False): self.vfadam = MpiAdam(vf_var_list, sess=self.sess) if self.using_gail: self.d_adam = MpiAdam( self.reward_giver.get_trainable_variables(), sess=self.sess) self.d_adam.sync() self.vfadam.sync() with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize)) tf.summary.scalar('advantage', tf.reduce_mean(atarg)) tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl)) if self.full_tensorboard_log: tf.summary.histogram('discounted_rewards', ret) tf.summary.histogram('learning_rate', self.vf_stepsize) tf.summary.histogram('advantage', atarg) tf.summary.histogram('kl_clip_range', self.max_kl) if tf_util.is_image(self.observation_space): tf.summary.image('observation', observation) else: tf.summary.histogram('observation', observation) self.timed = timed self.allmean = allmean self.step = self.policy_pi.step self.proba_step = self.policy_pi.proba_step self.initial_state = self.policy_pi.initial_state self.params = find_trainable_variables("model") if self.using_gail: self.params.extend( self.reward_giver.get_trainable_variables()) self.summary = tf.summary.merge_all() self.compute_lossandgrad = \ tf_util.function([observation, old_policy.obs_ph, action, atarg, ret], [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses) def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="TRPO", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) with self.sess.as_default(): seg_gen = traj_segment_generator( self.policy_pi, self.env, self.timesteps_per_batch, reward_giver=self.reward_giver, gail=self.using_gail) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 t_start = time.time() len_buffer = deque( maxlen=40) # rolling buffer for episode lengths reward_buffer = deque( maxlen=40) # rolling buffer for episode rewards self.episode_reward = np.zeros((self.n_envs, )) true_reward_buffer = None if self.using_gail: true_reward_buffer = deque(maxlen=40) # Initialize dataloader batchsize = self.timesteps_per_batch // self.d_step self.expert_dataset.init_dataloader(batchsize) # Stats not used for now # TODO: replace with normal tb logging # g_loss_stats = Stats(loss_names) # d_loss_stats = Stats(reward_giver.loss_name) # ep_stats = Stats(["True_rewards", "Rewards", "Episode_length"]) while True: if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. 
if callback(locals(), globals()) is False: break if total_timesteps and timesteps_so_far >= total_timesteps: break logger.log("********** Iteration %i ************" % iters_so_far) def fisher_vector_product(vec): return self.allmean( self.compute_fvp( vec, *fvpargs, sess=self.sess)) + self.cg_damping * vec # ------------------ Update G ------------------ logger.log("Optimizing Policy...") # g_step = 1 when not using GAIL mean_losses = None vpredbefore = None tdlamret = None observation = None action = None seg = None for k in range(self.g_step): with self.timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, self.gamma, self.lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) observation, action, atarg, tdlamret = seg["ob"], seg[ "ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg[ "vpred"] # predicted value function before update atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate # true_rew is the reward without discount if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, seg["true_rew"].reshape( (self.n_envs, -1)), seg["dones"].reshape( (self.n_envs, -1)), writer, self.num_timesteps) args = seg["ob"], seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] self.assign_old_eq_new(sess=self.sess) with self.timed("computegrad"): steps = self.num_timesteps + (k + 1) * ( seg["total_timestep"] / self.g_step) run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata( ) if self.full_tensorboard_log else None # run loss backprop with summary, and save the metadata (memory, compute time, ...) if writer is not None: summary, grad, *lossbefore = self.compute_lossandgrad( *args, tdlamret, sess=self.sess, options=run_options, run_metadata=run_metadata) if self.full_tensorboard_log: writer.add_run_metadata( run_metadata, 'step%d' % steps) writer.add_summary(summary, steps) else: _, grad, *lossbefore = self.compute_lossandgrad( *args, tdlamret, sess=self.sess, options=run_options, run_metadata=run_metadata) lossbefore = self.allmean(np.array(lossbefore)) grad = self.allmean(grad) if np.allclose(grad, 0): logger.log("Got zero gradient. not updating") else: with self.timed("conjugate_gradient"): stepdir = conjugate_gradient( fisher_vector_product, grad, cg_iters=self.cg_iters, verbose=self.rank == 0 and self.verbose >= 1) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot( fisher_vector_product(stepdir)) # abs(shs) to avoid taking square root of negative values lagrange_multiplier = np.sqrt( abs(shs) / self.max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lagrange_multiplier expectedimprove = grad.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = self.get_flat() thnew = None for _ in range(10): thnew = thbefore + fullstep * stepsize self.set_from_flat(thnew) mean_losses = surr, kl_loss, *_ = self.allmean( np.array( self.compute_losses(*args, sess=self.sess))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(mean_losses).all(): logger.log( "Got non-finite value of losses -- bad!" ) elif kl_loss > self.max_kl * 1.5: logger.log( "violated KL constraint. shrinking step." ) elif improve < 0: logger.log( "surrogate didn't improve. shrinking step." 
) else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") self.set_from_flat(thbefore) if self.nworkers > 1 and iters_so_far % 20 == 0: # list of tuples paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), self.vfadam.getflat().sum())) assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) with self.timed("vf"): for _ in range(self.vf_iters): # NOTE: for recurrent policies, use shuffle=False? for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=128, shuffle=True): grad = self.allmean( self.compute_vflossandgrad( mbob, mbob, mbret, sess=self.sess)) self.vfadam.update(grad, self.vf_stepsize) for (loss_name, loss_val) in zip(self.loss_names, mean_losses): logger.record_tabular(loss_name, loss_val) logger.record_tabular( "explained_variance_tdlam_before", explained_variance(vpredbefore, tdlamret)) if self.using_gail: # ------------------ Update D ------------------ logger.log("Optimizing Discriminator...") logger.log(fmt_row(13, self.reward_giver.loss_name)) assert len(observation) == self.timesteps_per_batch batch_size = self.timesteps_per_batch // self.d_step # NOTE: uses only the last g step for observation d_losses = [ ] # list of tuples, each of which gives the loss for a minibatch # NOTE: for recurrent policies, use shuffle=False? for ob_batch, ac_batch in dataset.iterbatches( (observation, action), include_final_partial_batch=False, batch_size=batch_size, shuffle=True): ob_expert, ac_expert = self.expert_dataset.get_next_batch( ) # update running mean/std for reward_giver if self.reward_giver.normalize: self.reward_giver.obs_rms.update( np.concatenate((ob_batch, ob_expert), 0)) # Reshape actions if needed when using discrete actions if isinstance(self.action_space, gym.spaces.Discrete): if len(ac_batch.shape) == 2: ac_batch = ac_batch[:, 0] if len(ac_expert.shape) == 2: ac_expert = ac_expert[:, 0] *newlosses, grad = self.reward_giver.lossandgrad( ob_batch, ac_batch, ob_expert, ac_expert) self.d_adam.update(self.allmean(grad), self.d_stepsize) d_losses.append(newlosses) logger.log(fmt_row(13, np.mean(d_losses, axis=0))) # lr: lengths and rewards lr_local = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]) # local values list_lr_pairs = MPI.COMM_WORLD.allgather( lr_local) # list of tuples lens, rews, true_rets = map(flatten_lists, zip(*list_lr_pairs)) true_reward_buffer.extend(true_rets) else: # lr: lengths and rewards lr_local = (seg["ep_lens"], seg["ep_rets"] ) # local values list_lr_pairs = MPI.COMM_WORLD.allgather( lr_local) # list of tuples lens, rews = map(flatten_lists, zip(*list_lr_pairs)) len_buffer.extend(lens) reward_buffer.extend(rews) if len(len_buffer) > 0: logger.record_tabular("EpLenMean", np.mean(len_buffer)) logger.record_tabular("EpRewMean", np.mean(reward_buffer)) if self.using_gail: logger.record_tabular("EpTrueRewMean", np.mean(true_reward_buffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) current_it_timesteps = MPI.COMM_WORLD.allreduce( seg["total_timestep"]) timesteps_so_far += current_it_timesteps self.num_timesteps += current_it_timesteps iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", self.num_timesteps) logger.record_tabular("TimeElapsed", time.time() - t_start) if self.verbose >= 1 and self.rank == 0: logger.dump_tabular() return self def save(self, save_path): if self.using_gail and self.expert_dataset is not None: # Exit processes to 
            # pickle the dataset
            self.expert_dataset.prepare_pickling()

        data = {
            "gamma": self.gamma,
            "timesteps_per_batch": self.timesteps_per_batch,
            "max_kl": self.max_kl,
            "cg_iters": self.cg_iters,
            "lam": self.lam,
            "entcoeff": self.entcoeff,
            "cg_damping": self.cg_damping,
            "vf_stepsize": self.vf_stepsize,
            "vf_iters": self.vf_iters,
            "hidden_size_adversary": self.hidden_size_adversary,
            "adversary_entcoeff": self.adversary_entcoeff,
            "expert_dataset": self.expert_dataset,
            "g_step": self.g_step,
            "d_step": self.d_step,
            "d_stepsize": self.d_stepsize,
            "using_gail": self.using_gail,
            "verbose": self.verbose,
            "policy": self.policy,
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "n_envs": self.n_envs,
            "_vectorize_action": self._vectorize_action,
            "policy_kwargs": self.policy_kwargs
        }

        params = self.sess.run(self.params)

        self._save_to_file(save_path, data=data, params=params)
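As a point of reference, here is a minimal usage sketch of this TRPO class in the stable-baselines 2.x style; the environment id and hyperparameter values are illustrative only.

# Usage sketch (assumes stable-baselines 2.x); the environment and
# hyperparameters are illustrative, not values prescribed by this code.
import gym
from stable_baselines import TRPO

env = gym.make('Pendulum-v0')
model = TRPO('MlpPolicy', env, timesteps_per_batch=1024, max_kl=0.01,
             cg_iters=10, lam=0.98, vf_iters=3, verbose=1)
model.learn(total_timesteps=25000)
model.save("trpo_pendulum")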
def setup_model(self): # prevent import loops from stable_baselines.gail.adversary import TransitionClassifier with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \ "an instance of common.policies.ActorCriticPolicy." self.nworkers = MPI.COMM_WORLD.Get_size() self.rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) if self.using_gail: self.reward_giver = TransitionClassifier(self.observation_space, self.action_space, self.hidden_size_adversary, entcoeff=self.adversary_entcoeff) # Penalty related variable with tf.variable_scope('penalty'): cur_cost_ph = tf.placeholder(dtype=tf.float32, shape=[None]) # episodic cost param_init = np.log(max(np.exp(self.penalty_init) - 1, 1e-8)) penalty_param = tf.get_variable('penalty_param', initializer=float(param_init), trainable=True, dtype=tf.float32) penalty = tf.nn.softplus(penalty_param) penalty_loss = tf.reduce_mean(-penalty_param * (cur_cost_ph - self.cost_lim)) # Construct network for new policy self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for old policy with tf.variable_scope("oldpi", reuse=False): old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # # Network for safety value function # with tf.variable_Scope("vc",reuse=False): # self.cost_value = MLPValue(self.sess, self.observation_spacem, self.n_envs, 1, None) with tf.variable_scope("loss", reuse=False): atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return catarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target cost advantage function cret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical cost observation = self.policy_pi.obs_ph action = self.policy_pi.pdtype.sample_placeholder([None]) kloldnew = old_policy.proba_distribution.kl(self.policy_pi.proba_distribution) ent = self.policy_pi.proba_distribution.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = self.entcoeff * meanent vferr = tf.reduce_mean(tf.square(self.policy_pi.value_flat - ret)) vcerr = tf.reduce_mean(tf.square(self.policy_pi.vcf_flat - cret)) # advantage * pnew / pold ratio = tf.exp(self.policy_pi.proba_distribution.logp(action) - old_policy.proba_distribution.logp(action)) surrgain = tf.reduce_mean(ratio * atarg) # Surrogate for cost function surrcost = tf.reduce_mean(ratio * catarg) optimgain = surrgain + entbonus # Include surr_cost in pi_objective optimgain -= penalty * surrcost optimgain /= (1 + penalty) # # Loss function for pi is negative of pi_objective # optimgain = -optimgain # Should we?? 
losses = [optimgain, meankl, entbonus, surrgain, meanent, surrcost] self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy", "surrcost"] dist = meankl all_var_list = tf_util.get_trainable_vars("model") var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name and "/vcf" not in v.name] # policy parameters vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name and "/vcf" not in v.name] # value parameters vcf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name and "/vf" not in v.name] # cost value parameters self.get_flat = tf_util.GetFlat(var_list, sess=self.sess) self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: var_size = tf_util.intprod(shape) tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape)) start += var_size gvp = tf.add_n([tf.reduce_sum(grad * tangent) for (grad, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111 # Fisher vector products fvp = tf_util.flatgrad(gvp, var_list) tf.summary.scalar('penalty_loss', penalty_loss) tf.summary.scalar('entropy_loss', meanent) tf.summary.scalar('policy_gradient_loss', optimgain) tf.summary.scalar('value_function_loss', surrgain) tf.summary.scalar('constraint_cost_function_loss', surrcost) tf.summary.scalar('approximate_kullback-leibler', meankl) tf.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent + surrcost + penalty_loss) self.assign_old_eq_new = \ tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(tf_util.get_globals_vars("oldpi"), tf_util.get_globals_vars("model"))]) self.compute_losses = tf_util.function([observation, old_policy.obs_ph, action, atarg, catarg], losses) self.compute_fvp = tf_util.function([flat_tangent, observation, old_policy.obs_ph, action, atarg, catarg], fvp) # Why need all inputs? Might for implementation easiness # self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret], # tf_util.flatgrad(vferr, vf_var_list)) # Why need old_policy.obs_ph? 
Doesn't seem to be used # self.compute_vcflossandgrad = tf_util.function([observation, old_policy.obs_ph, cret], # tf_util.flatgrad(vcerr, vcf_var_list)) self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret, cret], [tf_util.flatgrad(vferr, vf_var_list), tf_util.flatgrad(vcerr, vcf_var_list)]) self.compute_lagrangiangrad = tf_util.function([cur_cost_ph], tf_util.flatgrad(penalty_loss, [penalty_param])) @contextmanager def timed(msg): if self.rank == 0 and self.verbose >= 1: print(colorize(msg, color='magenta')) start_time = time.time() yield print(colorize("done in {:.3f} seconds".format((time.time() - start_time)), color='magenta')) else: yield def allmean(arr): assert isinstance(arr, np.ndarray) out = np.empty_like(arr) MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM) out /= self.nworkers return out tf_util.initialize(sess=self.sess) th_init = self.get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) self.set_from_flat(th_init) with tf.variable_scope("Adam_mpi", reuse=False): self.vfadam = MpiAdam(vf_var_list, sess=self.sess) if self.using_gail: self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess) self.d_adam.sync() self.vfadam.sync() # optimizer for constraint costs value function self.vcadam = MpiAdam(vcf_var_list, sess=self.sess) self.vcadam.sync() # optimizer for lagragian value of safe RL self.penaltyadam = MpiAdam([penalty_param], sess=self.sess) self.penaltyadam.sync() with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret)) tf.summary.scalar('discounted_costs', tf.reduce_mean(cret)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize)) tf.summary.scalar('advantage', tf.reduce_mean(atarg)) tf.summary.scalar('cost_advantage', tf.reduce_mean(catarg)) tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl)) if self.full_tensorboard_log: tf.summary.histogram('discounted_rewards', ret) tf.summary.histogram('discounted_rewards', cret) tf.summary.histogram('learning_rate', self.vf_stepsize) tf.summary.histogram('penalty_learning_rate', self.penalty_lr) tf.summary.histogram('advantage', atarg) tf.summary.histogram('cost_advantage', catarg) tf.summary.histogram('kl_clip_range', self.max_kl) if tf_util.is_image(self.observation_space): tf.summary.image('observation', observation) else: tf.summary.histogram('observation', observation) self.timed = timed self.allmean = allmean self.step = self.policy_pi.step self.proba_step = self.policy_pi.proba_step self.initial_state = self.policy_pi.initial_state self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi") if self.using_gail: self.params.extend(self.reward_giver.get_trainable_variables()) self.summary = tf.summary.merge_all() self.compute_lossandgrad = \ tf_util.function([observation, old_policy.obs_ph, action, atarg, catarg, ret, cret, cur_cost_ph], [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
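To make the penalty mechanics above concrete: the Lagrange multiplier is softplus(penalty_param), the penalized objective is (surrgain + entbonus - penalty * surrcost) / (1 + penalty), and penalty_loss = mean(-penalty_param * (episodic cost - cost_lim)). The NumPy sketch below (with made-up cost numbers; it is not part of the model code) shows how gradient descent on penalty_loss raises the multiplier while the measured cost exceeds the limit and lowers it once the constraint is satisfied.

# Standalone NumPy sketch of the Lagrangian penalty update; cost numbers are made up.
import numpy as np

def softplus(x):
    return np.log1p(np.exp(x))

# Start with penalty = softplus(penalty_param) == 1.0, mirroring the penalty_init handling above.
penalty_param = np.log(np.exp(1.0) - 1.0)
cost_lim = 25.0
learning_rate = 0.05

for mean_episode_cost in (40.0, 40.0, 10.0, 10.0):
    # penalty_loss = mean(-penalty_param * (episode_cost - cost_lim)),
    # so d(penalty_loss)/d(penalty_param) = -(mean_episode_cost - cost_lim).
    grad = -(mean_episode_cost - cost_lim)
    penalty_param -= learning_rate * grad   # gradient descent on penalty_loss
    print("cost=%.1f  penalty=%.3f" % (mean_episode_cost, softplus(penalty_param)))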
class PPO2(ActorCriticRLModel): """ Proximal Policy Optimization algorithm (GPU version). Paper: https://arxiv.org/abs/1707.06347 :param policy: (ActorCriticPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, CnnLstmPolicy, ...) :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) :param gamma: (float) Discount factor :param n_steps: (int) The number of steps to run for each environment per update (i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel) :param ent_coef: (float) Entropy coefficient for the loss calculation :param learning_rate: (float or callable) The learning rate, it can be a function :param vf_coef: (float) Value function coefficient for the loss calculation :param max_grad_norm: (float) The maximum value for the gradient clipping :param lam: (float) Factor for trade-off of bias vs variance for Generalized Advantage Estimator :param nminibatches: (int) Number of training minibatches per update. For recurrent policies, the number of environments run in parallel should be a multiple of nminibatches. :param noptepochs: (int) Number of epoch when optimizing the surrogate :param cliprange: (float or callable) Clipping parameter, it can be a function :param cliprange_vf: (float or callable) Clipping parameter for the value function, it can be a function. This is a parameter specific to the OpenAI implementation. If None is passed (default), then `cliprange` (that is used for the policy) will be used. IMPORTANT: this clipping depends on the reward scaling. To deactivate value function clipping (and recover the original PPO implementation), you have to pass a negative value (e.g. -1). :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation :param full_tensorboard_log: (bool) enable additional logging when using tensorboard WARNING: this logging can take a lot of space quickly :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). If None (default), use random seed. Note that if you want completely deterministic results, you must set `n_cpu_tf_sess` to 1. :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations If None, the number of cpu of the current machine will be used. 
""" def __init__(self, policy, env, gamma=0.99, n_steps=128, ent_coef=0.01, learning_rate=2.5e-4, vf_coef=0.5, max_grad_norm=0.5, lam=0.95, nminibatches=4, noptepochs=4, cliprange=0.2, cliprange_vf=None, verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None): super(PPO2, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True, _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs, seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.learning_rate = learning_rate self.cliprange = cliprange self.cliprange_vf = cliprange_vf self.n_steps = n_steps self.ent_coef = ent_coef self.vf_coef = vf_coef self.max_grad_norm = max_grad_norm self.gamma = gamma self.lam = lam self.nminibatches = nminibatches self.noptepochs = noptepochs self.tensorboard_log = tensorboard_log self.full_tensorboard_log = full_tensorboard_log # GAIL Params self.hidden_size_adversary = 100 self.adversary_entcoeff = 1e-3 self.expert_dataset = None self.g_step = 1 self.d_step = 1 self.d_stepsize = 3e-4 self.graph = None self.sess = None self.action_ph = None self.advs_ph = None self.rewards_ph = None self.old_neglog_pac_ph = None self.old_vpred_ph = None self.learning_rate_ph = None self.clip_range_ph = None self.entropy = None self.vf_loss = None self.pg_loss = None self.approxkl = None self.clipfrac = None self.params = None self._train = None self.loss_names = None self.train_model = None self.act_model = None self.step = None self.proba_step = None self.value = None self.initial_state = None self.n_batch = None self.summary = None self.episode_reward = None self.reward_giver = None self.using_gail = False if _init_setup_model: self.setup_model() def _get_pretrain_placeholders(self): policy = self.act_model if isinstance(self.action_space, gym.spaces.Discrete): return policy.obs_ph, self.action_ph, policy.policy return policy.obs_ph, self.action_ph, policy.deterministic_action def setup_model(self): from stable_baselines.gail.adversary import TransitionClassifier with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO2 model must be " \ "an instance of common.policies.ActorCriticPolicy." self.n_batch = self.n_envs * self.n_steps self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) if self.using_gail: self.reward_giver = TransitionClassifier(self.observation_space, self.action_space, self.hidden_size_adversary, normalize=False, entcoeff=self.adversary_entcoeff) n_batch_step = None n_batch_train = None if issubclass(self.policy, RecurrentActorCriticPolicy): assert self.n_envs % self.nminibatches == 0, "For recurrent policies, "\ "the number of environments run in parallel should be a multiple of nminibatches." 
n_batch_step = self.n_envs n_batch_train = self.n_batch // self.nminibatches act_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, n_batch_step, reuse=False, **self.policy_kwargs) with tf.variable_scope("train_model", reuse=True, custom_getter=tf_util.outer_scope_getter("train_model")): train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs // self.nminibatches, self.n_steps, n_batch_train, reuse=True, **self.policy_kwargs) with tf.variable_scope("loss", reuse=False): self.action_ph = train_model.pdtype.sample_placeholder([None], name="action_ph") self.advs_ph = tf.placeholder(tf.float32, [None], name="advs_ph") self.rewards_ph = tf.placeholder(tf.float32, [None], name="rewards_ph") self.old_neglog_pac_ph = tf.placeholder(tf.float32, [None], name="old_neglog_pac_ph") self.old_vpred_ph = tf.placeholder(tf.float32, [None], name="old_vpred_ph") self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph") self.clip_range_ph = tf.placeholder(tf.float32, [], name="clip_range_ph") neglogpac = train_model.proba_distribution.neglogp(self.action_ph) self.entropy = tf.reduce_mean(train_model.proba_distribution.entropy()) vpred = train_model.value_flat # Value function clipping: not present in the original PPO if self.cliprange_vf is None: # Default behavior (legacy from OpenAI baselines): # use the same clipping as for the policy self.clip_range_vf_ph = self.clip_range_ph self.cliprange_vf = self.cliprange elif isinstance(self.cliprange_vf, (float, int)) and self.cliprange_vf < 0: # Original PPO implementation: no value function clipping self.clip_range_vf_ph = None else: # Last possible behavior: clipping range # specific to the value function self.clip_range_vf_ph = tf.placeholder(tf.float32, [], name="clip_range_vf_ph") if self.clip_range_vf_ph is None: # No clipping vpred_clipped = train_model.value_flat else: # Clip the different between old and new value # NOTE: this depends on the reward scaling vpred_clipped = self.old_vpred_ph + \ tf.clip_by_value(train_model.value_flat - self.old_vpred_ph, - self.clip_range_vf_ph, self.clip_range_vf_ph) vf_losses1 = tf.square(vpred - self.rewards_ph) vf_losses2 = tf.square(vpred_clipped - self.rewards_ph) self.vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) ratio = tf.exp(self.old_neglog_pac_ph - neglogpac) pg_losses = -self.advs_ph * ratio pg_losses2 = -self.advs_ph * tf.clip_by_value(ratio, 1.0 - self.clip_range_ph, 1.0 + self.clip_range_ph) self.pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) self.approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - self.old_neglog_pac_ph)) self.clipfrac = tf.reduce_mean(tf.cast(tf.greater(tf.abs(ratio - 1.0), self.clip_range_ph), tf.float32)) loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef tf.summary.scalar('entropy_loss', self.entropy) tf.summary.scalar('policy_gradient_loss', self.pg_loss) tf.summary.scalar('value_function_loss', self.vf_loss) tf.summary.scalar('approximate_kullback-leibler', self.approxkl) tf.summary.scalar('clip_factor', self.clipfrac) tf.summary.scalar('loss', loss) with tf.variable_scope('model'): self.params = tf.trainable_variables() # Update params with GAIL params too if self.using_gail: self.params.extend(self.reward_giver.get_trainable_variables()) if self.full_tensorboard_log: for var in self.params: tf.summary.histogram(var.name, var) grads = tf.gradients(loss, self.params) if self.max_grad_norm is not None: grads, _grad_norm = 
tf.clip_by_global_norm(grads, self.max_grad_norm) grads = list(zip(grads, self.params)) trainer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph, epsilon=1e-5) self._train = trainer.apply_gradients(grads) self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac'] with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph)) tf.summary.scalar('clip_range', tf.reduce_mean(self.clip_range_ph)) if self.clip_range_vf_ph is not None: tf.summary.scalar('clip_range_vf', tf.reduce_mean(self.clip_range_vf_ph)) tf.summary.scalar('old_neglog_action_probabilty', tf.reduce_mean(self.old_neglog_pac_ph)) tf.summary.scalar('old_value_pred', tf.reduce_mean(self.old_vpred_ph)) if self.full_tensorboard_log: tf.summary.histogram('discounted_rewards', self.rewards_ph) tf.summary.histogram('learning_rate', self.learning_rate_ph) tf.summary.histogram('advantage', self.advs_ph) tf.summary.histogram('clip_range', self.clip_range_ph) tf.summary.histogram('old_neglog_action_probabilty', self.old_neglog_pac_ph) tf.summary.histogram('old_value_pred', self.old_vpred_ph) if tf_util.is_image(self.observation_space): tf.summary.image('observation', train_model.obs_ph) else: tf.summary.histogram('observation', train_model.obs_ph) self.train_model = train_model self.act_model = act_model self.step = act_model.step self.proba_step = act_model.proba_step self.value = act_model.value self.initial_state = act_model.initial_state tf.global_variables_initializer().run(session=self.sess) # pylint: disable=E1101 self.summary = tf.summary.merge_all() def _train_step(self, learning_rate, cliprange, obs, returns, masks, actions, values, neglogpacs, update, writer, states=None, cliprange_vf=None): """ Training of PPO2 Algorithm :param learning_rate: (float) learning rate :param cliprange: (float) Clipping factor :param obs: (np.ndarray) The current observation of the environment :param returns: (np.ndarray) the rewards :param masks: (np.ndarray) The last masks for done episodes (used in recurent policies) :param actions: (np.ndarray) the actions :param values: (np.ndarray) the values :param neglogpacs: (np.ndarray) Negative Log-likelihood probability of Actions :param update: (int) the current step iteration :param writer: (TensorFlow Summary.writer) the writer for tensorboard :param states: (np.ndarray) For recurrent policies, the internal state of the recurrent model :return: policy gradient loss, value function loss, policy entropy, approximation of kl divergence, updated clipping range, training update operation :param cliprange_vf: (float) Clipping factor for the value function """ advs = returns - values advs = (advs - advs.mean()) / (advs.std() + 1e-8) td_map = {self.train_model.obs_ph: obs, self.action_ph: actions, self.advs_ph: advs, self.rewards_ph: returns, self.learning_rate_ph: learning_rate, self.clip_range_ph: cliprange, self.old_neglog_pac_ph: neglogpacs, self.old_vpred_ph: values} if states is not None: td_map[self.train_model.states_ph] = states td_map[self.train_model.dones_ph] = masks if cliprange_vf is not None and cliprange_vf >= 0: td_map[self.clip_range_vf_ph] = cliprange_vf if states is None: update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 else: update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 if writer is not None: # 
run loss backprop with summary, but once every 10 runs save the metadata (memory, compute time, ...) if self.full_tensorboard_log and (1 + update) % 10 == 0: run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, policy_loss, value_loss, policy_entropy, approxkl, clipfrac, _ = self.sess.run( [self.summary, self.pg_loss, self.vf_loss, self.entropy, self.approxkl, self.clipfrac, self._train], td_map, options=run_options, run_metadata=run_metadata) writer.add_run_metadata(run_metadata, 'step%d' % (update * update_fac)) else: summary, policy_loss, value_loss, policy_entropy, approxkl, clipfrac, _ = self.sess.run( [self.summary, self.pg_loss, self.vf_loss, self.entropy, self.approxkl, self.clipfrac, self._train], td_map) writer.add_summary(summary, (update * update_fac)) else: policy_loss, value_loss, policy_entropy, approxkl, clipfrac, _ = self.sess.run( [self.pg_loss, self.vf_loss, self.entropy, self.approxkl, self.clipfrac, self._train], td_map) return policy_loss, value_loss, policy_entropy, approxkl, clipfrac def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) cliprange_vf = get_schedule_fn(self.cliprange_vf) new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) self.episode_reward = np.zeros((self.n_envs,)) ep_info_buf = deque(maxlen=100) t_first_start = time.time() n_updates = total_timesteps // self.n_batch for update in range(1, n_updates + 1): assert self.n_batch % self.nminibatches == 0 batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / n_updates lr_now = self.learning_rate(frac) cliprange_now = self.cliprange(frac) cliprange_vf_now = cliprange_vf(frac) # true_reward is the reward without discount obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run() self.num_timesteps += self.n_batch ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, writer=writer, update=timestep, cliprange_vf=cliprange_vf_now)) else: # recurrent version update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps) envs_per_batch = batch_size // self.n_steps for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_envs + epoch_num * 
self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_states = states[mb_env_inds] mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, update=timestep, writer=writer, states=mb_states, cliprange_vf=cliprange_vf_now)) loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() fps = int(self.n_batch / (t_now - t_start)) if writer is not None: self.episode_reward = total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("n_updates", update) logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.logkv('time_elapsed', t_start - t_first_start) for (loss_val, loss_name) in zip(loss_vals, self.loss_names): logger.logkv(loss_name, loss_val) logger.dumpkvs() if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break return self def save(self, save_path, cloudpickle=False): data = { "gamma": self.gamma, "n_steps": self.n_steps, "vf_coef": self.vf_coef, "ent_coef": self.ent_coef, "max_grad_norm": self.max_grad_norm, "learning_rate": self.learning_rate, "lam": self.lam, "nminibatches": self.nminibatches, "noptepochs": self.noptepochs, "cliprange": self.cliprange, "cliprange_vf": self.cliprange_vf, "verbose": self.verbose, "policy": self.policy, "observation_space": self.observation_space, "action_space": self.action_space, "n_envs": self.n_envs, "n_cpu_tf_sess": self.n_cpu_tf_sess, "seed": self.seed, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs } params_to_save = self.get_parameters() self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle)
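For comparison, here is a minimal usage sketch of PPO2 in the same stable-baselines 2.x style; since the model requires a vectorized environment, the example wraps a single Gym env in a DummyVecEnv. The environment and timestep budget are illustrative.

# Usage sketch (assumes stable-baselines 2.x); PPO2 requires a vectorized env.
import gym
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv

env = DummyVecEnv([lambda: gym.make('CartPole-v1')])
model = PPO2('MlpPolicy', env, n_steps=128, nminibatches=4, noptepochs=4,
             cliprange=0.2, verbose=1)
model.learn(total_timesteps=10000)
model.save("ppo2_cartpole")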
class TRPO(BaseRLModel): def __init__(self, policy, env, gamma=0.99, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, lam=0.98, entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, verbose=0, _init_setup_model=True): """ learns a TRPO policy using the given environment :param policy: (function (str, Gym Space, Gym Space, bool): MLPPolicy) policy generator :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) :param gamma: (float) the discount value :param timesteps_per_batch: (int) the number of timesteps to run per batch (horizon) :param max_kl: (float) the kullback leiber loss threshold :param cg_iters: (int) the number of iterations for the conjugate gradient calculation :param lam: (float) GAE factor :param entcoeff: (float) the weight for the entropy loss :param cg_damping: (float) the compute gradient dampening factor :param vf_stepsize: (float) the value function stepsize :param vf_iters: (int) the value function's number iterations for learning :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance """ super(TRPO, self).__init__(policy=policy, env=env, requires_vec_env=False, verbose=verbose) self.using_gail = False self.timesteps_per_batch = timesteps_per_batch self.cg_iters = cg_iters self.cg_damping = cg_damping self.gamma = gamma self.lam = lam self.max_kl = max_kl self.vf_iters = vf_iters self.vf_stepsize = vf_stepsize self.entcoeff = entcoeff # GAIL Params self.pretrained_weight = None self.hidden_size_adversary = 100 self.adversary_entcoeff = 1e-3 self.expert_dataset = None self.save_per_iter = 1 self.checkpoint_dir = "/tmp/gail/ckpt/" self.g_step = 1 self.d_step = 1 self.task_name = "task_name" self.d_stepsize = 3e-4 self.graph = None self.sess = None self.policy_pi = None self.loss_names = None self.assign_old_eq_new = None self.compute_losses = None self.compute_lossandgrad = None self.compute_fvp = None self.compute_vflossandgrad = None self.d_adam = None self.vfadam = None self.get_flat = None self.set_from_flat = None self.timed = None self.allmean = None self.nworkers = None self.rank = None self.reward_giver = None self.step = None self.proba_step = None self.initial_state = None self.params = None if _init_setup_model: self.setup_model() def setup_model(self): # prevent import loops from stable_baselines.gail.adversary import TransitionClassifier with SetVerbosity(self.verbose): self.nworkers = MPI.COMM_WORLD.Get_size() self.rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.single_threaded_session(graph=self.graph) if self.using_gail: self.reward_giver = TransitionClassifier( self.env, self.hidden_size_adversary, entcoeff=self.adversary_entcoeff) # Construct network for new policy with tf.variable_scope("pi", reuse=False): self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False) # Network for old policy with tf.variable_scope("oldpi", reuse=False): old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return observation = self.policy_pi.obs_ph action = self.policy_pi.pdtype.sample_placeholder([None]) 
                kloldnew = old_policy.proba_distribution.kl(self.policy_pi.proba_distribution)
                ent = self.policy_pi.proba_distribution.entropy()
                meankl = tf.reduce_mean(kloldnew)
                meanent = tf.reduce_mean(ent)
                entbonus = self.entcoeff * meanent

                vferr = tf.reduce_mean(tf.square(self.policy_pi.value_fn[:, 0] - ret))

                # advantage * pnew / pold
                ratio = tf.exp(self.policy_pi.proba_distribution.logp(action) -
                               old_policy.proba_distribution.logp(action))
                surrgain = tf.reduce_mean(ratio * atarg)

                optimgain = surrgain + entbonus
                losses = [optimgain, meankl, entbonus, surrgain, meanent]
                self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

                dist = meankl

                all_var_list = tf_util.get_trainable_vars("pi")
                var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name]
                vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name]
                self.vfadam = MpiAdam(vf_var_list, sess=self.sess)

                self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess)

                if self.using_gail:
                    self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables())

                klgrads = tf.gradients(dist, var_list)
                flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
                shapes = [var.get_shape().as_list() for var in var_list]
                start = 0
                tangents = []
                for shape in shapes:
                    var_size = tf_util.intprod(shape)
                    tangents.append(tf.reshape(flat_tangent[start:start + var_size], shape))
                    start += var_size
                gvp = tf.add_n([tf.reduce_sum(grad * tangent)
                                for (grad, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
                fvp = tf_util.flatgrad(gvp, var_list)

                self.assign_old_eq_new = tf_util.function(
                    [], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                     zipsame(tf_util.get_globals_vars("oldpi"),
                                             tf_util.get_globals_vars("pi"))])
                self.compute_losses = tf_util.function(
                    [observation, old_policy.obs_ph, action, atarg], losses)
                self.compute_lossandgrad = tf_util.function(
                    [observation, old_policy.obs_ph, action, atarg],
                    losses + [tf_util.flatgrad(optimgain, var_list)])
                self.compute_fvp = tf_util.function(
                    [flat_tangent, observation, old_policy.obs_ph, action, atarg], fvp)
                self.compute_vflossandgrad = tf_util.function(
                    [observation, old_policy.obs_ph, ret],
                    tf_util.flatgrad(vferr, vf_var_list))

                @contextmanager
                def timed(msg):
                    if self.rank == 0 and self.verbose >= 1:
                        print(colorize(msg, color='magenta'))
                        start_time = time.time()
                        yield
                        print(colorize("done in %.3f seconds" % (time.time() - start_time),
                                       color='magenta'))
                    else:
                        yield

                def allmean(arr):
                    assert isinstance(arr, np.ndarray)
                    out = np.empty_like(arr)
                    MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                    out /= self.nworkers
                    return out

                tf_util.initialize(sess=self.sess)

                th_init = self.get_flat()
                MPI.COMM_WORLD.Bcast(th_init, root=0)
                self.set_from_flat(th_init)

                if self.using_gail:
                    self.d_adam.sync()
                self.vfadam.sync()

                self.timed = timed
                self.allmean = allmean

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                self.params = find_trainable_variables("pi")
                if self.using_gail:
                    self.params.extend(self.reward_giver.get_trainable_variables())

    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100):
        with SetVerbosity(self.verbose):
            self._setup_learn(seed)

            with self.sess.as_default():
                seg_gen = traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_batch,
                                                 reward_giver=self.reward_giver, gail=self.using_gail)

                episodes_so_far = 0
                timesteps_so_far = 0
                iters_so_far = 0
t_start = time.time() lenbuffer = deque( maxlen=40) # rolling buffer for episode lengths rewbuffer = deque( maxlen=40) # rolling buffer for episode rewards true_rewbuffer = None if self.using_gail: true_rewbuffer = deque(maxlen=40) # Stats not used for now # g_loss_stats = Stats(loss_names) # d_loss_stats = Stats(reward_giver.loss_name) # ep_stats = Stats(["True_rewards", "Rewards", "Episode_length"]) # if provide pretrained weight if self.pretrained_weight is not None: tf_util.load_state( self.pretrained_weight, var_list=tf_util.get_globals_vars("pi"), sess=self.sess) while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break logger.log("********** Iteration %i ************" % iters_so_far) def fisher_vector_product(vec): return self.allmean( self.compute_fvp( vec, *fvpargs, sess=self.sess)) + self.cg_damping * vec # ------------------ Update G ------------------ logger.log("Optimizing Policy...") # g_step = 1 when not using GAIL mean_losses = None vpredbefore = None tdlamret = None observation = None action = None seg = None for _ in range(self.g_step): with self.timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, self.gamma, self.lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) observation, action, atarg, tdlamret = seg["ob"], seg[ "ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg[ "vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate args = seg["ob"], seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] self.assign_old_eq_new(sess=self.sess) with self.timed("computegrad"): *lossbefore, grad = self.compute_lossandgrad( *args, sess=self.sess) lossbefore = self.allmean(np.array(lossbefore)) grad = self.allmean(grad) if np.allclose(grad, 0): logger.log("Got zero gradient. not updating") else: with self.timed("cg"): stepdir = conjugate_gradient( fisher_vector_product, grad, cg_iters=self.cg_iters, verbose=self.rank == 0 and self.verbose >= 1) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot( fisher_vector_product(stepdir)) # abs(shs) to avoid taking square root of negative values lagrange_multiplier = np.sqrt( abs(shs) / self.max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lagrange_multiplier expectedimprove = grad.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = self.get_flat() thnew = None for _ in range(10): thnew = thbefore + fullstep * stepsize self.set_from_flat(thnew) mean_losses = surr, kl_loss, *_ = self.allmean( np.array( self.compute_losses(*args, sess=self.sess))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(mean_losses).all(): logger.log( "Got non-finite value of losses -- bad!" ) elif kl_loss > self.max_kl * 1.5: logger.log( "violated KL constraint. shrinking step." ) elif improve < 0: logger.log( "surrogate didn't improve. shrinking step." 
) else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") self.set_from_flat(thbefore) if self.nworkers > 1 and iters_so_far % 20 == 0: # list of tuples paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), self.vfadam.getflat().sum())) assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) with self.timed("vf"): for _ in range(self.vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=128): grad = self.allmean( self.compute_vflossandgrad( mbob, mbob, mbret, sess=self.sess)) self.vfadam.update(grad, self.vf_stepsize) for (loss_name, loss_val) in zip(self.loss_names, mean_losses): logger.record_tabular(loss_name, loss_val) logger.record_tabular( "ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) if self.using_gail: # ------------------ Update D ------------------ logger.log("Optimizing Discriminator...") logger.log(fmt_row(13, self.reward_giver.loss_name)) ob_expert, ac_expert = self.expert_dataset.get_next_batch( len(observation)) batch_size = len(observation) // self.d_step d_losses = [ ] # list of tuples, each of which gives the loss for a minibatch for ob_batch, ac_batch in dataset.iterbatches( (observation, action), include_final_partial_batch=False, batch_size=batch_size): ob_expert, ac_expert = self.expert_dataset.get_next_batch( len(ob_batch)) # update running mean/std for reward_giver if hasattr(self.reward_giver, "obs_rms"): self.reward_giver.obs_rms.update( np.concatenate((ob_batch, ob_expert), 0)) *newlosses, grad = self.reward_giver.lossandgrad( ob_batch, ac_batch, ob_expert, ac_expert) self.d_adam.update(self.allmean(grad), self.d_stepsize) d_losses.append(newlosses) logger.log(fmt_row(13, np.mean(d_losses, axis=0))) lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather( lrlocal) # list of tuples lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs)) true_rewbuffer.extend(true_rets) else: lrlocal = (seg["ep_lens"], seg["ep_rets"] ) # local values listoflrpairs = MPI.COMM_WORLD.allgather( lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) if self.using_gail: logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += seg["total_timestep"] iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - t_start) if self.verbose >= 1 and self.rank == 0: logger.dump_tabular() return self def predict(self, observation, state=None, mask=None): if state is None: state = self.initial_state if mask is None: mask = [False for _ in range(self.n_envs)] observation = np.array(observation).reshape( (-1, ) + self.observation_space.shape) actions, _, states, _ = self.step(observation, state, mask) return actions, states def action_probability(self, observation, state=None, mask=None): if state is None: state = self.initial_state if mask is None: mask = [False for _ in range(self.n_envs)] observation = np.array(observation).reshape( (-1, ) + self.observation_space.shape) return self.proba_step(observation, state, mask) def save(self, save_path): data = { "gamma": self.gamma, 
"timesteps_per_batch": self.timesteps_per_batch, "max_kl": self.max_kl, "cg_iters": self.cg_iters, "lam": self.lam, "entcoeff": self.entcoeff, "cg_damping": self.cg_damping, "vf_stepsize": self.vf_stepsize, "vf_iters": self.vf_iters, "pretrained_weight": self.pretrained_weight, "reward_giver": self.reward_giver, "expert_dataset": self.expert_dataset, "save_per_iter": self.save_per_iter, "checkpoint_dir": self.checkpoint_dir, "g_step": self.g_step, "d_step": self.d_step, "task_name": self.task_name, "d_stepsize": self.d_stepsize, "using_gail": self.using_gail, "verbose": self.verbose, "policy": self.policy, "observation_space": self.observation_space, "action_space": self.action_space, "n_envs": self.n_envs, "_vectorize_action": self._vectorize_action } params = self.sess.run(self.params) self._save_to_file(save_path, data=data, params=params) @classmethod def load(cls, load_path, env=None, **kwargs): data, params = cls._load_from_file(load_path) model = cls(policy=data["policy"], env=None, _init_setup_model=False) model.__dict__.update(data) model.__dict__.update(kwargs) model.set_env(env) model.setup_model() restores = [] for param, loaded_p in zip(model.params, params): restores.append(param.assign(loaded_p)) model.sess.run(restores) return model
def setup_model(self): # prevent import loops from stable_baselines.gail.adversary import TransitionClassifier from stable_baselines.mdal.adversary import TabularAdversaryTF, NeuralAdversaryTRPO with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the MDPO model must be " \ "an instance of common.policies.ActorCriticPolicy." self.nworkers = MPI.COMM_WORLD.Get_size() self.rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.single_threaded_session(graph=self.graph) # self._setup_learn(self.seed) self._setup_learn() if self.using_gail: self.reward_giver = TransitionClassifier(self.observation_space, self.action_space, self.hidden_size_adversary, entcoeff=self.adversary_entcoeff) elif self.using_mdal: if self.neural: self.reward_giver = NeuralAdversaryTRPO(self.sess, self.observation_space, self.action_space, self.hidden_size_adversary, entcoeff=self.adversary_entcoeff) else: self.reward_giver = TabularAdversaryTF(self.sess, self.observation_space, self.action_space, self.hidden_size_adversary, entcoeff=self.adversary_entcoeff, expert_features=self.expert_dataset.successor_features, exploration_bonus=self.exploration_bonus, bonus_coef=self.bonus_coef, t_c=self.t_c, is_action_features=self.is_action_features) # Construct network for new policy self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for old policy with tf.variable_scope("oldpi", reuse=False): self.old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for fitting closed form with tf.variable_scope("closedpi", reuse=False): self.closed_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) with tf.variable_scope("loss", reuse=False): self.atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) self.vtarg = tf.placeholder(dtype=tf.float32, shape=[None]) self.ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name="learning_rate_ph") self.outer_learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name="outer_learning_rate_ph") self.old_vpred_ph = tf.placeholder(dtype=tf.float32, shape=[None], name="old_vpred_ph") self.clip_range_vf_ph = tf.placeholder(dtype=tf.float32, shape=[], name="clip_range_ph") observation = self.policy_pi.obs_ph self.action = self.policy_pi.pdtype.sample_placeholder([None]) if self.tsallis_q == 1.0: kloldnew = self.policy_pi.proba_distribution.kl(self.old_policy.proba_distribution) ent = self.policy_pi.proba_distribution.entropy() meankl = tf.reduce_mean(kloldnew) else: logp_pi = self.policy_pi.proba_distribution.logp(self.action) logp_pi_old = self.old_policy.proba_distribution.logp(self.action) ent = self.policy_pi.proba_distribution.entropy() #kloldnew = self.policy_pi.proba_distribution.kl_tsallis(self.old_policy.proba_distribution, self.tsallis_q) tsallis_q = 2.0 - self.tsallis_q meankl = tf.reduce_mean(tf_log_q(tf.exp(logp_pi), tsallis_q) - tf_log_q(tf.exp(logp_pi_old), tsallis_q)) #tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = self.entcoeff * meanent if self.cliprange_vf is None: vpred_clipped = self.policy_pi.value_flat else: vpred_clipped = 
self.old_vpred_ph + \ tf.clip_by_value(self.policy_pi.value_flat - self.old_vpred_ph, - self.clip_range_vf_ph, self.clip_range_vf_ph) vf_losses1 = tf.square(self.policy_pi.value_flat - self.ret) vf_losses2 = tf.square(vpred_clipped - self.ret) vferr = tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # advantage * pnew / pold ratio = tf.exp(self.policy_pi.proba_distribution.logp(self.action) - self.old_policy.proba_distribution.logp(self.action)) if self.method == "multistep-SGD": surrgain = tf.reduce_mean(ratio * self.atarg) - meankl / self.learning_rate_ph elif self.method == "closedreverse-KL": surrgain = tf.reduce_mean(tf.exp(self.atarg) * self.policy_pi.proba_distribution.logp(self.action)) else: policygain = tf.reduce_mean(tf.exp(self.atarg) * tf.log(self.closed_policy.proba_distribution.mean)) surrgain = tf.reduce_mean(ratio * self.atarg) - tf.reduce_mean(self.learning_rate_ph * ratio * self.policy_pi.proba_distribution.logp(self.action)) optimgain = surrgain #+ entbonus - self.learning_rate_ph * meankl losses = [optimgain, meankl, entbonus, surrgain, meanent] self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = tf_util.get_trainable_vars("model") var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name] vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name] print("policy vars", var_list) all_closed_var_list = tf_util.get_trainable_vars("closedpi") closed_var_list = [v for v in all_closed_var_list if "/vf" not in v.name and "/q" not in v.name] self.get_flat = tf_util.GetFlat(var_list, sess=self.sess) self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: var_size = tf_util.intprod(shape) tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape)) start += var_size gvp = tf.add_n([tf.reduce_sum(grad * tangent) for (grad, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111 fvp = tf_util.flatgrad(gvp, var_list) # tf.summary.scalar('entropy_loss', meanent) # tf.summary.scalar('policy_gradient_loss', optimgain) # tf.summary.scalar('value_function_loss', surrgain) # tf.summary.scalar('approximate_kullback-leibler', meankl) # tf.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent) self.assign_old_eq_new = \ tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(tf_util.get_globals_vars("oldpi"), tf_util.get_globals_vars("model"))]) self.compute_losses = tf_util.function([observation, self.old_policy.obs_ph, self.action, self.atarg, self.learning_rate_ph, self.vtarg], losses) self.compute_fvp = tf_util.function([flat_tangent, observation, self.old_policy.obs_ph, self.action, self.atarg], fvp) self.compute_vflossandgrad = tf_util.function([observation, self.old_policy.obs_ph, self.ret, self.old_vpred_ph, self.clip_range_vf_ph], tf_util.flatgrad(vferr, vf_var_list)) grads = tf.gradients(-optimgain, var_list) grads, _grad_norm = tf.clip_by_global_norm(grads, 0.5) trainer = tf.train.AdamOptimizer(learning_rate=self.outer_learning_rate_ph, epsilon=1e-5) # trainer = tf.train.AdamOptimizer(learning_rate=3e-4, epsilon=1e-5) grads = list(zip(grads, var_list)) self._train = trainer.apply_gradients(grads) @contextmanager def timed(msg): if self.rank == 0 and self.verbose >= 1: # 
                        # print(colorize(msg, color='magenta'))
                        # start_time = time.time()
                        yield
                        # print(colorize("done in {:.3f} seconds".format((time.time() - start_time)),
                        #                color='magenta'))
                    else:
                        yield

                def allmean(arr):
                    assert isinstance(arr, np.ndarray)
                    out = np.empty_like(arr)
                    MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                    out /= self.nworkers
                    return out

                tf_util.initialize(sess=self.sess)

                th_init = self.get_flat()
                MPI.COMM_WORLD.Bcast(th_init, root=0)
                self.set_from_flat(th_init)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                    if self.using_gail or self.using_mdal:
                        self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess)
                        self.d_adam.sync()
                    self.vfadam.sync()

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.ret))
                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(self.atarg))
                    tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards', self.ret)
                        tf.summary.histogram('learning_rate', self.vf_stepsize)
                        tf.summary.histogram('advantage', self.atarg)
                        tf.summary.histogram('kl_clip_range', self.max_kl)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', observation)
                        else:
                            tf.summary.histogram('observation', observation)

                self.timed = timed
                self.allmean = allmean

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state
                self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi")

                if self.using_gail:
                    self.params.extend(self.reward_giver.get_trainable_variables())

                self.summary = tf.summary.merge_all()

                self.compute_lossandgrad = \
                    tf_util.function([observation, self.old_policy.obs_ph, self.action, self.atarg,
                                      self.ret, self.learning_rate_ph, self.vtarg,
                                      self.closed_policy.obs_ph],
                                     [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
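Both the TRPO and MDPO graphs expose a Fisher-vector product through compute_fvp; the natural-gradient direction is then obtained downstream by solving F x = g with conjugate gradient, which only ever touches F through such products. A plain NumPy sketch of that kind of solver (helper name and defaults are illustrative, not the exact stable-baselines implementation):

import numpy as np

def conjugate_gradient(fvp_fn, b, cg_iters=10, residual_tol=1e-10):
    # Solve F x = b where F is accessed only through fvp_fn(v) = F @ v.
    x = np.zeros_like(b)
    r = b.copy()            # residual b - F x (x starts at zero)
    p = b.copy()            # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        fvp = fvp_fn(p)
        alpha = rdotr / p.dot(fvp)
        x += alpha * p
        r -= alpha * fvp
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x

Because only matrix-vector products are required, the Fisher matrix is never formed explicitly, which is what keeps the natural-gradient step tractable for large policy networks.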