def setup_model(self):
    with SetVerbosity(self.verbose):
        assert not isinstance(self.action_space, gym.spaces.Box), \
            "Error: DQN cannot output a gym.spaces.Box action space."

        # If the policy is wrapped in functools.partial (e.g. to disable dueling)
        # unwrap it to check the class type
        if isinstance(self.policy, partial):
            test_policy = self.policy.func
        else:
            test_policy = self.policy
        assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \
                                                   "an instance of DQNPolicy."

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.set_random_seed(self.seed)
            self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

            optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)

            self.act, self._train_step, self.update_target, self.step_model = build_train(
                q_func=partial(self.policy, **self.policy_kwargs),
                ob_space=self.observation_space,
                ac_space=self.action_space,
                optimizer=optimizer,
                gamma=self.gamma,
                grad_norm_clipping=10,
                param_noise=self.param_noise,
                sess=self.sess,
                full_tensorboard_log=self.full_tensorboard_log,
                double_q=self.double_q)

            self.proba_step = self.step_model.proba_step
            self.params = tf_util.get_trainable_vars("deepq")

            # Initialize the parameters and copy them to the target network.
            tf_util.initialize(self.sess)
            self.update_target(sess=self.sess)

            self.summary = tf.summary.merge_all()
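# Usage sketch (illustrative, not part of this module): the graph built above is normally
# created from the constructor and then driven through the public DQN API. The snippet assumes
# the upstream stable_baselines package layout; adjust import paths for this fork.
#
#     from stable_baselines import DQN
#     from stable_baselines.deepq.policies import MlpPolicy
#
#     model = DQN(MlpPolicy, "CartPole-v1", double_q=True, prioritized_replay=True, verbose=1)
#     model.learn(total_timesteps=100000)
#     model.save("dqn_cartpole")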
def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="TD3", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) # Initial learning rate current_lr = self.learning_rate(1) start_time = time.time() episode_rewards = [0.0] episode_successes = [] if self.action_noise is not None: self.action_noise.reset() obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() n_updates = 0 infos_values = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() for step in range(total_timesteps): # Before training starts, randomly sample actions # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) if self.num_timesteps < self.learning_starts or np.random.rand( ) < self.random_exploration: # actions sampled from action space are from range specific to the environment # but algorithm operates on tanh-squashed actions therefore simple scaling is used unscaled_action = self.env.action_space.sample() action = scale_action(self.action_space, unscaled_action) else: action = self.policy_tf.step(obs[None]).flatten() # Add noise to the action, as the policy # is deterministic, this is required for exploration if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape new_obs, reward, done, info = self.env.step(unscaled_action) self.num_timesteps += 1 # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs( ).squeeze() reward_ = self._vec_normalize_env.get_original_reward( ).squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, reward # Store transition in the replay buffer. 
self.replay_buffer.add(obs_, action, reward_, new_obs_, float(done)) obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ # Retrieve reward and episode length if using Monitor wrapper maybe_ep_info = info.get('episode') if maybe_ep_info is not None: self.ep_info_buf.extend([maybe_ep_info]) if writer is not None: # Write reward per episode to tensorboard ep_reward = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger( self.episode_reward, ep_reward, ep_done, writer, self.num_timesteps) if self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): # Break if the warmup phase is not over # or if there are not enough samples in the replay buffer if not self.replay_buffer.can_sample(self.batch_size) \ or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate frac = 1.0 - step / total_timesteps current_lr = self.learning_rate(frac) # Update policy and critics (q functions) # Note: the policy is updated less frequently than the Q functions # this is controlled by the `policy_delay` parameter mb_infos_vals.append( self._train_step(step, writer, current_lr, (step + grad_step) % self.policy_delay == 0)) # Log losses and entropy, useful for monitor training if len(mb_infos_vals) > 0: infos_values = np.mean(mb_infos_vals, axis=0) callback.on_rollout_start() episode_rewards[-1] += reward_ if done: if self.action_noise is not None: self.action_noise.reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if len(episode_rewards[-101:-1]) == 0: mean_reward = -np.inf else: mean_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) # Display training infos if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) logger.logkv("episodes", num_episodes) logger.logkv("mean 100 episode reward", mean_reward) if len(self.ep_info_buf) > 0 and len( self.ep_info_buf[0]) > 0: logger.logkv( 'ep_rewmean', safe_mean([ ep_info['r'] for ep_info in self.ep_info_buf ])) logger.logkv( 'eplenmean', safe_mean([ ep_info['l'] for ep_info in self.ep_info_buf ])) logger.logkv("n_updates", n_updates) logger.logkv("current_lr", current_lr) logger.logkv("fps", fps) logger.logkv('time_elapsed', int(time.time() - start_time)) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] callback.on_training_end() return self
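# Usage sketch (illustrative): a typical way to drive the TD3 learn() loop above, assuming the
# upstream stable_baselines API (the env and noise values below are placeholders):
#
#     import numpy as np
#     from stable_baselines import TD3
#     from stable_baselines.ddpg.noise import NormalActionNoise
#
#     n_actions = env.action_space.shape[-1]
#     action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
#     model = TD3('MlpPolicy', env, action_noise=action_noise, learning_starts=10000, verbose=1)
#     model.learn(total_timesteps=1000000, log_interval=10)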
def setup_model(self):
    with SetVerbosity(self.verbose):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.set_random_seed(self.seed)
            self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

            self.replay_buffer = ReplayBuffer(self.buffer_size)

            with tf.variable_scope("input", reuse=False):
                # Create policy and target TF objects
                self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                             **self.policy_kwargs)
                self.target_policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                                    **self.policy_kwargs)

                # Initialize Placeholders
                self.observations_ph = self.policy_tf.obs_ph
                # Normalized observation for pixels
                self.processed_obs_ph = self.policy_tf.processed_obs
                self.next_observations_ph = self.target_policy_tf.obs_ph
                self.processed_next_obs_ph = self.target_policy_tf.processed_obs
                self.action_target = self.target_policy_tf.action_ph
                self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals')
                self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
                self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape,
                                                 name='actions')
                self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")

            with tf.variable_scope("model", reuse=False):
                # Create the policy
                self.policy_out = policy_out = self.policy_tf.make_actor(self.processed_obs_ph)
                # Use two Q-functions to improve performance by reducing overestimation bias
                qf1, qf2 = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph)
                # Q value when following the current policy
                qf1_pi, _ = self.policy_tf.make_critics(self.processed_obs_ph, policy_out, reuse=True)

            with tf.variable_scope("target", reuse=False):
                # Create target networks
                target_policy_out = self.target_policy_tf.make_actor(self.processed_next_obs_ph)
                # Target policy smoothing, by adding clipped noise to target actions
                target_noise = tf.random_normal(tf.shape(target_policy_out), stddev=self.target_policy_noise)
                target_noise = tf.clip_by_value(target_noise, -self.target_noise_clip, self.target_noise_clip)
                # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh)
                noisy_target_action = tf.clip_by_value(target_policy_out + target_noise, -1, 1)
                # Q values when following the target policy
                qf1_target, qf2_target = self.target_policy_tf.make_critics(self.processed_next_obs_ph,
                                                                            noisy_target_action)

            with tf.variable_scope("loss", reuse=False):
                # Take the min of the two target Q-Values (clipped Double-Q Learning)
                min_qf_target = tf.minimum(qf1_target, qf2_target)

                # Targets for Q value regression
                q_backup = tf.stop_gradient(
                    self.rewards_ph +
                    (1 - self.terminals_ph) * self.gamma * min_qf_target
                )

                # Compute Q-Function loss
                qf1_loss = tf.reduce_mean((q_backup - qf1) ** 2)
                qf2_loss = tf.reduce_mean((q_backup - qf2) ** 2)

                qvalues_losses = qf1_loss + qf2_loss

                # Policy loss: maximise q value
                self.policy_loss = policy_loss = -tf.reduce_mean(qf1_pi)

                # Policy train op
                # will be called only every n training steps,
                # where n is the policy delay
                policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                policy_train_op = policy_optimizer.minimize(policy_loss,
                                                            var_list=tf_util.get_trainable_vars('model/pi'))
                self.policy_train_op = policy_train_op

                # Q Values optimizer
                qvalues_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                qvalues_params = tf_util.get_trainable_vars('model/values_fn/')

                # Q Values and policy target params
                source_params = tf_util.get_trainable_vars("model/")
                target_params = tf_util.get_trainable_vars("target/")

                # Polyak averaging for target variables
                self.target_ops = [
                    tf.assign(target, (1 - self.tau) * target + self.tau * source)
                    for target, source in zip(target_params, source_params)
                ]

                # Initializing target to match source variables
                target_init_op = [
                    tf.assign(target, source)
                    for target, source in zip(target_params, source_params)
                ]

                train_values_op = qvalues_optimizer.minimize(qvalues_losses, var_list=qvalues_params)

                self.infos_names = ['qf1_loss', 'qf2_loss']
                # All ops to call during one training step
                self.step_ops = [qf1_loss, qf2_loss, qf1, qf2, train_values_op]

                # Monitor losses and entropy in tensorboard
                tf.summary.scalar('policy_loss', policy_loss)
                tf.summary.scalar('qf1_loss', qf1_loss)
                tf.summary.scalar('qf2_loss', qf2_loss)
                tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph))

            # Retrieve parameters that must be saved
            self.params = tf_util.get_trainable_vars("model")
            self.target_params = tf_util.get_trainable_vars("target/")

            # Initialize Variables and target network
            with self.sess.as_default():
                self.sess.run(tf.global_variables_initializer())
                self.sess.run(target_init_op)

            self.summary = tf.summary.merge_all()
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="TRPO", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() with self.sess.as_default(): callback.on_training_start(locals(), globals()) seg_gen = traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_batch, reward_giver=self.reward_giver, gail=self.using_gail, callback=callback) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 t_start = time.time() len_buffer = deque(maxlen=40) # rolling buffer for episode lengths reward_buffer = deque(maxlen=40) # rolling buffer for episode rewards true_reward_buffer = None if self.using_gail: true_reward_buffer = deque(maxlen=40) self._initialize_dataloader() # Stats not used for now # TODO: replace with normal tb logging # g_loss_stats = Stats(loss_names) # d_loss_stats = Stats(reward_giver.loss_name) # ep_stats = Stats(["True_rewards", "Rewards", "Episode_length"]) while True: if timesteps_so_far >= total_timesteps: break logger.log("********** Iteration %i ************" % iters_so_far) def fisher_vector_product(vec): return self.allmean(self.compute_fvp(vec, *fvpargs, sess=self.sess)) + self.cg_damping * vec # ------------------ Update G ------------------ logger.log("Optimizing Policy...") # g_step = 1 when not using GAIL mean_losses = None vpredbefore = None tdlamret = None observation = None action = None seg = None for k in range(self.g_step): with self.timed("sampling"): seg = seg_gen.__next__() # Stop training early (triggered by the callback) if not seg.get('continue_training', True): # pytype: disable=attribute-error break add_vtarg_and_adv(seg, self.gamma, self.lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) observation, action = seg["observations"], seg["actions"] atarg, tdlamret = seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before update atarg = (atarg - atarg.mean()) / (atarg.std() + 1e-8) # standardized advantage function estimate # true_rew is the reward without discount if writer is not None: total_episode_reward_logger(self.episode_reward, seg["true_rewards"].reshape( (self.n_envs, -1)), seg["dones"].reshape((self.n_envs, -1)), writer, self.num_timesteps) args = seg["observations"], seg["observations"], seg["actions"], atarg # Subsampling: see p40-42 of John Schulman thesis # http://joschu.net/docs/thesis.pdf fvpargs = [arr[::5] for arr in args] self.assign_old_eq_new(sess=self.sess) with self.timed("computegrad"): steps = self.num_timesteps + (k + 1) * (seg["total_timestep"] / self.g_step) run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() if self.full_tensorboard_log else None # run loss backprop with summary, and save the metadata (memory, compute time, ...) if writer is not None: summary, grad, *lossbefore = self.compute_lossandgrad(*args, tdlamret, sess=self.sess, options=run_options, run_metadata=run_metadata) if self.full_tensorboard_log: writer.add_run_metadata(run_metadata, 'step%d' % steps) writer.add_summary(summary, steps) else: _, grad, *lossbefore = self.compute_lossandgrad(*args, tdlamret, sess=self.sess, options=run_options, run_metadata=run_metadata) lossbefore = self.allmean(np.array(lossbefore)) grad = self.allmean(grad) if np.allclose(grad, 0): logger.log("Got zero gradient. 
not updating") else: with self.timed("conjugate_gradient"): stepdir = conjugate_gradient(fisher_vector_product, grad, cg_iters=self.cg_iters, verbose=self.rank == 0 and self.verbose >= 1) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) # abs(shs) to avoid taking square root of negative values lagrange_multiplier = np.sqrt(abs(shs) / self.max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lagrange_multiplier expectedimprove = grad.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = self.get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize self.set_from_flat(thnew) mean_losses = surr, kl_loss, *_ = self.allmean( np.array(self.compute_losses(*args, sess=self.sess))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(mean_losses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl_loss > self.max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") self.set_from_flat(thbefore) if self.nworkers > 1 and iters_so_far % 20 == 0: # list of tuples paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), self.vfadam.getflat().sum())) assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (loss_name, loss_val) in zip(self.loss_names, mean_losses): logger.record_tabular(loss_name, loss_val) with self.timed("vf"): for _ in range(self.vf_iters): # NOTE: for recurrent policies, use shuffle=False? for (mbob, mbret) in dataset.iterbatches((seg["observations"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=128, shuffle=True): grad = self.allmean(self.compute_vflossandgrad(mbob, mbob, mbret, sess=self.sess)) self.vfadam.update(grad, self.vf_stepsize) # Stop training early (triggered by the callback) if not seg.get('continue_training', True): # pytype: disable=attribute-error break logger.record_tabular("explained_variance_tdlam_before", explained_variance(vpredbefore, tdlamret)) if self.using_gail: # ------------------ Update D ------------------ logger.log("Optimizing Discriminator...") logger.log(fmt_row(13, self.reward_giver.loss_name)) assert len(observation) == self.timesteps_per_batch batch_size = self.timesteps_per_batch // self.d_step # NOTE: uses only the last g step for observation d_losses = [] # list of tuples, each of which gives the loss for a minibatch # NOTE: for recurrent policies, use shuffle=False? 
for ob_batch, ac_batch in dataset.iterbatches((observation, action), include_final_partial_batch=False, batch_size=batch_size, shuffle=True): ob_expert, ac_expert = self.expert_dataset.get_next_batch() # update running mean/std for reward_giver if self.reward_giver.normalize: self.reward_giver.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0)) # Reshape actions if needed when using discrete actions if isinstance(self.action_space, gym.spaces.Discrete): if len(ac_batch.shape) == 2: ac_batch = ac_batch[:, 0] if len(ac_expert.shape) == 2: ac_expert = ac_expert[:, 0] *newlosses, grad = self.reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert) self.d_adam.update(self.allmean(grad), self.d_stepsize) d_losses.append(newlosses) logger.log(fmt_row(13, np.mean(d_losses, axis=0))) # lr: lengths and rewards lr_local = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]) # local values list_lr_pairs = MPI.COMM_WORLD.allgather(lr_local) # list of tuples lens, rews, true_rets = map(flatten_lists, zip(*list_lr_pairs)) true_reward_buffer.extend(true_rets) else: # lr: lengths and rewards lr_local = (seg["ep_lens"], seg["ep_rets"]) # local values list_lr_pairs = MPI.COMM_WORLD.allgather(lr_local) # list of tuples lens, rews = map(flatten_lists, zip(*list_lr_pairs)) len_buffer.extend(lens) reward_buffer.extend(rews) if len(len_buffer) > 0: logger.record_tabular("EpLenMean", np.mean(len_buffer)) logger.record_tabular("EpRewMean", np.mean(reward_buffer)) if self.using_gail: logger.record_tabular("EpTrueRewMean", np.mean(true_reward_buffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) current_it_timesteps = MPI.COMM_WORLD.allreduce(seg["total_timestep"]) timesteps_so_far += current_it_timesteps self.num_timesteps += current_it_timesteps iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", self.num_timesteps) logger.record_tabular("TimeElapsed", time.time() - t_start) if self.verbose >= 1 and self.rank == 0: logger.dump_tabular() callback.on_training_end() return self
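# Usage sketch (illustrative): launching the TRPO training loop above, assuming the upstream
# stable_baselines API; since TRPO synchronizes through MPI, multi-worker runs would typically
# be started under mpirun:
#
#     from stable_baselines import TRPO
#
#     model = TRPO('MlpPolicy', 'Pendulum-v0', timesteps_per_batch=1024, max_kl=0.01, verbose=1)
#     model.learn(total_timesteps=250000)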
def setup_model(self):
    # prevent import loops
    from stable_baselines_custom.gail.adversary import TransitionClassifier

    with SetVerbosity(self.verbose):

        assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \
                                                           "an instance of common.policies.ActorCriticPolicy."

        self.nworkers = MPI.COMM_WORLD.Get_size()
        self.rank = MPI.COMM_WORLD.Get_rank()
        np.set_printoptions(precision=3)

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.set_random_seed(self.seed)
            self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

            if self.using_gail:
                self.reward_giver = TransitionClassifier(self.observation_space, self.action_space,
                                                         self.hidden_size_adversary,
                                                         entcoeff=self.adversary_entcoeff)

            # Construct network for new policy
            self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                         None, reuse=False, **self.policy_kwargs)

            # Network for old policy
            with tf.variable_scope("oldpi", reuse=False):
                old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                         None, reuse=False, **self.policy_kwargs)

            with tf.variable_scope("loss", reuse=False):
                atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
                ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

                observation = self.policy_pi.obs_ph
                action = self.policy_pi.pdtype.sample_placeholder([None])

                kloldnew = old_policy.proba_distribution.kl(self.policy_pi.proba_distribution)
                ent = self.policy_pi.proba_distribution.entropy()
                meankl = tf.reduce_mean(kloldnew)
                meanent = tf.reduce_mean(ent)
                entbonus = self.entcoeff * meanent

                vferr = tf.reduce_mean(tf.square(self.policy_pi.value_flat - ret))

                # advantage * pnew / pold
                ratio = tf.exp(self.policy_pi.proba_distribution.logp(action) -
                               old_policy.proba_distribution.logp(action))
                surrgain = tf.reduce_mean(ratio * atarg)

                optimgain = surrgain + entbonus
                losses = [optimgain, meankl, entbonus, surrgain, meanent]
                self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

                dist = meankl

                all_var_list = tf_util.get_trainable_vars("model")
                var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name]
                vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name]

                self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess)

                klgrads = tf.gradients(dist, var_list)
                flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
                shapes = [var.get_shape().as_list() for var in var_list]
                start = 0
                tangents = []
                for shape in shapes:
                    var_size = tf_util.intprod(shape)
                    tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape))
                    start += var_size
                gvp = tf.add_n([tf.reduce_sum(grad * tangent)
                                for (grad, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
                # Fisher vector products
                fvp = tf_util.flatgrad(gvp, var_list)

                tf.summary.scalar('entropy_loss', meanent)
                tf.summary.scalar('policy_gradient_loss', optimgain)
                tf.summary.scalar('value_function_loss', surrgain)
                tf.summary.scalar('approximate_kullback-leibler', meankl)
                tf.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent)

                self.assign_old_eq_new = \
                    tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                      zipsame(tf_util.get_globals_vars("oldpi"),
                                                              tf_util.get_globals_vars("model"))])
                self.compute_losses = tf_util.function([observation, old_policy.obs_ph, action, atarg], losses)
                self.compute_fvp = tf_util.function([flat_tangent, observation, old_policy.obs_ph, action, atarg],
                                                    fvp)
                self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret],
                                                              tf_util.flatgrad(vferr, vf_var_list))

                @contextmanager
                def timed(msg):
                    if self.rank == 0 and self.verbose >= 1:
                        print(colorize(msg, color='magenta'))
                        start_time = time.time()
                        yield
                        print(colorize("done in {:.3f} seconds".format((time.time() - start_time)),
                                       color='magenta'))
                    else:
                        yield

                def allmean(arr):
                    assert isinstance(arr, np.ndarray)
                    out = np.empty_like(arr)
                    MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                    out /= self.nworkers
                    return out

                tf_util.initialize(sess=self.sess)

                th_init = self.get_flat()
                MPI.COMM_WORLD.Bcast(th_init, root=0)
                self.set_from_flat(th_init)

            with tf.variable_scope("Adam_mpi", reuse=False):
                self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                if self.using_gail:
                    self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess)
                    self.d_adam.sync()
                self.vfadam.sync()

            with tf.variable_scope("input_info", reuse=False):
                tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret))
                tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize))
                tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl))

                if self.full_tensorboard_log:
                    tf.summary.histogram('discounted_rewards', ret)
                    tf.summary.histogram('learning_rate', self.vf_stepsize)
                    tf.summary.histogram('advantage', atarg)
                    tf.summary.histogram('kl_clip_range', self.max_kl)
                    if tf_util.is_image(self.observation_space):
                        tf.summary.image('observation', observation)
                    else:
                        tf.summary.histogram('observation', observation)

            self.timed = timed
            self.allmean = allmean

            self.step = self.policy_pi.step
            self.proba_step = self.policy_pi.proba_step
            self.initial_state = self.policy_pi.initial_state

            self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi")
            if self.using_gail:
                self.params.extend(self.reward_giver.get_trainable_variables())

            self.summary = tf.summary.merge_all()

            self.compute_lossandgrad = \
                tf_util.function([observation, old_policy.obs_ph, action, atarg, ret],
                                 [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
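# The `fvp` tensor above is consumed by conjugate_gradient() in learn() to solve F x = g without
# ever forming the Fisher matrix F explicitly. A minimal sketch of that idea (generic conjugate
# gradient given only a matrix-vector product callable; not the library's own implementation):
#
#     import numpy as np
#
#     def cg(mat_vec, b, iters=10, residual_tol=1e-10):
#         x = np.zeros_like(b)
#         r = b.copy()                # residual b - A x, with x = 0
#         p = b.copy()
#         rdotr = r.dot(r)
#         for _ in range(iters):
#             z = mat_vec(p)
#             alpha = rdotr / p.dot(z)
#             x += alpha * p
#             r -= alpha * z
#             new_rdotr = r.dot(r)
#             p = r + (new_rdotr / rdotr) * p
#             rdotr = new_rdotr
#             if rdotr < residual_tol:
#                 break
#         return x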
def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO2",
          reset_num_timesteps=True, index_to_learn=None, all_models=None, tf_writer=None):
    # Transform to callable if needed
    self.learning_rate = get_schedule_fn(self.learning_rate)
    self.cliprange = get_schedule_fn(self.cliprange)
    cliprange_vf = get_schedule_fn(self.cliprange_vf)

    sv = SetVerbosity(self.verbose)
    sv.__enter__()

    if tf_writer is None:
        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        callback = self._init_callback(callback)
        writer = TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log).__enter__()
    else:
        writer = tf_writer

    self._setup_learn()

    t_first_start = time.time()
    n_updates = total_timesteps // self.n_batch

    callback.on_training_start(locals(), globals())

    for update in range(1, n_updates + 1):
        assert self.n_batch % self.nminibatches == 0, ("The number of minibatches (`nminibatches`) "
                                                       "is not a factor of the total number of samples "
                                                       "collected per rollout (`n_batch`), "
                                                       "some samples won't be used.")
        batch_size = self.n_batch // self.nminibatches
        t_start = time.time()
        frac = 1.0 - (update - 1.0) / n_updates
        lr_now = self.learning_rate(frac)
        cliprange_now = self.cliprange(frac)
        cliprange_vf_now = cliprange_vf(frac)

        callback.on_rollout_start()
        # true_reward is the reward without discount
        rollout = self.runner.run(callback, index_to_learn=index_to_learn)
        # Unpack
        obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = rollout

        callback.on_rollout_end()

        # Early stopping due to the callback
        if not self.runner.continue_training:
            break

        self.ep_info_buf.extend(ep_infos)
        mb_loss_vals = []
        if states is None:  # nonrecurrent version
            update_fac = max(self.n_batch // self.nminibatches // self.noptepochs, 1)
            inds = np.arange(self.n_batch)
            for epoch_num in range(self.noptepochs):
                np.random.shuffle(inds)
                for start in range(0, self.n_batch, batch_size):
                    timestep = self.num_timesteps // update_fac + ((epoch_num * self.n_batch + start) // batch_size)
                    end = start + batch_size
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, writer=writer,
                                                         update=timestep, cliprange_vf=cliprange_vf_now))
        else:  # recurrent version
            update_fac = max(self.n_batch // self.nminibatches // self.noptepochs // self.n_steps, 1)
            assert self.n_envs % self.nminibatches == 0
            env_indices = np.arange(self.n_envs)
            flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps)
            envs_per_batch = batch_size // self.n_steps
            for epoch_num in range(self.noptepochs):
                np.random.shuffle(env_indices)
                for start in range(0, self.n_envs, envs_per_batch):
                    timestep = self.num_timesteps // update_fac + ((epoch_num * self.n_envs + start) //
                                                                   envs_per_batch)
                    end = start + envs_per_batch
                    mb_env_inds = env_indices[start:end]
                    mb_flat_inds = flat_indices[mb_env_inds].ravel()
                    slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mb_states = states[mb_env_inds]
                    mb_loss_vals.append(self._train_step(lr_now, cliprange_now, *slices, update=timestep,
                                                         writer=writer, states=mb_states,
                                                         cliprange_vf=cliprange_vf_now))

        loss_vals = np.mean(mb_loss_vals, axis=0)
        t_now = time.time()
        fps = int(self.n_batch / (t_now - t_start))

        if writer is not None:
            total_episode_reward_logger(self.episode_reward,
                                        true_reward.reshape((self.n_envs, self.n_steps)),
                                        masks.reshape((self.n_envs, self.n_steps)),
                                        writer, self.num_timesteps)

        if self.verbose >= 1 and (update % log_interval == 0 or update == 1):
            explained_var = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * self.n_steps)
            logger.logkv("n_updates", update)
            logger.logkv("total_timesteps", self.num_timesteps)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(explained_var))
            if len(self.ep_info_buf) > 0 and len(self.ep_info_buf[0]) > 0:
                logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in self.ep_info_buf]))
                logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in self.ep_info_buf]))
            logger.logkv('time_elapsed', t_start - t_first_start)
            for (loss_val, loss_name) in zip(loss_vals, self.loss_names):
                logger.logkv(loss_name, loss_val)
            logger.dumpkvs()

    callback.on_training_end()
    sv.__exit__(None, None, None)
    return self, writer, callback
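# Usage sketch (illustrative): in upstream stable_baselines, PPO2 training looks like the snippet
# below. Note that this fork's learn() additionally accepts index_to_learn / all_models /
# tf_writer and returns (self, writer, callback), so callers here should unpack accordingly.
#
#     import gym
#     from stable_baselines import PPO2
#     from stable_baselines.common.vec_env import DummyVecEnv
#
#     env = DummyVecEnv([lambda: gym.make('CartPole-v1') for _ in range(4)])
#     model = PPO2('MlpPolicy', env, n_steps=128, nminibatches=4, noptepochs=4, verbose=1)
#     model.learn(total_timesteps=100000)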
def setup_model(self): with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO2 model must be " \ "an instance of common.policies.ActorCriticPolicy." self.n_batch = self.n_envs * self.n_steps self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) n_batch_step = None n_batch_train = None if issubclass(self.policy, RecurrentActorCriticPolicy): assert self.n_envs % self.nminibatches == 0, "For recurrent policies, "\ "the number of environments run in parallel should be a multiple of nminibatches." n_batch_step = self.n_envs n_batch_train = self.n_batch // self.nminibatches act_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, n_batch_step, reuse=False, **self.policy_kwargs) with tf.variable_scope( "train_model", reuse=True, custom_getter=tf_util.outer_scope_getter( "train_model")): train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs // self.nminibatches, self.n_steps, n_batch_train, reuse=True, **self.policy_kwargs) with tf.variable_scope("loss", reuse=False): self.action_ph = train_model.pdtype.sample_placeholder( [None], name="action_ph") self.advs_ph = tf.placeholder(tf.float32, [None], name="advs_ph") self.rewards_ph = tf.placeholder(tf.float32, [None], name="rewards_ph") self.old_neglog_pac_ph = tf.placeholder( tf.float32, [None], name="old_neglog_pac_ph") self.old_vpred_ph = tf.placeholder(tf.float32, [None], name="old_vpred_ph") self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") self.clip_range_ph = tf.placeholder(tf.float32, [], name="clip_range_ph") neglogpac = train_model.proba_distribution.neglogp( self.action_ph) self.entropy = tf.reduce_mean( train_model.proba_distribution.entropy()) vpred = train_model.value_flat # Value function clipping: not present in the original PPO if self.cliprange_vf is None: # Default behavior (legacy from OpenAI baselines): # use the same clipping as for the policy self.clip_range_vf_ph = self.clip_range_ph self.cliprange_vf = self.cliprange elif isinstance(self.cliprange_vf, (float, int)) and self.cliprange_vf < 0: # Original PPO implementation: no value function clipping self.clip_range_vf_ph = None else: # Last possible behavior: clipping range # specific to the value function self.clip_range_vf_ph = tf.placeholder( tf.float32, [], name="clip_range_vf_ph") if self.clip_range_vf_ph is None: # No clipping vpred_clipped = train_model.value_flat else: # Clip the different between old and new value # NOTE: this depends on the reward scaling vpred_clipped = self.old_vpred_ph + \ tf.clip_by_value(train_model.value_flat - self.old_vpred_ph, - self.clip_range_vf_ph, self.clip_range_vf_ph) vf_losses1 = tf.square(vpred - self.rewards_ph) vf_losses2 = tf.square(vpred_clipped - self.rewards_ph) self.vf_loss = .5 * tf.reduce_mean( tf.maximum(vf_losses1, vf_losses2)) ratio = tf.exp(self.old_neglog_pac_ph - neglogpac) pg_losses = -self.advs_ph * ratio pg_losses2 = -self.advs_ph * tf.clip_by_value( ratio, 1.0 - self.clip_range_ph, 1.0 + self.clip_range_ph) self.pg_loss = tf.reduce_mean( tf.maximum(pg_losses, pg_losses2)) self.approxkl = .5 * tf.reduce_mean( tf.square(neglogpac - self.old_neglog_pac_ph)) self.clipfrac = tf.reduce_mean( tf.cast( tf.greater(tf.abs(ratio - 1.0), self.clip_range_ph), tf.float32)) loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * 
self.vf_coef tf.summary.scalar('entropy_loss', self.entropy) tf.summary.scalar('policy_gradient_loss', self.pg_loss) tf.summary.scalar('value_function_loss', self.vf_loss) tf.summary.scalar('approximate_kullback-leibler', self.approxkl) tf.summary.scalar('clip_factor', self.clipfrac) tf.summary.scalar('loss', loss) with tf.variable_scope('model'): self.params = tf.trainable_variables() if self.full_tensorboard_log: for var in self.params: tf.summary.histogram(var.name, var) grads = tf.gradients(loss, self.params) if self.max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm( grads, self.max_grad_norm) grads = list(zip(grads, self.params)) trainer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph, epsilon=1e-5) self._train = trainer.apply_gradients(grads) self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph)) tf.summary.scalar('clip_range', tf.reduce_mean(self.clip_range_ph)) if self.clip_range_vf_ph is not None: tf.summary.scalar( 'clip_range_vf', tf.reduce_mean(self.clip_range_vf_ph)) tf.summary.scalar('old_neglog_action_probability', tf.reduce_mean(self.old_neglog_pac_ph)) tf.summary.scalar('old_value_pred', tf.reduce_mean(self.old_vpred_ph)) if self.full_tensorboard_log: tf.summary.histogram('discounted_rewards', self.rewards_ph) tf.summary.histogram('learning_rate', self.learning_rate_ph) tf.summary.histogram('advantage', self.advs_ph) tf.summary.histogram('clip_range', self.clip_range_ph) tf.summary.histogram('old_neglog_action_probability', self.old_neglog_pac_ph) tf.summary.histogram('old_value_pred', self.old_vpred_ph) if tf_util.is_image(self.observation_space): tf.summary.image('observation', train_model.obs_ph) else: tf.summary.histogram('observation', train_model.obs_ph) self.train_model = train_model self.act_model = act_model self.step = act_model.step self.proba_step = act_model.proba_step self.value = act_model.value self.initial_state = act_model.initial_state tf.global_variables_initializer().run(session=self.sess) # pylint: disable=E1101 self.summary = tf.summary.merge_all()
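# The clipped surrogate objective built above can be checked numerically; a NumPy sketch of the
# same formula with illustrative values (not taken from a real rollout):
#
#     import numpy as np
#     advs = np.array([1.0, -1.0])
#     ratio = np.array([1.5, 0.5])        # pi_new / pi_old
#     clip_range = 0.2
#     pg_losses = -advs * ratio
#     pg_losses2 = -advs * np.clip(ratio, 1.0 - clip_range, 1.0 + clip_range)
#     pg_loss = np.mean(np.maximum(pg_losses, pg_losses2))   # mean([-1.2, 0.8]) = -0.2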
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACKTR", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() self.n_batch = self.n_envs * self.n_steps self.learning_rate_schedule = Scheduler( initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) # FIFO queue of the q_runner thread is closed at the end of the learn function. # As a result, it needs to be redefinied at every call with self.graph.as_default(): with tf.variable_scope( "kfac_apply", reuse=self.trained, custom_getter=tf_util.outer_scope_getter( "kfac_apply")): # Some of the variables are not in a scope when they are create # so we make a note of any previously uninitialized variables tf_vars = tf.global_variables() is_uninitialized = self.sess.run( [tf.is_variable_initialized(var) for var in tf_vars]) old_uninitialized_vars = [ v for (v, f) in zip(tf_vars, is_uninitialized) if not f ] self.train_op, self.q_runner = self.optim.apply_gradients( list(zip(self.grads_check, self.params))) # then we check for new uninitialized variables and initialize them tf_vars = tf.global_variables() is_uninitialized = self.sess.run( [tf.is_variable_initialized(var) for var in tf_vars]) new_uninitialized_vars = [ v for (v, f) in zip(tf_vars, is_uninitialized) if not f and v not in old_uninitialized_vars ] if len(new_uninitialized_vars) != 0: self.sess.run( tf.variables_initializer(new_uninitialized_vars)) self.trained = True t_start = time.time() coord = tf.train.Coordinator() if self.q_runner is not None: enqueue_threads = self.q_runner.create_threads(self.sess, coord=coord, start=True) else: enqueue_threads = [] callback.on_training_start(locals(), globals()) for update in range(1, total_timesteps // self.n_batch + 1): callback.on_rollout_start() # pytype:disable=bad-unpacking # true_reward is the reward without discount if isinstance(self.runner, PPO2Runner): # We are using GAE rollout = self.runner.run(callback) obs, returns, masks, actions, values, _, states, ep_infos, true_reward = rollout else: rollout = self.runner.run(callback) obs, states, returns, masks, actions, values, ep_infos, true_reward = rollout # pytype:enable=bad-unpacking callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break self.ep_info_buf.extend(ep_infos) policy_loss, value_loss, policy_entropy = self._train_step( obs, states, returns, masks, actions, values, self.num_timesteps // (self.n_batch + 1), writer) n_seconds = time.time() - t_start fps = int((update * self.n_batch) / n_seconds) if writer is not None: total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", self.num_timesteps) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("policy_loss", float(policy_loss)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(explained_var)) if len(self.ep_info_buf) > 0 and len( self.ep_info_buf[0]) > 0: logger.logkv( 
'ep_reward_mean', safe_mean([ ep_info['r'] for ep_info in self.ep_info_buf ])) logger.logkv( 'ep_len_mean', safe_mean([ ep_info['l'] for ep_info in self.ep_info_buf ])) logger.dump_tabular() coord.request_stop() coord.join(enqueue_threads) callback.on_training_end() return self
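# Usage sketch (illustrative), assuming the upstream stable_baselines API; ACKTR also accepts a
# gym environment id directly, which is the simplest way to exercise the loop above:
#
#     from stable_baselines import ACKTR
#
#     model = ACKTR('MlpPolicy', 'CartPole-v1', verbose=1)
#     model.learn(total_timesteps=250000, log_interval=10)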
def setup_model(self):
    with SetVerbosity(self.verbose):

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.set_random_seed(self.seed)
            self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

            # Construct network for new policy
            self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                         None, reuse=False, **self.policy_kwargs)

            # Network for old policy
            with tf.variable_scope("oldpi", reuse=False):
                old_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                     None, reuse=False, **self.policy_kwargs)

            with tf.variable_scope("loss", reuse=False):
                # Target advantage function (if applicable)
                atarg = tf.placeholder(dtype=tf.float32, shape=[None])

                # Empirical return
                ret = tf.placeholder(dtype=tf.float32, shape=[None])

                # learning rate multiplier, updated with schedule
                lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])

                # Annealed clipping parameter epsilon
                clip_param = self.clip_param * lrmult

                obs_ph = self.policy_pi.obs_ph
                action_ph = self.policy_pi.pdtype.sample_placeholder([None])

                kloldnew = old_pi.proba_distribution.kl(self.policy_pi.proba_distribution)
                ent = self.policy_pi.proba_distribution.entropy()
                meankl = tf.reduce_mean(kloldnew)
                meanent = tf.reduce_mean(ent)
                pol_entpen = (-self.entcoeff) * meanent

                # pnew / pold
                ratio = tf.exp(self.policy_pi.proba_distribution.logp(action_ph) -
                               old_pi.proba_distribution.logp(action_ph))

                # surrogate from conservative policy iteration
                surr1 = ratio * atarg
                surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg

                # PPO's pessimistic surrogate (L^CLIP)
                pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))
                vf_loss = tf.reduce_mean(tf.square(self.policy_pi.value_flat - ret))
                total_loss = pol_surr + pol_entpen + vf_loss
                losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
                self.loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

                tf.summary.scalar('entropy_loss', pol_entpen)
                tf.summary.scalar('policy_gradient_loss', pol_surr)
                tf.summary.scalar('value_function_loss', vf_loss)
                tf.summary.scalar('approximate_kullback-leibler', meankl)
                tf.summary.scalar('clip_factor', clip_param)
                tf.summary.scalar('loss', total_loss)

                self.params = tf_util.get_trainable_vars("model")

                self.assign_old_eq_new = tf_util.function(
                    [], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                     zipsame(tf_util.get_globals_vars("oldpi"),
                                             tf_util.get_globals_vars("model"))])

            with tf.variable_scope("Adam_mpi", reuse=False):
                self.adam = MpiAdam(self.params, epsilon=self.adam_epsilon, sess=self.sess)

            with tf.variable_scope("input_info", reuse=False):
                tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret))
                tf.summary.scalar('learning_rate', tf.reduce_mean(self.optim_stepsize))
                tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                tf.summary.scalar('clip_range', tf.reduce_mean(self.clip_param))

                if self.full_tensorboard_log:
                    tf.summary.histogram('discounted_rewards', ret)
                    tf.summary.histogram('learning_rate', self.optim_stepsize)
                    tf.summary.histogram('advantage', atarg)
                    tf.summary.histogram('clip_range', self.clip_param)
                    if tf_util.is_image(self.observation_space):
                        tf.summary.image('observation', obs_ph)
                    else:
                        tf.summary.histogram('observation', obs_ph)

            self.step = self.policy_pi.step
            self.proba_step = self.policy_pi.proba_step
            self.initial_state = self.policy_pi.initial_state

            tf_util.initialize(sess=self.sess)

            self.summary = tf.summary.merge_all()

            self.lossandgrad = tf_util.function([obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                                                [self.summary, tf_util.flatgrad(total_loss, self.params)] + losses)
            self.compute_losses = tf_util.function([obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                                                   losses)
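# Usage sketch (illustrative), assuming the upstream stable_baselines API; PPO1 uses MpiAdam as
# set up above, so multi-worker training would be launched under mpirun:
#
#     from stable_baselines import PPO1
#
#     model = PPO1('MlpPolicy', 'CartPole-v1', clip_param=0.2, entcoeff=0.01, verbose=1)
#     model.learn(total_timesteps=100000)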
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A2C", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() self.learning_rate_schedule = Scheduler( initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) t_start = time.time() callback.on_training_start(locals(), globals()) for update in range(1, total_timesteps // self.n_batch + 1): callback.on_rollout_start() # true_reward is the reward without discount rollout = self.runner.run(callback) # unpack obs, states, rewards, masks, actions, values, ep_infos, true_reward = rollout callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break self.ep_info_buf.extend(ep_infos) _, value_loss, policy_entropy = self._train_step( obs, states, rewards, masks, actions, values, self.num_timesteps // self.n_batch, writer) n_seconds = time.time() - t_start fps = int((update * self.n_batch) / n_seconds) if writer is not None: total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", self.num_timesteps) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(explained_var)) if len(self.ep_info_buf) > 0 and len( self.ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean([ ep_info['r'] for ep_info in self.ep_info_buf ])) logger.logkv( 'ep_len_mean', safe_mean([ ep_info['l'] for ep_info in self.ep_info_buf ])) logger.dump_tabular() callback.on_training_end() return self
def setup_model(self): with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the A2C model must be an " \ "instance of common.policies.ActorCriticPolicy." self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.n_batch = self.n_envs * self.n_steps n_batch_step = None n_batch_train = None if issubclass(self.policy, RecurrentActorCriticPolicy): n_batch_step = self.n_envs n_batch_train = self.n_envs * self.n_steps step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, n_batch_step, reuse=False, **self.policy_kwargs) with tf.variable_scope( "train_model", reuse=True, custom_getter=tf_util.outer_scope_getter( "train_model")): train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, self.n_steps, n_batch_train, reuse=True, **self.policy_kwargs) with tf.variable_scope("loss", reuse=False): self.actions_ph = train_model.pdtype.sample_placeholder( [None], name="action_ph") self.advs_ph = tf.placeholder(tf.float32, [None], name="advs_ph") self.rewards_ph = tf.placeholder(tf.float32, [None], name="rewards_ph") self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") neglogpac = train_model.proba_distribution.neglogp( self.actions_ph) self.entropy = tf.reduce_mean( train_model.proba_distribution.entropy()) self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac) self.vf_loss = mse(tf.squeeze(train_model.value_flat), self.rewards_ph) # https://arxiv.org/pdf/1708.04782.pdf#page=9, https://arxiv.org/pdf/1602.01783.pdf#page=4 # and https://github.com/dennybritz/reinforcement-learning/issues/34 # suggest to add an entropy component in order to improve exploration. loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef tf.summary.scalar('entropy_loss', self.entropy) tf.summary.scalar('policy_gradient_loss', self.pg_loss) tf.summary.scalar('value_function_loss', self.vf_loss) tf.summary.scalar('loss', loss) self.params = tf_util.get_trainable_vars("model") grads = tf.gradients(loss, self.params) if self.max_grad_norm is not None: grads, _ = tf.clip_by_global_norm( grads, self.max_grad_norm) grads = list(zip(grads, self.params)) with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph)) if self.full_tensorboard_log: tf.summary.histogram('discounted_rewards', self.rewards_ph) tf.summary.histogram('learning_rate', self.learning_rate_ph) tf.summary.histogram('advantage', self.advs_ph) if tf_util.is_image(self.observation_space): tf.summary.image('observation', train_model.obs_ph) else: tf.summary.histogram('observation', train_model.obs_ph) trainer = tf.train.RMSPropOptimizer( learning_rate=self.learning_rate_ph, decay=self.alpha, epsilon=self.epsilon, momentum=self.momentum) self.apply_backprop = trainer.apply_gradients(grads) self.train_model = train_model self.step_model = step_model self.step = step_model.step self.proba_step = step_model.proba_step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=self.sess) self.summary = tf.summary.merge_all()
def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy.obs_ph self.processed_next_obs_ph = self.target_policy.processed_obs self.action_target = self.target_policy.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph") with tf.variable_scope("model", reuse=False): # Create the policy # first return value corresponds to deterministic actions # policy_out corresponds to stochastic actions, used for training # logp_pi is the log probability of actions taken by the policy self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor(self.processed_obs_ph) # Monitor the entropy of the policy, # this is not used for training self.entropy = tf.reduce_mean(self.policy_tf.entropy) # Use two Q-functions to improve performance by reducing overestimation bias. 
qf1, qf2, value_fn = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph, create_qf=True, create_vf=True) qf1_pi, qf2_pi, _ = self.policy_tf.make_critics(self.processed_obs_ph, policy_out, create_qf=True, create_vf=False, reuse=True) # Target entropy is used when learning the entropy coefficient if self.target_entropy == 'auto': # automatically set target entropy if needed self.target_entropy = -np.prod(self.action_space.shape).astype(np.float32) else: # Force conversion # this will also throw an error for unexpected string self.target_entropy = float(self.target_entropy) # The entropy coefficient or entropy can be learned automatically # see Automating Entropy Adjustment for Maximum Entropy RL section # of https://arxiv.org/abs/1812.05905 if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'): # Default initial value of ent_coef when learned init_value = 1.0 if '_' in self.ent_coef: init_value = float(self.ent_coef.split('_')[1]) assert init_value > 0., "The initial value of ent_coef must be greater than 0" self.log_ent_coef = tf.get_variable('log_ent_coef', dtype=tf.float32, initializer=np.log(init_value).astype(np.float32)) self.ent_coef = tf.exp(self.log_ent_coef) else: # Force conversion to float # this will throw an error if a malformed string (different from 'auto') # is passed self.ent_coef = float(self.ent_coef) with tf.variable_scope("target", reuse=False): # Create the value network _, _, value_target = self.target_policy.make_critics(self.processed_next_obs_ph, create_qf=False, create_vf=True) self.value_target = value_target with tf.variable_scope("loss", reuse=False): # Take the min of the two Q-Values (Double-Q Learning) min_qf_pi = tf.minimum(qf1_pi, qf2_pi) # Target for Q value regression q_backup = tf.stop_gradient( self.rewards_ph + (1 - self.terminals_ph) * self.gamma * self.value_target ) # Compute Q-Function loss # TODO: test with huber loss (it would avoid too high values) qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1) ** 2) qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2) ** 2) # Compute the entropy temperature loss # it is used when the entropy coefficient is learned ent_coef_loss, entropy_optimizer = None, None if not isinstance(self.ent_coef, float): ent_coef_loss = -tf.reduce_mean( self.log_ent_coef * tf.stop_gradient(logp_pi + self.target_entropy)) entropy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) # Compute the policy loss # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi) policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf1_pi) # NOTE: in the original implementation, they have an additional # regularization loss for the Gaussian parameters # this is not used for now # policy_loss = (policy_kl_loss + policy_regularization_loss) policy_loss = policy_kl_loss # Target for value fn regression # We update the vf towards the min of two Q-functions in order to # reduce overestimation bias from function approximation error. 
v_backup = tf.stop_gradient(min_qf_pi - self.ent_coef * logp_pi) value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup) ** 2) values_losses = qf1_loss + qf2_loss + value_loss # Policy train op # (has to be separate from value train op, because min_qf_pi appears in policy_loss) policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize(policy_loss, var_list=tf_util.get_trainable_vars('model/pi')) # Value train op value_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) values_params = tf_util.get_trainable_vars('model/values_fn') source_params = tf_util.get_trainable_vars("model/values_fn/vf") target_params = tf_util.get_trainable_vars("target/values_fn/vf") # Polyak averaging for target variables self.target_update_op = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] # Control flow is used because sess.run otherwise evaluates in nondeterministic order # and we first need to compute the policy action before computing q values losses with tf.control_dependencies([policy_train_op]): train_values_op = value_optimizer.minimize(values_losses, var_list=values_params) self.infos_names = ['policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy'] # All ops to call during one training step self.step_ops = [policy_loss, qf1_loss, qf2_loss, value_loss, qf1, qf2, value_fn, logp_pi, self.entropy, policy_train_op, train_values_op] # Add entropy coefficient optimization operation if needed if ent_coef_loss is not None: with tf.control_dependencies([train_values_op]): ent_coef_op = entropy_optimizer.minimize(ent_coef_loss, var_list=self.log_ent_coef) self.infos_names += ['ent_coef_loss', 'ent_coef'] self.step_ops += [ent_coef_op, ent_coef_loss, self.ent_coef] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('value_loss', value_loss) tf.summary.scalar('entropy', self.entropy) if ent_coef_loss is not None: tf.summary.scalar('ent_coef_loss', ent_coef_loss) tf.summary.scalar('ent_coef', self.ent_coef) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = tf_util.get_trainable_vars("model") self.target_params = tf_util.get_trainable_vars("target/values_fn/vf") # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all()
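# Usage sketch (illustrative), assuming the upstream stable_baselines API; passing ent_coef='auto'
# exercises the learned entropy coefficient path set up above, while e.g. 'auto_0.1' would start
# the coefficient at 0.1:
#
#     from stable_baselines import SAC
#
#     model = SAC('MlpPolicy', 'Pendulum-v0', ent_coef='auto', learning_starts=1000, verbose=1)
#     model.learn(total_timesteps=50000, log_interval=10)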
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() # Create the replay buffer if self.prioritized_replay: self.replay_buffer = PrioritizedReplayBuffer( self.buffer_size, alpha=self.prioritized_replay_alpha) if self.prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps else: prioritized_replay_beta_iters = self.prioritized_replay_beta_iters self.beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=self.prioritized_replay_beta0, final_p=1.0) else: self.replay_buffer = ReplayBuffer(self.buffer_size) self.beta_schedule = None if replay_wrapper is not None: assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, final_p=self.exploration_final_eps) episode_rewards = [0.0] episode_successes = [] callback.on_training_start(locals(), globals()) callback.on_rollout_start() reset = True obs = self.env.reset() # Retrieve unnormalized observation for saving into the buffer if self._vec_normalize_env is not None: obs_ = self._vec_normalize_env.get_original_obs().squeeze() for _ in range(total_timesteps): # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ -np.log(1. - self.exploration.value(self.num_timesteps) + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True with self.sess.as_default(): action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, info = self.env.step(env_action) self.num_timesteps += 1 # Stop training if return value is False if callback.on_step() is False: break # Store only the unnormalized version if self._vec_normalize_env is not None: new_obs_ = self._vec_normalize_env.get_original_obs( ).squeeze() reward_ = self._vec_normalize_env.get_original_reward( ).squeeze() else: # Avoid changing the original ones obs_, new_obs_, reward_ = obs, new_obs, rew # Store transition in the replay buffer. 
self.replay_buffer.add(obs_, action, reward_, new_obs_, float(done)) obs = new_obs # Save the unnormalized observation if self._vec_normalize_env is not None: obs_ = new_obs_ if writer is not None: ep_rew = np.array([reward_]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) tf_util.total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) episode_rewards[-1] += reward_ if done: maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append(float(maybe_is_success)) if not isinstance(self.env, VecEnv): obs = self.env.reset() episode_rewards.append(0.0) reset = True # Do not train if the warmup phase is not over # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ and self.num_timesteps % self.train_freq == 0: callback.on_rollout_end() # Minimize the error in Bellman's equation on a batch sampled from replay buffer. # pytype:disable=bad-unpacking if self.prioritized_replay: assert self.beta_schedule is not None, \ "BUG: should be LinearSchedule when self.prioritized_replay True" experience = self.replay_buffer.sample( self.batch_size, beta=self.beta_schedule.value(self.num_timesteps), env=self._vec_normalize_env) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample( self.batch_size, env=self._vec_normalize_env) weights, batch_idxes = np.ones_like(rewards), None # pytype:enable=bad-unpacking if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step( obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) if self.prioritized_replay: new_priorities = np.abs( td_errors) + self.prioritized_replay_eps assert isinstance(self.replay_buffer, PrioritizedReplayBuffer) self.replay_buffer.update_priorities( batch_idxes, new_priorities) callback.on_rollout_start() if can_sample and self.num_timesteps > self.learning_starts and \ self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. 
self.update_target(sess=self.sess) if len(episode_rewards[-101:-1]) == 0: mean_100ep_reward = -np.inf else: mean_100ep_reward = round( float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len( episode_rewards) % log_interval == 0: logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular( "% time spent exploring", int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() callback.on_training_end() return self
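# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the library code). The DQN loop above
# combines three small pieces of machinery: a LinearSchedule that anneals
# epsilon, the parameter-noise threshold -log(1 - eps + eps / n_actions), and
# prioritized replay, whose priorities are refreshed with |td_error| + eps
# after every gradient step. The plain-NumPy re-implementations below use
# illustrative names and mirror the standard formulas, not the exact buffer
# internals.
import numpy as np


class LinearScheduleSketch:
    """Linear interpolation from initial_p to final_p over schedule_timesteps."""

    def __init__(self, schedule_timesteps, initial_p, final_p):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, step):
        fraction = min(float(step) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)


def param_noise_threshold(eps, n_actions):
    # KL threshold making parameter-space noise comparable to eps-greedy
    # exploration with the same eps (Plappert et al., 2017, Appendix C.1).
    return -np.log(1.0 - eps + eps / float(n_actions))


def per_update(td_errors, alpha=0.6, beta=0.4, prioritized_replay_eps=1e-6):
    """Proportional prioritized replay: new priorities, sampling probabilities and
    max-normalized importance weights for every stored transition (a sampled
    minibatch would index into these arrays)."""
    priorities = np.abs(td_errors) + prioritized_replay_eps
    probs = priorities ** alpha
    probs /= probs.sum()
    weights = (len(probs) * probs) ** (-beta)
    return priorities, probs, weights / weights.max()


if __name__ == "__main__":
    exploration = LinearScheduleSketch(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
    for step in (0, 5000, 20000):
        eps = exploration.value(step)
        print(step, round(eps, 3), round(param_noise_threshold(eps, n_actions=4), 3))
    print(per_update(np.array([0.1, -2.0, 0.5, 0.0])))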
def setup_model(self): with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the ACER model must be " \ "an instance of common.policies.ActorCriticPolicy." if isinstance(self.action_space, Discrete): self.n_act = self.action_space.n continuous = False elif isinstance(self.action_space, Box): # self.n_act = self.action_space.shape[-1] # continuous = True raise NotImplementedError( "WIP: Acer does not support Continuous actions yet.") else: raise ValueError( "Error: ACER does not work with {} actions space.".format( self.action_space)) self.n_batch = self.n_envs * self.n_steps self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.set_random_seed(self.seed) n_batch_step = None if issubclass(self.policy, RecurrentActorCriticPolicy): n_batch_step = self.n_envs n_batch_train = self.n_envs * (self.n_steps + 1) step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, n_batch_step, reuse=False, **self.policy_kwargs) self.params = tf_util.get_trainable_vars("model") with tf.variable_scope( "train_model", reuse=True, custom_getter=tf_util.outer_scope_getter( "train_model")): train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, self.n_steps + 1, n_batch_train, reuse=True, **self.policy_kwargs) with tf.variable_scope("moving_average"): # create averaged model ema = tf.train.ExponentialMovingAverage(self.alpha) ema_apply_op = ema.apply(self.params) def custom_getter(getter, name, *args, **kwargs): name = name.replace("polyak_model/", "") val = ema.average(getter(name, *args, **kwargs)) return val with tf.variable_scope("polyak_model", reuse=True, custom_getter=custom_getter): self.polyak_model = polyak_model = self.policy( self.sess, self.observation_space, self.action_space, self.n_envs, self.n_steps + 1, self.n_envs * (self.n_steps + 1), reuse=True, **self.policy_kwargs) with tf.variable_scope("loss", reuse=False): self.done_ph = tf.placeholder(tf.float32, [self.n_batch]) # dones self.reward_ph = tf.placeholder( tf.float32, [self.n_batch]) # rewards, not returns self.mu_ph = tf.placeholder( tf.float32, [self.n_batch, self.n_act]) # mu's self.action_ph = train_model.pdtype.sample_placeholder( [self.n_batch]) self.learning_rate_ph = tf.placeholder(tf.float32, []) eps = 1e-6 # Notation: (var) = batch variable, (var)s = sequence variable, # (var)_i = variable index by action at step i # shape is [n_envs * (n_steps + 1)] if continuous: value = train_model.value_flat else: value = tf.reduce_sum(train_model.policy_proba * train_model.q_value, axis=-1) rho, rho_i_ = None, None if continuous: action_ = strip( train_model.proba_distribution.sample(), self.n_envs, self.n_steps) distribution_f = tf.contrib.distributions.MultivariateNormalDiag( loc=strip(train_model.proba_distribution.mean, self.n_envs, self.n_steps), scale_diag=strip( train_model.proba_distribution.logstd, self.n_envs, self.n_steps)) f_polyak = tf.contrib.distributions.MultivariateNormalDiag( loc=strip(polyak_model.proba_distribution.mean, self.n_envs, self.n_steps), scale_diag=strip( polyak_model.proba_distribution.logstd, self.n_envs, self.n_steps)) f_i = distribution_f.prob(self.action_ph) f_i_ = distribution_f.prob(action_) f_polyak_i = f_polyak.prob(self.action_ph) phi_i = strip(train_model.proba_distribution.mean, self.n_envs, self.n_steps) q_value = strip(train_model.value_fn, self.n_envs, self.n_steps) q_i = q_value[:, 0] 
rho_i = tf.reshape(f_i, [-1, 1]) / (self.mu_ph + eps) rho_i_ = tf.reshape(f_i_, [-1, 1]) / (self.mu_ph + eps) qret = q_retrace(self.reward_ph, self.done_ph, q_i, value, tf.pow(rho_i, 1 / self.n_act), self.n_envs, self.n_steps, self.gamma) else: # strip off last step # f is a distribution, chosen to be Gaussian distributions # with fixed diagonal covariance and mean \phi(x) # in the paper distribution_f, f_polyak, q_value = \ map(lambda variables: strip(variables, self.n_envs, self.n_steps), [train_model.policy_proba, polyak_model.policy_proba, train_model.q_value]) # Get pi and q values for actions taken f_i = get_by_index(distribution_f, self.action_ph) f_i_ = distribution_f phi_i = distribution_f f_polyak_i = f_polyak q_i = get_by_index(q_value, self.action_ph) # Compute ratios for importance truncation rho = distribution_f / (self.mu_ph + eps) rho_i = get_by_index(rho, self.action_ph) # Calculate Q_retrace targets qret = q_retrace(self.reward_ph, self.done_ph, q_i, value, rho_i, self.n_envs, self.n_steps, self.gamma) # Calculate losses # Entropy entropy = tf.reduce_sum( train_model.proba_distribution.entropy()) # Policy Gradient loss, with truncated importance sampling & bias correction value = strip(value, self.n_envs, self.n_steps, True) # check_shape([qret, value, rho_i, f_i], [[self.n_envs * self.n_steps]] * 4) # check_shape([rho, distribution_f, q_value], [[self.n_envs * self.n_steps, self.n_act]] * 2) # Truncated importance sampling adv = qret - value log_f = tf.log(f_i + eps) # [n_envs * n_steps] gain_f = log_f * tf.stop_gradient( adv * tf.minimum(self.correction_term, rho_i)) loss_f = -tf.reduce_mean(gain_f) # Bias correction for the truncation adv_bc = ( q_value - tf.reshape(value, [self.n_envs * self.n_steps, 1]) ) # [n_envs * n_steps, n_act] # check_shape([adv_bc, log_f_bc], [[self.n_envs * self.n_steps, self.n_act]] * 2) if continuous: gain_bc = tf.stop_gradient( adv_bc * tf.nn.relu(1.0 - (self.correction_term / (rho_i_ + eps))) * f_i_) else: log_f_bc = tf.log(f_i_ + eps) # / (f_old + eps) gain_bc = tf.reduce_sum(log_f_bc * tf.stop_gradient( adv_bc * tf.nn.relu(1.0 - (self.correction_term / (rho + eps))) * f_i_), axis=1) # IMP: This is sum, as expectation wrt f loss_bc = -tf.reduce_mean(gain_bc) loss_policy = loss_f + loss_bc # Value/Q function loss, and explained variance check_shape([qret, q_i], [[self.n_envs * self.n_steps]] * 2) explained_variance = q_explained_variance( tf.reshape(q_i, [self.n_envs, self.n_steps]), tf.reshape(qret, [self.n_envs, self.n_steps])) loss_q = tf.reduce_mean( tf.square(tf.stop_gradient(qret) - q_i) * 0.5) # Net loss check_shape([loss_policy, loss_q, entropy], [[]] * 3) loss = loss_policy + self.q_coef * loss_q - self.ent_coef * entropy tf.summary.scalar('entropy_loss', entropy) tf.summary.scalar('policy_gradient_loss', loss_policy) tf.summary.scalar('value_function_loss', loss_q) tf.summary.scalar('loss', loss) norm_grads_q, norm_grads_policy, avg_norm_grads_f = None, None, None avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj = None, None, None, None if self.trust_region: # [n_envs * n_steps, n_act] grad = tf.gradients( -(loss_policy - self.ent_coef * entropy) * self.n_steps * self.n_envs, phi_i) # [n_envs * n_steps, n_act] # Directly computed gradient of KL divergence wrt f kl_grad = -f_polyak_i / (f_i_ + eps) k_dot_g = tf.reduce_sum(kl_grad * grad, axis=-1) adj = tf.maximum( 0.0, (tf.reduce_sum(kl_grad * grad, axis=-1) - self.delta) / (tf.reduce_sum(tf.square(kl_grad), axis=-1) + eps)) # [n_envs * n_steps] # Calculate stats (before 
doing adjustment) for logging. avg_norm_k = avg_norm(kl_grad) avg_norm_g = avg_norm(grad) avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g)) avg_norm_adj = tf.reduce_mean(tf.abs(adj)) grad = grad - tf.reshape( adj, [self.n_envs * self.n_steps, 1]) * kl_grad # These are trust region adjusted gradients wrt f, i.e. statistics of policy pi grads_f = -grad / (self.n_envs * self.n_steps) grads_policy = tf.gradients(f_i_, self.params, grads_f) grads_q = tf.gradients(loss_q * self.q_coef, self.params) grads = [ gradient_add(g1, g2, param, verbose=self.verbose) for (g1, g2, param ) in zip(grads_policy, grads_q, self.params) ] avg_norm_grads_f = avg_norm(grads_f) * (self.n_steps * self.n_envs) norm_grads_q = tf.global_norm(grads_q) norm_grads_policy = tf.global_norm(grads_policy) else: grads = tf.gradients(loss, self.params) norm_grads = None if self.max_grad_norm is not None: grads, norm_grads = tf.clip_by_global_norm( grads, self.max_grad_norm) grads = list(zip(grads, self.params)) with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('rewards', tf.reduce_mean(self.reward_ph)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate)) tf.summary.scalar('advantage', tf.reduce_mean(adv)) tf.summary.scalar('action_probability', tf.reduce_mean(self.mu_ph)) if self.full_tensorboard_log: tf.summary.histogram('rewards', self.reward_ph) tf.summary.histogram('learning_rate', self.learning_rate) tf.summary.histogram('advantage', adv) tf.summary.histogram('action_probability', self.mu_ph) if tf_util.is_image(self.observation_space): tf.summary.image('observation', train_model.obs_ph) else: tf.summary.histogram('observation', train_model.obs_ph) trainer = tf.train.RMSPropOptimizer( learning_rate=self.learning_rate_ph, decay=self.rprop_alpha, epsilon=self.rprop_epsilon) _opt_op = trainer.apply_gradients(grads) # so when you call _train, you first do the gradient step, then you apply ema with tf.control_dependencies([_opt_op]): _train = tf.group(ema_apply_op) # Ops/Summaries to run, and their names for logging assert norm_grads is not None run_ops = [ _train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, explained_variance, norm_grads ] names_ops = [ 'loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads' ] if self.trust_region: run_ops = run_ops + [ norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj ] names_ops = names_ops + [ 'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj' ] self.run_ops = run_ops self.names_ops = names_ops self.train_model = train_model self.step_model = step_model self.step = step_model.step self.proba_step = step_model.proba_step self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=self.sess) self.summary = tf.summary.merge_all()
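# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the library code). Two pieces of the graph
# above are easy to state in NumPy: the Q(retrace) recursion built by
# q_retrace(), and the trust-region projection applied to the gradient of the
# policy statistics (the gradient is shifted along kl_grad until k . g <= delta).
# Single-environment arrays and illustrative names are assumed throughout.
import numpy as np


def retrace_targets(rewards, dones, q_i, values, rho_i, gamma):
    """rewards, dones, q_i, rho_i: [n_steps]; values: [n_steps + 1] (bootstrap at the end)."""
    n_steps = len(rewards)
    rho_bar = np.minimum(1.0, rho_i)              # truncated importance weights
    qret = values[-1]                             # bootstrap from the last value estimate
    targets = np.zeros(n_steps)
    for i in reversed(range(n_steps)):
        qret = rewards[i] + gamma * qret * (1.0 - dones[i])
        targets[i] = qret
        # blend back towards the critic before moving one step earlier in time
        qret = rho_bar[i] * (qret - q_i[i]) + values[i]
    return targets


def trust_region_adjust(grad, kl_grad, delta, eps=1e-6):
    """Shift grad ([batch, n_act]) along kl_grad so the linearized KL stays below delta."""
    k_dot_g = np.sum(kl_grad * grad, axis=-1)
    adj = np.maximum(0.0, (k_dot_g - delta) / (np.sum(np.square(kl_grad), axis=-1) + eps))
    return grad - adj[:, None] * kl_grad


if __name__ == "__main__":
    rng = np.random.default_rng(1)
    targets = retrace_targets(rewards=np.ones(5), dones=np.zeros(5), q_i=rng.normal(size=5),
                              values=rng.normal(size=6), rho_i=np.abs(rng.normal(size=5)), gamma=0.99)
    adjusted = trust_region_adjust(rng.normal(size=(5, 3)), rng.normal(size=(5, 3)), delta=1.0)
    print(np.round(targets, 3), adjusted.shape)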
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="PPO1", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO1 model must be " \ "an instance of common.policies.ActorCriticPolicy." with self.sess.as_default(): self.adam.sync() callback.on_training_start(locals(), globals()) # Prepare for rollouts seg_gen = traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_actorbatch, callback=callback) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 t_start = time.time() # rolling buffer for episode lengths len_buffer = deque(maxlen=100) # rolling buffer for episode rewards reward_buffer = deque(maxlen=100) while True: if timesteps_so_far >= total_timesteps: break if self.schedule == 'constant': cur_lrmult = 1.0 elif self.schedule == 'linear': cur_lrmult = max( 1.0 - float(timesteps_so_far) / total_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() # Stop training early (triggered by the callback) if not seg.get('continue_training', True): # pytype: disable=attribute-error break add_vtarg_and_adv(seg, self.gamma, self.lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) observations, actions = seg["observations"], seg["actions"] atarg, tdlamret = seg["adv"], seg["tdlamret"] # true_rew is the reward without discount if writer is not None: total_episode_reward_logger( self.episode_reward, seg["true_rewards"].reshape( (self.n_envs, -1)), seg["dones"].reshape( (self.n_envs, -1)), writer, self.num_timesteps) # predicted value function before update vpredbefore = seg["vpred"] # standardized advantage function estimate atarg = (atarg - atarg.mean()) / atarg.std() dataset = Dataset(dict(ob=observations, ac=actions, atarg=atarg, vtarg=tdlamret), shuffle=not self.policy.recurrent) optim_batchsize = self.optim_batchsize or observations.shape[ 0] # set old parameter values to new parameter values self.assign_old_eq_new(sess=self.sess) logger.log("Optimizing...") logger.log(fmt_row(13, self.loss_names)) # Here we do a bunch of optimization epochs over the data for k in range(self.optim_epochs): # list of tuples, each of which gives the loss for a minibatch losses = [] for i, batch in enumerate( dataset.iterate_once(optim_batchsize)): steps = ( self.num_timesteps + k * optim_batchsize + int(i * (optim_batchsize / len(dataset.data_map)))) if writer is not None: # run loss backprop with summary, but once every 10 runs save the metadata # (memory, compute time, ...)
if self.full_tensorboard_log and (1 + k) % 10 == 0: run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, grad, *newlosses = self.lossandgrad( batch["ob"], batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % steps) else: summary, grad, *newlosses = self.lossandgrad( batch["ob"], batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, sess=self.sess) writer.add_summary(summary, steps) else: _, grad, *newlosses = self.lossandgrad( batch["ob"], batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, sess=self.sess) self.adam.update(grad, self.optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in dataset.iterate_once(optim_batchsize): newlosses = self.compute_losses(batch["ob"], batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, sess=self.sess) losses.append(newlosses) mean_losses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, mean_losses)) for (loss_val, name) in zipsame(mean_losses, self.loss_names): logger.record_tabular("loss_" + name, loss_val) logger.record_tabular( "ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) # local values lrlocal = (seg["ep_lens"], seg["ep_rets"]) # list of tuples listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) lens, rews = map(flatten_lists, zip(*listoflrpairs)) len_buffer.extend(lens) reward_buffer.extend(rews) if len(len_buffer) > 0: logger.record_tabular("EpLenMean", np.mean(len_buffer)) logger.record_tabular("EpRewMean", np.mean(reward_buffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) current_it_timesteps = MPI.COMM_WORLD.allreduce( seg["total_timestep"]) timesteps_so_far += current_it_timesteps self.num_timesteps += current_it_timesteps iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", self.num_timesteps) logger.record_tabular("TimeElapsed", time.time() - t_start) if self.verbose >= 1 and MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() callback.on_training_end() return self
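# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the library code). add_vtarg_and_adv() above
# fills seg["adv"] and seg["tdlamret"] with GAE(lambda) estimates, the
# advantages are then standardized, and the explained variance of the old value
# predictions is logged. The NumPy versions below use a done-mask formulation
# of GAE (the library keys off episode starts instead) and illustrative names.
import numpy as np


def gae(rewards, values, dones, last_value, gamma, lam):
    """Returns (advantages, value targets) for one trajectory segment."""
    n = len(rewards)
    values_ext = np.append(values, last_value)
    advantages = np.zeros(n)
    lastgaelam = 0.0
    for t in reversed(range(n)):
        nonterminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * values_ext[t + 1] * nonterminal - values_ext[t]
        lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
        advantages[t] = lastgaelam
    return advantages, advantages + values


def explained_variance_np(y_pred, y_true):
    """1 - Var(y_true - y_pred) / Var(y_true); 1 is perfect, 0 matches predicting the mean."""
    var_y = np.var(y_true)
    return np.nan if var_y == 0 else 1.0 - np.var(y_true - y_pred) / var_y


if __name__ == "__main__":
    rng = np.random.default_rng(2)
    rewards, values, dones = rng.normal(size=8), rng.normal(size=8), np.zeros(8)
    adv, tdlamret = gae(rewards, values, dones, last_value=0.0, gamma=0.99, lam=0.95)
    adv = (adv - adv.mean()) / adv.std()                  # standardization used above
    print(np.round(adv, 3), round(explained_variance_np(values, tdlamret), 3))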
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACER", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() self.learning_rate_schedule = Scheduler( initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) episode_stats = EpisodeStats(self.n_steps, self.n_envs) if self.replay_ratio > 0: buffer = Buffer(env=self.env, n_steps=self.n_steps, size=self.buffer_size) else: buffer = None t_start = time.time() callback.on_training_start(locals(), globals()) # n_batch samples, 1 on_policy call and multiple off-policy calls for steps in range(0, total_timesteps, self.n_batch): callback.on_rollout_start() enc_obs, obs, actions, rewards, mus, dones, masks = self.runner.run( callback) callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break episode_stats.feed(rewards, dones) if buffer is not None: buffer.put(enc_obs, actions, rewards, mus, dones, masks) if writer is not None: total_episode_reward_logger( self.episode_reward, rewards.reshape((self.n_envs, self.n_steps)), dones.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) # reshape stuff correctly obs = obs.reshape(self.runner.batch_ob_shape) actions = actions.reshape([self.n_batch]) rewards = rewards.reshape([self.n_batch]) mus = mus.reshape([self.n_batch, self.n_act]) dones = dones.reshape([self.n_batch]) masks = masks.reshape([self.runner.batch_ob_shape[0]]) names_ops, values_ops = self._train_step( obs, actions, rewards, dones, mus, self.initial_state, masks, self.num_timesteps, writer) if self.verbose >= 1 and (int(steps / self.n_batch) % log_interval == 0): logger.record_tabular("total_timesteps", self.num_timesteps) logger.record_tabular("fps", int(steps / (time.time() - t_start))) # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, # not just at the terminal state. Thus, this is mean until end of life, not end of episode. # For true episode rewards, see the monitor files in the log folder. logger.record_tabular("mean_episode_length", episode_stats.mean_length()) logger.record_tabular("mean_episode_reward", episode_stats.mean_reward()) for name, val in zip(names_ops, values_ops): logger.record_tabular(name, float(val)) logger.dump_tabular() if (self.replay_ratio > 0 and buffer is not None and buffer.has_atleast(self.replay_start)): samples_number = np.random.poisson(self.replay_ratio) for _ in range(samples_number): # get obs, actions, rewards, mus, dones from buffer. obs, actions, rewards, mus, dones, masks = buffer.get() # reshape stuff correctly obs = obs.reshape(self.runner.batch_ob_shape) actions = actions.reshape([self.n_batch]) rewards = rewards.reshape([self.n_batch]) mus = mus.reshape([self.n_batch, self.n_act]) dones = dones.reshape([self.n_batch]) masks = masks.reshape([self.runner.batch_ob_shape[0]]) self._train_step(obs, actions, rewards, dones, mus, self.initial_state, masks, self.num_timesteps) callback.on_training_end() return self
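# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the library code). The loop above mixes one
# on-policy update per collected batch with a Poisson(replay_ratio) number of
# off-policy updates drawn from the replay buffer, once the buffer holds at
# least replay_start transitions. Minimal version of that scheduling decision
# (illustrative names, no actual training):
import numpy as np


def n_offpolicy_updates(replay_ratio, buffer_len, replay_start, rng):
    """How many replay-based gradient steps follow one on-policy batch."""
    if replay_ratio <= 0 or buffer_len < replay_start:
        return 0
    return int(rng.poisson(replay_ratio))


if __name__ == "__main__":
    rng = np.random.default_rng(3)
    counts = [n_offpolicy_updates(replay_ratio=4, buffer_len=20000,
                                  replay_start=10000, rng=rng) for _ in range(8)]
    print(counts)   # averages around replay_ratio off-policy steps per batch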
def setup_model(self): with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the ACKTR model must be " \ "an instance of common.policies.ActorCriticPolicy." # Enable continuous actions tricks (normalized advantage) self.continuous_actions = isinstance(self.action_space, Box) self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) n_batch_step = None n_batch_train = None if issubclass(self.policy, RecurrentActorCriticPolicy): n_batch_step = self.n_envs n_batch_train = self.n_envs * self.n_steps step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, n_batch_step, reuse=False, **self.policy_kwargs) self.params = params = tf_util.get_trainable_vars("model") with tf.variable_scope( "train_model", reuse=True, custom_getter=tf_util.outer_scope_getter( "train_model")): train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, self.n_steps, n_batch_train, reuse=True, **self.policy_kwargs) with tf.variable_scope( "loss", reuse=False, custom_getter=tf_util.outer_scope_getter("loss")): self.advs_ph = advs_ph = tf.placeholder(tf.float32, [None]) self.rewards_ph = rewards_ph = tf.placeholder( tf.float32, [None]) self.learning_rate_ph = learning_rate_ph = tf.placeholder( tf.float32, []) self.actions_ph = train_model.pdtype.sample_placeholder( [None]) neg_log_prob = train_model.proba_distribution.neglogp( self.actions_ph) # training loss pg_loss = tf.reduce_mean(advs_ph * neg_log_prob) self.entropy = entropy = tf.reduce_mean( train_model.proba_distribution.entropy()) self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy self.vf_loss = vf_loss = mse( tf.squeeze(train_model.value_fn), rewards_ph) train_loss = pg_loss + self.vf_coef * vf_loss # Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean( neg_log_prob) sample_net = train_model.value_fn + tf.random_normal( tf.shape(train_model.value_fn)) self.vf_fisher = vf_fisher_loss = -self.vf_fisher_coef * tf.reduce_mean( tf.pow( train_model.value_fn - tf.stop_gradient(sample_net), 2)) self.joint_fisher = pg_fisher_loss + vf_fisher_loss tf.summary.scalar('entropy_loss', self.entropy) tf.summary.scalar('policy_gradient_loss', pg_loss) tf.summary.scalar('policy_gradient_fisher_loss', pg_fisher_loss) tf.summary.scalar('value_function_loss', self.vf_loss) tf.summary.scalar('value_function_fisher_loss', vf_fisher_loss) tf.summary.scalar('loss', train_loss) self.grads_check = tf.gradients(train_loss, params) with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph)) if self.full_tensorboard_log: tf.summary.histogram('discounted_rewards', self.rewards_ph) tf.summary.histogram('learning_rate', self.learning_rate_ph) tf.summary.histogram('advantage', self.advs_ph) if tf_util.is_image(self.observation_space): tf.summary.image('observation', train_model.obs_ph) else: tf.summary.histogram('observation', train_model.obs_ph) with tf.variable_scope( "kfac", reuse=False, custom_getter=tf_util.outer_scope_getter("kfac")): with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer( learning_rate=learning_rate_ph, clip_kl=self.kfac_clip, momentum=0.9, kfac_update=self.kfac_update, epsilon=0.01, stats_decay=0.99, 
async_eigen_decomp=self.async_eigen_decomp, cold_iter=10, max_grad_norm=self.max_grad_norm, verbose=self.verbose) optim.compute_and_apply_stats(self.joint_fisher, var_list=params) self.train_model = train_model self.step_model = step_model self.step = step_model.step self.proba_step = step_model.proba_step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=self.sess) self.summary = tf.summary.merge_all()
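# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the library code). The KfacOptimizer built
# above preconditions gradients with a Kronecker-factored approximation of the
# Fisher matrix estimated from the joint_fisher loss. For a single dense layer
# with pre-activations s = W a, that approximation amounts to F ~= A (x) G with
# A = E[a a^T] and G = E[g g^T], so the natural-gradient step is
# G^-1 grad_W A^-1. The sketch below shows only that core preconditioning with
# plain damping; the library additionally uses running statistics, (optionally
# async) eigendecompositions, KL clipping and momentum, as configured above.
import numpy as np


def kfac_precondition(grad_w, acts, out_grads, damping=1e-2):
    """grad_w: [out, in] gradient of a dense layer; acts: [batch, in] layer inputs;
    out_grads: [batch, out] gradients w.r.t. the layer outputs.
    Returns the K-FAC preconditioned gradient (G + damping I)^-1 grad_w (A + damping I)^-1."""
    batch = acts.shape[0]
    a_factor = acts.T @ acts / batch                   # input-covariance factor A
    g_factor = out_grads.T @ out_grads / batch         # output-gradient factor G
    a_inv = np.linalg.inv(a_factor + damping * np.eye(a_factor.shape[0]))
    g_inv = np.linalg.inv(g_factor + damping * np.eye(g_factor.shape[0]))
    return g_inv @ grad_w @ a_inv


if __name__ == "__main__":
    rng = np.random.default_rng(4)
    acts, out_grads = rng.normal(size=(64, 8)), rng.normal(size=(64, 3))
    grad_w = out_grads.T @ acts / 64                   # plain gradient of a linear layer
    print(kfac_precondition(grad_w, acts, out_grads).shape)   # (3, 8)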