def setup_model(self): with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO2 model must be " \ "an instance of common.policies.ActorCriticPolicy." self.n_batch = self.n_envs * self.n_steps self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) n_batch_step = None n_batch_train = None if issubclass(self.policy, RecurrentActorCriticPolicy): assert self.n_envs % self.nminibatches == 0, "For recurrent policies, "\ "the number of environments run in parallel should be a multiple of nminibatches." n_batch_step = self.n_envs n_batch_train = self.n_batch // self.nminibatches act_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, n_batch_step, reuse=False, **self.policy_kwargs) with tf.variable_scope( "train_model", reuse=True, custom_getter=tf_util.outer_scope_getter( "train_model")): train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs // self.nminibatches, self.n_steps, n_batch_train, reuse=True, **self.policy_kwargs) with tf.variable_scope("loss", reuse=False): self.action_ph = train_model.pdtype.sample_placeholder( [None], name="action_ph") self.advs_ph = tf.placeholder(tf.float32, [None], name="advs_ph") self.rewards_ph = tf.placeholder(tf.float32, [None], name="rewards_ph") self.old_neglog_pac_ph = tf.placeholder( tf.float32, [None], name="old_neglog_pac_ph") self.old_vpred_ph = tf.placeholder(tf.float32, [None], name="old_vpred_ph") self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") self.clip_range_ph = tf.placeholder(tf.float32, [], name="clip_range_ph") neglogpac = train_model.proba_distribution.neglogp( self.action_ph) self.entropy = tf.reduce_mean( train_model.proba_distribution.entropy()) vpred = train_model.value_flat # Value function clipping: not present in the original PPO if self.cliprange_vf is None: # Default behavior (legacy from OpenAI baselines): # use the same clipping as for the policy self.clip_range_vf_ph = self.clip_range_ph self.cliprange_vf = self.cliprange elif isinstance(self.cliprange_vf, (float, int)) and self.cliprange_vf < 0: # Original PPO implementation: no value function clipping self.clip_range_vf_ph = None else: # Last possible behavior: clipping range # specific to the value function self.clip_range_vf_ph = tf.placeholder( tf.float32, [], name="clip_range_vf_ph") if self.clip_range_vf_ph is None: # No clipping vpred_clipped = train_model.value_flat else: # Clip the different between old and new value # NOTE: this depends on the reward scaling vpred_clipped = self.old_vpred_ph + \ tf.clip_by_value(train_model.value_flat - self.old_vpred_ph, - self.clip_range_vf_ph, self.clip_range_vf_ph) vf_losses1 = tf.square(vpred - self.rewards_ph) vf_losses2 = tf.square(vpred_clipped - self.rewards_ph) self.vf_loss = .5 * tf.reduce_mean( tf.maximum(vf_losses1, vf_losses2)) ratio = tf.exp(self.old_neglog_pac_ph - neglogpac) pg_losses = -self.advs_ph * ratio pg_losses2 = -self.advs_ph * tf.clip_by_value( ratio, 1.0 - self.clip_range_ph, 1.0 + self.clip_range_ph) self.pg_loss = tf.reduce_mean( tf.maximum(pg_losses, pg_losses2)) self.approxkl = .5 * tf.reduce_mean( tf.square(neglogpac - self.old_neglog_pac_ph)) self.clipfrac = tf.reduce_mean( tf.cast( tf.greater(tf.abs(ratio - 1.0), self.clip_range_ph), tf.float32)) loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef tf.summary.scalar('entropy_loss', self.entropy) tf.summary.scalar('policy_gradient_loss', self.pg_loss) tf.summary.scalar('value_function_loss', self.vf_loss) tf.summary.scalar('approximate_kullback-leibler', self.approxkl) tf.summary.scalar('clip_factor', self.clipfrac) tf.summary.scalar('loss', loss) with tf.variable_scope('model'): self.params = tf.trainable_variables() if self.full_tensorboard_log: for var in self.params: tf.summary.histogram(var.name, var) grads = tf.gradients(loss, self.params) if self.max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm( grads, self.max_grad_norm) grads = list(zip(grads, self.params)) trainer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph, epsilon=1e-5) self._train = trainer.apply_gradients(grads) self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph)) tf.summary.scalar('clip_range', tf.reduce_mean(self.clip_range_ph)) if self.clip_range_vf_ph is not None: tf.summary.scalar( 'clip_range_vf', tf.reduce_mean(self.clip_range_vf_ph)) tf.summary.scalar('old_neglog_action_probability', tf.reduce_mean(self.old_neglog_pac_ph)) tf.summary.scalar('old_value_pred', tf.reduce_mean(self.old_vpred_ph)) if self.full_tensorboard_log: tf.summary.histogram('discounted_rewards', self.rewards_ph) tf.summary.histogram('learning_rate', self.learning_rate_ph) tf.summary.histogram('advantage', self.advs_ph) tf.summary.histogram('clip_range', self.clip_range_ph) tf.summary.histogram('old_neglog_action_probability', self.old_neglog_pac_ph) tf.summary.histogram('old_value_pred', self.old_vpred_ph) if tf_util.is_image(self.observation_space): tf.summary.image('observation', train_model.obs_ph) else: tf.summary.histogram('observation', train_model.obs_ph) self.train_model = train_model self.act_model = act_model self.step = act_model.step self.proba_step = act_model.proba_step self.value = act_model.value self.initial_state = act_model.initial_state tf.global_variables_initializer().run(session=self.sess) # pylint: disable=E1101 self.summary = tf.summary.merge_all()
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="ACKTR", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) self.n_batch = self.n_envs * self.n_steps self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) # FIFO queue of the q_runner thread is closed at the end of the learn function. # As a result, it needs to be redefinied at every call with self.graph.as_default(): with tf.variable_scope("kfac_apply", reuse=self.trained, custom_getter=tf_util.outer_scope_getter("kfac_apply")): # Some of the variables are not in a scope when they are create # so we make a note of any previously uninitialized variables tf_vars = tf.global_variables() is_uninitialized = self.sess.run([tf.is_variable_initialized(var) for var in tf_vars]) old_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized) if not f] self.train_op, self.q_runner = self.optim.apply_gradients(list(zip(self.grads_check, self.params))) # then we check for new uninitialized variables and initialize them tf_vars = tf.global_variables() is_uninitialized = self.sess.run([tf.is_variable_initialized(var) for var in tf_vars]) new_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized) if not f and v not in old_uninitialized_vars] if len(new_uninitialized_vars) != 0: self.sess.run(tf.variables_initializer(new_uninitialized_vars)) self.trained = True runner = A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma) self.episode_reward = np.zeros((self.n_envs,)) t_start = time.time() coord = tf.train.Coordinator() if self.q_runner is not None: enqueue_threads = self.q_runner.create_threads(self.sess, coord=coord, start=True) else: enqueue_threads = [] # Training stats (when using Monitor wrapper) ep_info_buf = deque(maxlen=100) for update in range(1, total_timesteps // self.n_batch + 1): # true_reward is the reward without discount obs, states, rewards, masks, actions, values, ep_infos, true_reward = runner.run() ep_info_buf.extend(ep_infos) policy_loss, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values, self.num_timesteps // (self.n_batch + 1), writer) n_seconds = time.time() - t_start fps = int((update * self.n_batch) / n_seconds) if writer is not None: self.episode_reward = total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", self.num_timesteps) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("policy_loss", float(policy_loss)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(explained_var)) if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0: logger.logkv('ep_reward_mean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) logger.logkv('ep_len_mean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) logger.dump_tabular() self.num_timesteps += self.n_batch + 1 coord.request_stop() coord.join(enqueue_threads) return self
def build_train(q_func, ob_space, ac_space, optimizer, sess, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None, full_tensorboard_log=False): """ Creates the train function: :param q_func: (DQNPolicy) the policy :param ob_space: (Gym Space) The observation space of the environment :param ac_space: (Gym Space) The action space of the environment :param reuse: (bool) whether or not to reuse the graph variables :param optimizer: (tf.train.Optimizer) optimizer to use for the Q-learning objective. :param sess: (TensorFlow session) The current TensorFlow session :param grad_norm_clipping: (float) clip gradient norms to this value. If None no clipping is performed. :param gamma: (float) discount rate. :param double_q: (bool) if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. :param scope: (str or VariableScope) optional scope for variable_scope. :param reuse: (bool) whether or not the variables should be reused. To be able to reuse the scope must be given. :param param_noise: (bool) whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) :param param_noise_filter_func: (function (TensorFlow Tensor): bool) function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. :param full_tensorboard_log: (bool) enable additional logging when using tensorboard WARNING: this logging can take a lot of space quickly :return: (tuple) act: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) function to select and action given observation. See the top of the file for details. train: (function (Any, numpy float, numpy float, Any, numpy bool, numpy float): numpy float) optimize the error in Bellman's equation. See the top of the file for details. update_target: (function) copy the parameters from optimized Q function to the target Q function. See the top of the file for details. step_model: (DQNPolicy) Policy for evaluation """ n_actions = ac_space.nvec if isinstance(ac_space, MultiDiscrete) else ac_space.n with tf.variable_scope("input", reuse=reuse): stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic") update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps") with tf.variable_scope(scope, reuse=reuse): if param_noise: act_f, obs_phs = build_act_with_param_noise( q_func, ob_space, ac_space, stochastic_ph, update_eps_ph, sess, param_noise_filter_func=param_noise_filter_func) else: act_f, obs_phs = build_act(q_func, ob_space, ac_space, stochastic_ph, update_eps_ph, sess) # q network evaluation with tf.variable_scope( "step_model", reuse=True, custom_getter=tf_util.outer_scope_getter("step_model")): step_model = q_func(sess, ob_space, ac_space, 1, 1, None, reuse=True, obs_phs=obs_phs) q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/model") # target q network evaluation with tf.variable_scope("target_q_func", reuse=False): target_policy = q_func(sess, ob_space, ac_space, 1, 1, None, reuse=False) target_q_func_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") # compute estimate of best possible value starting from state at t + 1 double_q_values = None double_obs_ph = target_policy.obs_ph if double_q: with tf.variable_scope( "double_q", reuse=True, custom_getter=tf_util.outer_scope_getter("double_q")): double_policy = q_func(sess, ob_space, ac_space, 1, 1, None, reuse=True) double_q_values = double_policy.q_values double_obs_ph = double_policy.obs_ph with tf.variable_scope("loss", reuse=reuse): # set up placeholders act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # q scores for actions which we know were selected in the given state. q_t_selected = tf.reduce_sum(step_model.q_values * tf.one_hot(act_t_ph, n_actions), axis=1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_best_using_online_net = tf.argmax(double_q_values, axis=1) q_tp1_best = tf.reduce_sum( target_policy.q_values * tf.one_hot(q_tp1_best_using_online_net, n_actions), axis=1) else: q_tp1_best = tf.reduce_max(target_policy.q_values, axis=1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = tf_util.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) tf.summary.scalar("td_error", tf.reduce_mean(td_error)) tf.summary.scalar("loss", weighted_error) if full_tensorboard_log: tf.summary.histogram("td_error", td_error) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # compute optimization op (potentially with gradient clipping) gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars) if grad_norm_clipping is not None: for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('rewards', tf.reduce_mean(rew_t_ph)) tf.summary.scalar('importance_weights', tf.reduce_mean(importance_weights_ph)) if full_tensorboard_log: tf.summary.histogram('rewards', rew_t_ph) tf.summary.histogram('importance_weights', importance_weights_ph) if tf_util.is_image(obs_phs[0]): tf.summary.image('observation', obs_phs[0]) elif len(obs_phs[0].shape) == 1: tf.summary.histogram('observation', obs_phs[0]) optimize_expr = optimizer.apply_gradients(gradients) summary = tf.summary.merge_all() # Create callable functions train = tf_util.function(inputs=[ obs_phs[0], act_t_ph, rew_t_ph, target_policy.obs_ph, double_obs_ph, done_mask_ph, importance_weights_ph ], outputs=[summary, td_error], updates=[optimize_expr]) update_target = tf_util.function([], [], updates=[update_target_expr]) return act_f, train, update_target, step_model
def setup_model(self): with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO2 model must be " \ "an instance of common.policies.ActorCriticPolicy." self.n_batch = self.n_envs * self.n_steps n_cpu = multiprocessing.cpu_count() if sys.platform == 'darwin': n_cpu //= 2 self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(num_cpu=n_cpu, graph=self.graph) n_batch_step = None n_batch_train = None if issubclass(self.policy, LstmPolicy): assert self.n_envs % self.nminibatches == 0, "For recurrent policies, "\ "the number of environments run in parallel should be a multiple of nminibatches." n_batch_step = self.n_envs n_batch_train = self.n_batch // self.nminibatches act_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, n_batch_step, reuse=False) with tf.variable_scope( "train_model", reuse=True, custom_getter=tf_util.outer_scope_getter( "train_model")): train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs // self.nminibatches, self.n_steps, n_batch_train, reuse=True) with tf.variable_scope("loss", reuse=False): self.action_ph = train_model.pdtype.sample_placeholder( [None], name="action_ph") self.advs_ph = tf.placeholder(tf.float32, [None], name="advs_ph") self.rewards_ph = tf.placeholder(tf.float32, [None], name="rewards_ph") self.old_neglog_pac_ph = tf.placeholder( tf.float32, [None], name="old_neglog_pac_ph") self.old_vpred_ph = tf.placeholder(tf.float32, [None], name="old_vpred_ph") self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") self.clip_range_ph = tf.placeholder(tf.float32, [], name="clip_range_ph") neglogpac = train_model.proba_distribution.neglogp( self.action_ph) self.entropy = tf.reduce_mean( train_model.proba_distribution.entropy()) vpred = train_model._value vpredclipped = self.old_vpred_ph + tf.clip_by_value( train_model._value - self.old_vpred_ph, -self.clip_range_ph, self.clip_range_ph) vf_losses1 = tf.square(vpred - self.rewards_ph) vf_losses2 = tf.square(vpredclipped - self.rewards_ph) self.vf_loss = .5 * tf.reduce_mean( tf.maximum(vf_losses1, vf_losses2)) ratio = tf.exp(self.old_neglog_pac_ph - neglogpac) pg_losses = -self.advs_ph * ratio pg_losses2 = -self.advs_ph * tf.clip_by_value( ratio, 1.0 - self.clip_range_ph, 1.0 + self.clip_range_ph) self.pg_loss = tf.reduce_mean( tf.maximum(pg_losses, pg_losses2)) self.approxkl = .5 * tf.reduce_mean( tf.square(neglogpac - self.old_neglog_pac_ph)) self.clipfrac = tf.reduce_mean( tf.to_float( tf.greater(tf.abs(ratio - 1.0), self.clip_range_ph))) loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef tf.summary.scalar('entropy_loss', self.entropy) tf.summary.scalar('policy_gradient_loss', self.pg_loss) tf.summary.scalar('value_function_loss', self.vf_loss) tf.summary.scalar('approximate_kullback-leiber', self.approxkl) tf.summary.scalar('clip_factor', self.clipfrac) tf.summary.scalar('loss', loss) with tf.variable_scope('model'): self.params = tf.trainable_variables() grads = tf.gradients(loss, self.params) if self.max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm( grads, self.max_grad_norm) grads = list(zip(grads, self.params)) trainer = tf.train.AdamOptimizer( learning_rate=self.learning_rate_ph, epsilon=1e-5) self._train = trainer.apply_gradients(grads) self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph)) tf.summary.histogram('discounted_rewards', self.rewards_ph) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) tf.summary.histogram('learning_rate', self.learning_rate_ph) tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph)) tf.summary.histogram('advantage', self.advs_ph) tf.summary.scalar('clip_range', tf.reduce_mean(self.clip_range_ph)) tf.summary.histogram('clip_range', self.clip_range_ph) tf.summary.scalar('old_neglog_action_probabilty', tf.reduce_mean(self.old_neglog_pac_ph)) tf.summary.histogram('old_neglog_action_probabilty', self.old_neglog_pac_ph) tf.summary.scalar('old_value_pred', tf.reduce_mean(self.old_vpred_ph)) tf.summary.histogram('old_value_pred', self.old_vpred_ph) if len(self.observation_space.shape) == 3: tf.summary.image('observation', train_model.obs_ph) else: tf.summary.histogram('observation', train_model.obs_ph) self.train_model = train_model self.act_model = act_model self.step = act_model.step self.proba_step = act_model.proba_step self.value = act_model.value self.initial_state = act_model.initial_state tf.global_variables_initializer().run(session=self.sess) # pylint: disable=E1101 self.summary = tf.summary.merge_all()
def setup_model(self): with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the ACKTR model must be " \ "an instance of common.policies.ActorCriticPolicy." if isinstance(self.action_space, Box): raise NotImplementedError("WIP: ACKTR does not support Continuous actions yet.") self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(num_cpu=self.nprocs, graph=self.graph) n_batch_step = None n_batch_train = None if issubclass(self.policy, RecurrentActorCriticPolicy): n_batch_step = self.n_envs n_batch_train = self.n_envs * self.n_steps self.model = step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, n_batch_step, reuse=False, **self.policy_kwargs) self.params = params = find_trainable_variables("model") with tf.variable_scope("train_model", reuse=True, custom_getter=tf_util.outer_scope_getter("train_model")): self.model2 = train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, self.n_steps, n_batch_train, reuse=True, **self.policy_kwargs) with tf.variable_scope("loss", reuse=False, custom_getter=tf_util.outer_scope_getter("loss")): self.advs_ph = advs_ph = tf.placeholder(tf.float32, [None]) self.rewards_ph = rewards_ph = tf.placeholder(tf.float32, [None]) self.pg_lr_ph = pg_lr_ph = tf.placeholder(tf.float32, []) self.action_ph = action_ph = train_model.pdtype.sample_placeholder([None]) logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.policy, labels=action_ph) self.logits = train_model.policy # training loss pg_loss = tf.reduce_mean(advs_ph * logpac) self.entropy = entropy = tf.reduce_mean(calc_entropy(train_model.policy)) self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy self.vf_loss = vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph) train_loss = pg_loss + self.vf_coef * vf_loss # Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac) sample_net = train_model.value_fn + tf.random_normal(tf.shape(train_model.value_fn)) self.vf_fisher = vf_fisher_loss = - self.vf_fisher_coef * tf.reduce_mean( tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2)) self.joint_fisher = pg_fisher_loss + vf_fisher_loss tf.summary.scalar('entropy_loss', self.entropy) tf.summary.scalar('policy_gradient_loss', pg_loss) tf.summary.scalar('policy_gradient_fisher_loss', pg_fisher_loss) tf.summary.scalar('value_function_loss', self.vf_loss) tf.summary.scalar('value_function_fisher_loss', vf_fisher_loss) tf.summary.scalar('loss', train_loss) self.grads_check = tf.gradients(train_loss, params) with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.pg_lr_ph)) tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph)) if self.full_tensorboard_log: tf.summary.histogram('discounted_rewards', self.rewards_ph) tf.summary.histogram('learning_rate', self.pg_lr_ph) tf.summary.histogram('advantage', self.advs_ph) if tf_util.is_image(self.observation_space): tf.summary.image('observation', train_model.obs_ph) else: tf.summary.histogram('observation', train_model.obs_ph) with tf.variable_scope("kfac", reuse=False, custom_getter=tf_util.outer_scope_getter("kfac")): with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer(learning_rate=pg_lr_ph, clip_kl=self.kfac_clip, momentum=0.9, kfac_update=1, epsilon=0.01, stats_decay=0.99, async_eigen_decomp=self.async_eigen_decomp, cold_iter=10, max_grad_norm=self.max_grad_norm, verbose=self.verbose) optim.compute_and_apply_stats(self.joint_fisher, var_list=params) self.train_model = train_model self.step_model = step_model self.step = step_model.step self.proba_step = step_model.proba_step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=self.sess) self.summary = tf.summary.merge_all()
def setup_model(self): with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the A2C model must be an " \ "instance of common.policies.ActorCriticPolicy." self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(graph=self.graph) self.n_batch = self.n_envs * self.n_steps n_batch_step = None n_batch_train = None if issubclass(self.policy, RecurrentActorCriticPolicy): n_batch_step = self.n_envs n_batch_train = self.n_envs * self.n_steps step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, n_batch_step, reuse=False, **self.policy_kwargs) with tf.variable_scope("train_model", reuse=True, custom_getter=tf_util.outer_scope_getter("train_model")): train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, self.n_steps, n_batch_train, reuse=True, **self.policy_kwargs) with tf.variable_scope("loss", reuse=False): self.actions_ph = train_model.pdtype.sample_placeholder([None], name="action_ph") self.advs_ph = tf.placeholder(tf.float32, [None], name="advs_ph") self.rewards_ph = tf.placeholder(tf.float32, [None], name="rewards_ph") self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph") neglogpac = train_model.proba_distribution.neglogp(self.actions_ph) self.entropy = tf.reduce_mean(train_model.proba_distribution.entropy()) self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac) self.vf_loss = mse(tf.squeeze(train_model.value_flat), self.rewards_ph) # https://arxiv.org/pdf/1708.04782.pdf#page=9, https://arxiv.org/pdf/1602.01783.pdf#page=4 # and https://github.com/dennybritz/reinforcement-learning/issues/34 # suggest to add an entropy component in order to improve exploration. loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef tf.summary.scalar('entropy_loss', self.entropy) tf.summary.scalar('policy_gradient_loss', self.pg_loss) tf.summary.scalar('value_function_loss', self.vf_loss) tf.summary.scalar('loss', loss) self.params = tf_util.get_trainable_vars("model") grads = tf.gradients(loss, self.params) if self.max_grad_norm is not None: grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm) grads = list(zip(grads, self.params)) with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph)) if self.full_tensorboard_log: tf.summary.histogram('discounted_rewards', self.rewards_ph) tf.summary.histogram('learning_rate', self.learning_rate_ph) tf.summary.histogram('advantage', self.advs_ph) if tf_util.is_image(self.observation_space): tf.summary.image('observation', train_model.obs_ph) else: tf.summary.histogram('observation', train_model.obs_ph) trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.alpha, epsilon=self.epsilon) self.apply_backprop = trainer.apply_gradients(grads) self.train_model = train_model self.step_model = step_model self.step = step_model.step self.proba_step = step_model.proba_step self.all_step = step_model.all_step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=self.sess) self.summary = tf.summary.merge_all()
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACKTR", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() self.n_batch = self.n_envs * self.n_steps self.learning_rate_schedule = Scheduler( initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) # FIFO queue of the q_runner thread is closed at the end of the learn function. # As a result, it needs to be redefinied at every call with self.graph.as_default(): with tf.variable_scope( "kfac_apply", reuse=self.trained, custom_getter=tf_util.outer_scope_getter( "kfac_apply")): # Some of the variables are not in a scope when they are create # so we make a note of any previously uninitialized variables tf_vars = tf.global_variables() is_uninitialized = self.sess.run( [tf.is_variable_initialized(var) for var in tf_vars]) old_uninitialized_vars = [ v for (v, f) in zip(tf_vars, is_uninitialized) if not f ] self.train_op, self.q_runner = self.optim.apply_gradients( list(zip(self.grads_check, self.params))) # then we check for new uninitialized variables and initialize them tf_vars = tf.global_variables() is_uninitialized = self.sess.run( [tf.is_variable_initialized(var) for var in tf_vars]) new_uninitialized_vars = [ v for (v, f) in zip(tf_vars, is_uninitialized) if not f and v not in old_uninitialized_vars ] if len(new_uninitialized_vars) != 0: self.sess.run( tf.variables_initializer(new_uninitialized_vars)) self.trained = True t_start = time.time() coord = tf.train.Coordinator() if self.q_runner is not None: enqueue_threads = self.q_runner.create_threads(self.sess, coord=coord, start=True) else: enqueue_threads = [] callback.on_training_start(locals(), globals()) for update in range(1, total_timesteps // self.n_batch + 1): callback.on_rollout_start() # pytype:disable=bad-unpacking # true_reward is the reward without discount if isinstance(self.runner, PPO2Runner): # We are using GAE rollout = self.runner.run(callback) obs, returns, masks, actions, values, _, states, ep_infos, true_reward = rollout else: rollout = self.runner.run(callback) obs, states, returns, masks, actions, values, ep_infos, true_reward = rollout # pytype:enable=bad-unpacking callback.on_rollout_end() # Early stopping due to the callback if not self.runner.continue_training: break self.ep_info_buf.extend(ep_infos) policy_loss, value_loss, policy_entropy = self._train_step( obs, states, returns, masks, actions, values, self.num_timesteps // (self.n_batch + 1), writer) n_seconds = time.time() - t_start fps = int((update * self.n_batch) / n_seconds) if writer is not None: total_episode_reward_logger( self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", self.num_timesteps) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("policy_loss", float(policy_loss)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(explained_var)) if len(self.ep_info_buf) > 0 and len( self.ep_info_buf[0]) > 0: logger.logkv( 'ep_reward_mean', safe_mean([ ep_info['r'] for ep_info in self.ep_info_buf ])) logger.logkv( 'ep_len_mean', safe_mean([ ep_info['l'] for ep_info in self.ep_info_buf ])) logger.dump_tabular() coord.request_stop() coord.join(enqueue_threads) callback.on_training_end() return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="ACKTR"): with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: self._setup_learn(seed) self.n_batch = self.n_envs * self.n_steps self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) # FIFO queue of the q_runner thread is closed at the end of the learn function. # As a result, it needs to be redefinied at every call with self.graph.as_default(): with tf.variable_scope("kfac_apply", reuse=self.trained, custom_getter=tf_util.outer_scope_getter("kfac_apply")): # Some of the variables are not in a scope when they are create # so we make a note of any previously uninitialized variables tf_vars = tf.global_variables() is_uninitialized = self.sess.run([tf.is_variable_initialized(var) for var in tf_vars]) old_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized) if not f] self.train_op, self.q_runner = self.optim.apply_gradients(list(zip(self.grads_check, self.params))) # then we check for new uninitialized variables and initialize them tf_vars = tf.global_variables() is_uninitialized = self.sess.run([tf.is_variable_initialized(var) for var in tf_vars]) new_uninitialized_vars = [v for (v, f) in zip(tf_vars, is_uninitialized) if not f and v not in old_uninitialized_vars] if len(new_uninitialized_vars) != 0: self.sess.run(tf.variables_initializer(new_uninitialized_vars)) self.trained = True runner = A2CRunner(self.env, self, n_steps=self.n_steps, gamma=self.gamma) self.episode_reward = np.zeros((self.n_envs,)) t_start = time.time() coord = tf.train.Coordinator() enqueue_threads = self.q_runner.create_threads(self.sess, coord=coord, start=True) for update in range(1, total_timesteps // self.n_batch + 1): # true_reward is the reward without discount obs, states, rewards, masks, actions, values, true_reward = runner.run() policy_loss, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values, update, writer) n_seconds = time.time() - t_start fps = int((update * self.n_batch) / n_seconds) if writer is not None: self.episode_reward = total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), writer, update * (self.n_batch + 1)) if callback is not None: callback(locals(), globals()) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", update * self.n_batch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("policy_loss", float(policy_loss)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(explained_var)) logger.dump_tabular() coord.request_stop() coord.join(enqueue_threads) return self
def setup_model(self): with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the ACER model must be " \ "an instance of common.policies.ActorCriticPolicy." if isinstance(self.action_space, Discrete): self.n_act = self.action_space.n continuous = False elif isinstance(self.action_space, Box): # self.n_act = self.action_space.shape[-1] # continuous = True raise NotImplementedError( "WIP: Acer does not support Continuous actions yet.") else: raise ValueError( "Error: ACER does not work with {} actions space.".format( self.action_space)) self.n_batch = self.n_envs * self.n_steps self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.set_random_seed(self.seed) n_batch_step = None if issubclass(self.policy, RecurrentActorCriticPolicy): n_batch_step = self.n_envs n_batch_train = self.n_envs * (self.n_steps + 1) step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, n_batch_step, reuse=False, **self.policy_kwargs) self.params = tf_util.get_trainable_vars("model") with tf.variable_scope( "train_model", reuse=True, custom_getter=tf_util.outer_scope_getter( "train_model")): train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, self.n_steps + 1, n_batch_train, reuse=True, **self.policy_kwargs) with tf.variable_scope("moving_average"): # create averaged model ema = tf.train.ExponentialMovingAverage(self.alpha) ema_apply_op = ema.apply(self.params) def custom_getter(getter, name, *args, **kwargs): name = name.replace("polyak_model/", "") val = ema.average(getter(name, *args, **kwargs)) return val with tf.variable_scope("polyak_model", reuse=True, custom_getter=custom_getter): self.polyak_model = polyak_model = self.policy( self.sess, self.observation_space, self.action_space, self.n_envs, self.n_steps + 1, self.n_envs * (self.n_steps + 1), reuse=True, **self.policy_kwargs) with tf.variable_scope("loss", reuse=False): self.done_ph = tf.placeholder(tf.float32, [self.n_batch]) # dones self.reward_ph = tf.placeholder( tf.float32, [self.n_batch]) # rewards, not returns self.mu_ph = tf.placeholder( tf.float32, [self.n_batch, self.n_act]) # mu's self.action_ph = train_model.pdtype.sample_placeholder( [self.n_batch]) self.learning_rate_ph = tf.placeholder(tf.float32, []) eps = 1e-6 # Notation: (var) = batch variable, (var)s = sequence variable, # (var)_i = variable index by action at step i # shape is [n_envs * (n_steps + 1)] if continuous: value = train_model.value_flat else: value = tf.reduce_sum(train_model.policy_proba * train_model.q_value, axis=-1) rho, rho_i_ = None, None if continuous: action_ = strip( train_model.proba_distribution.sample(), self.n_envs, self.n_steps) distribution_f = tf.contrib.distributions.MultivariateNormalDiag( loc=strip(train_model.proba_distribution.mean, self.n_envs, self.n_steps), scale_diag=strip( train_model.proba_distribution.logstd, self.n_envs, self.n_steps)) f_polyak = tf.contrib.distributions.MultivariateNormalDiag( loc=strip(polyak_model.proba_distribution.mean, self.n_envs, self.n_steps), scale_diag=strip( polyak_model.proba_distribution.logstd, self.n_envs, self.n_steps)) f_i = distribution_f.prob(self.action_ph) f_i_ = distribution_f.prob(action_) f_polyak_i = f_polyak.prob(self.action_ph) phi_i = strip(train_model.proba_distribution.mean, self.n_envs, self.n_steps) q_value = strip(train_model.value_fn, self.n_envs, self.n_steps) q_i = q_value[:, 0] rho_i = tf.reshape(f_i, [-1, 1]) / (self.mu_ph + eps) rho_i_ = tf.reshape(f_i_, [-1, 1]) / (self.mu_ph + eps) qret = q_retrace(self.reward_ph, self.done_ph, q_i, value, tf.pow(rho_i, 1 / self.n_act), self.n_envs, self.n_steps, self.gamma) else: # strip off last step # f is a distribution, chosen to be Gaussian distributions # with fixed diagonal covariance and mean \phi(x) # in the paper distribution_f, f_polyak, q_value = \ map(lambda variables: strip(variables, self.n_envs, self.n_steps), [train_model.policy_proba, polyak_model.policy_proba, train_model.q_value]) # Get pi and q values for actions taken f_i = get_by_index(distribution_f, self.action_ph) f_i_ = distribution_f phi_i = distribution_f f_polyak_i = f_polyak q_i = get_by_index(q_value, self.action_ph) # Compute ratios for importance truncation rho = distribution_f / (self.mu_ph + eps) rho_i = get_by_index(rho, self.action_ph) # Calculate Q_retrace targets qret = q_retrace(self.reward_ph, self.done_ph, q_i, value, rho_i, self.n_envs, self.n_steps, self.gamma) # Calculate losses # Entropy entropy = tf.reduce_sum( train_model.proba_distribution.entropy()) # Policy Gradient loss, with truncated importance sampling & bias correction value = strip(value, self.n_envs, self.n_steps, True) # check_shape([qret, value, rho_i, f_i], [[self.n_envs * self.n_steps]] * 4) # check_shape([rho, distribution_f, q_value], [[self.n_envs * self.n_steps, self.n_act]] * 2) # Truncated importance sampling adv = qret - value log_f = tf.log(f_i + eps) # [n_envs * n_steps] gain_f = log_f * tf.stop_gradient( adv * tf.minimum(self.correction_term, rho_i)) loss_f = -tf.reduce_mean(gain_f) # Bias correction for the truncation adv_bc = ( q_value - tf.reshape(value, [self.n_envs * self.n_steps, 1]) ) # [n_envs * n_steps, n_act] # check_shape([adv_bc, log_f_bc], [[self.n_envs * self.n_steps, self.n_act]] * 2) if continuous: gain_bc = tf.stop_gradient( adv_bc * tf.nn.relu(1.0 - (self.correction_term / (rho_i_ + eps))) * f_i_) else: log_f_bc = tf.log(f_i_ + eps) # / (f_old + eps) gain_bc = tf.reduce_sum(log_f_bc * tf.stop_gradient( adv_bc * tf.nn.relu(1.0 - (self.correction_term / (rho + eps))) * f_i_), axis=1) # IMP: This is sum, as expectation wrt f loss_bc = -tf.reduce_mean(gain_bc) loss_policy = loss_f + loss_bc # Value/Q function loss, and explained variance check_shape([qret, q_i], [[self.n_envs * self.n_steps]] * 2) explained_variance = q_explained_variance( tf.reshape(q_i, [self.n_envs, self.n_steps]), tf.reshape(qret, [self.n_envs, self.n_steps])) loss_q = tf.reduce_mean( tf.square(tf.stop_gradient(qret) - q_i) * 0.5) # Net loss check_shape([loss_policy, loss_q, entropy], [[]] * 3) loss = loss_policy + self.q_coef * loss_q - self.ent_coef * entropy tf.summary.scalar('entropy_loss', entropy) tf.summary.scalar('policy_gradient_loss', loss_policy) tf.summary.scalar('value_function_loss', loss_q) tf.summary.scalar('loss', loss) norm_grads_q, norm_grads_policy, avg_norm_grads_f = None, None, None avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj = None, None, None, None if self.trust_region: # [n_envs * n_steps, n_act] grad = tf.gradients( -(loss_policy - self.ent_coef * entropy) * self.n_steps * self.n_envs, phi_i) # [n_envs * n_steps, n_act] # Directly computed gradient of KL divergence wrt f kl_grad = -f_polyak_i / (f_i_ + eps) k_dot_g = tf.reduce_sum(kl_grad * grad, axis=-1) adj = tf.maximum( 0.0, (tf.reduce_sum(kl_grad * grad, axis=-1) - self.delta) / (tf.reduce_sum(tf.square(kl_grad), axis=-1) + eps)) # [n_envs * n_steps] # Calculate stats (before doing adjustment) for logging. avg_norm_k = avg_norm(kl_grad) avg_norm_g = avg_norm(grad) avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g)) avg_norm_adj = tf.reduce_mean(tf.abs(adj)) grad = grad - tf.reshape( adj, [self.n_envs * self.n_steps, 1]) * kl_grad # These are turst region adjusted gradients wrt f ie statistics of policy pi grads_f = -grad / (self.n_envs * self.n_steps) grads_policy = tf.gradients(f_i_, self.params, grads_f) grads_q = tf.gradients(loss_q * self.q_coef, self.params) grads = [ gradient_add(g1, g2, param, verbose=self.verbose) for (g1, g2, param ) in zip(grads_policy, grads_q, self.params) ] avg_norm_grads_f = avg_norm(grads_f) * (self.n_steps * self.n_envs) norm_grads_q = tf.global_norm(grads_q) norm_grads_policy = tf.global_norm(grads_policy) else: grads = tf.gradients(loss, self.params) norm_grads = None if self.max_grad_norm is not None: grads, norm_grads = tf.clip_by_global_norm( grads, self.max_grad_norm) grads = list(zip(grads, self.params)) with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('rewards', tf.reduce_mean(self.reward_ph)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate)) tf.summary.scalar('advantage', tf.reduce_mean(adv)) tf.summary.scalar('action_probability', tf.reduce_mean(self.mu_ph)) if self.full_tensorboard_log: tf.summary.histogram('rewards', self.reward_ph) tf.summary.histogram('learning_rate', self.learning_rate) tf.summary.histogram('advantage', adv) tf.summary.histogram('action_probability', self.mu_ph) if tf_util.is_image(self.observation_space): tf.summary.image('observation', train_model.obs_ph) else: tf.summary.histogram('observation', train_model.obs_ph) trainer = tf.train.RMSPropOptimizer( learning_rate=self.learning_rate_ph, decay=self.rprop_alpha, epsilon=self.rprop_epsilon) _opt_op = trainer.apply_gradients(grads) # so when you call _train, you first do the gradient step, then you apply ema with tf.control_dependencies([_opt_op]): _train = tf.group(ema_apply_op) # Ops/Summaries to run, and their names for logging assert norm_grads is not None run_ops = [ _train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, explained_variance, norm_grads ] names_ops = [ 'loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads' ] if self.trust_region: self.run_ops = run_ops + [ norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj ] self.names_ops = names_ops + [ 'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj' ] self.train_model = train_model self.step_model = step_model self.step = step_model.step self.proba_step = step_model.proba_step self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=self.sess) self.summary = tf.summary.merge_all()
def setup_model(self): with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the A2C model must be an " \ "instance of common.policies.ActorCriticPolicy." self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(graph=self.graph) self.n_batch = self.n_envs * self.n_steps n_batch_step = None n_batch_train = None if issubclass(self.policy, LstmPolicy): n_batch_step = self.n_envs n_batch_train = self.n_envs * self.n_steps step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, n_batch_step, reuse=False) with tf.variable_scope( "train_model", reuse=True, custom_getter=tf_util.outer_scope_getter( "train_model")): train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, self.n_steps, n_batch_train, reuse=True) with tf.variable_scope("loss", reuse=False): self.actions_ph = train_model.pdtype.sample_placeholder( [None], name="action_ph") self.advs_ph = tf.placeholder(tf.float32, [None], name="advs_ph") self.rewards_ph = tf.placeholder(tf.float32, [None], name="rewards_ph") self.learning_rate_ph = tf.placeholder( tf.float32, [], name="learning_rate_ph") neglogpac = train_model.proba_distribution.neglogp( self.actions_ph) self.entropy = tf.reduce_mean( train_model.proba_distribution.entropy()) self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac) self.vf_loss = mse(tf.squeeze(train_model.value_fn), self.rewards_ph) loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef tf.summary.scalar('entropy_loss', self.entropy) tf.summary.scalar('policy_gradient_loss', self.pg_loss) tf.summary.scalar('value_function_loss', self.vf_loss) tf.summary.scalar('loss', loss) self.params = find_trainable_variables("model") grads = tf.gradients(loss, self.params) if self.max_grad_norm is not None: grads, _ = tf.clip_by_global_norm( grads, self.max_grad_norm) grads = list(zip(grads, self.params)) with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph)) tf.summary.histogram('discounted_rewards', self.rewards_ph) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate)) tf.summary.histogram('learning_rate', self.learning_rate) tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph)) tf.summary.histogram('advantage', self.advs_ph) if len(self.observation_space.shape) == 3: tf.summary.image('observation', train_model.obs_ph) else: tf.summary.histogram('observation', train_model.obs_ph) trainer = tf.train.RMSPropOptimizer( learning_rate=self.learning_rate_ph, decay=self.alpha, epsilon=self.epsilon) self.apply_backprop = trainer.apply_gradients(grads) self.train_model = train_model self.step_model = step_model self.step = step_model.step self.proba_step = step_model.proba_step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=self.sess) self.summary = tf.summary.merge_all()
def setup_model(self): with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the A2C model must be an " \ "instance of common.policies.ActorCriticPolicy." assert issubclass(self.policy, FeedForwardPolicy), "Error: the input policy for the A2C model must be an " \ "instance of common.policies.FeedFowardPolicy." self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.make_session(graph=self.graph) self.n_batch = self.n_envs * self.n_steps n_batch_step = None n_batch_train = None n_batch_sil = None if issubclass(self.policy, LstmPolicy): n_batch_step = self.n_envs n_batch_train = self.n_envs * self.n_steps # TODO: Add n_batch_sil = 512 step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, n_batch_step, reuse=False) # TODO: Add with tf.variable_scope("train_model", reuse=True, custom_getter=tf_util.outer_scope_getter("train_model")): train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, self.n_steps, n_batch_train, reuse=True) with tf.variable_scope("sil_model", reuse=True, custom_getter=tf_util.outer_scope_getter("sil_model")): sil_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, self.n_steps, n_batch_sil, reuse=True) with tf.variable_scope("loss", reuse=False): # self.actions_ph = train_model.pdtype.sample_placeholder([None], name="action_ph") self.actions_ph = train_model.action_ph self.advs_ph = tf.placeholder(tf.float32, [None], name="advs_ph") self.rewards_ph = tf.placeholder(tf.float32, [None], name="rewards_ph") self.successor_feature_ph = tf.placeholder(tf.float32, [None, FEATURE_SIZE], name="successor_feature_ph") self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph") neglogpac = train_model.proba_distribution.neglogp(self.actions_ph) last_frame = tf.reshape(train_model.obs_ph[..., 3], shape=[-1, 84 * 84]) recons_losses = tf.squared_difference(x=last_frame, y=train_model.recons_mod) self.recons_loss = tf.losses.mean_squared_error(labels=last_frame, predictions=train_model.recons_mod) self.entropy = tf.reduce_mean(train_model.proba_distribution.entropy()) self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac) if self.use_recons: self.vf_loss = mse(tf.squeeze(train_model.value_fn), self.rewards_ph + self.recons_intri * tf.stop_gradient(self.recons_loss)) else: self.vf_loss = mse(tf.squeeze(train_model.value_fn), self.rewards_ph) # TODO: loss of SF self.sf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.successor_feature), self.successor_feature_ph)) loss = self.pg_loss - \ self.entropy * self.ent_coef + \ self.vf_loss * self.vf_coef if self.use_recons: loss += self.recons_loss * self.recons_coef elif self.use_sf: loss += self.sf_loss * self.sf_coef + \ self.recons_loss * self.recons_coef tf.summary.scalar('recons_loss/max', tf.reduce_max(recons_losses)) tf.summary.scalar('recons_loss/min', tf.reduce_min(recons_losses)) tf.summary.scalar('recons_loss', self.recons_loss) tf.summary.scalar('entropy_loss', self.entropy) tf.summary.scalar('policy_gradient_loss', self.pg_loss) tf.summary.scalar('value_function_loss', self.vf_loss) tf.summary.scalar('successor_feature_loss', self.sf_loss) tf.summary.scalar('loss', loss) self.params = find_trainable_variables("model") grads = tf.gradients(loss, self.params) if self.max_grad_norm is not None: grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm) grads = list(zip(grads, self.params)) _last_frame = tf.reshape(last_frame, [-1, 84, 84, 1]) _recons_mod = tf.reshape(train_model.recons_mod, [-1, 84, 84, 1]) with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph)) tf.summary.histogram('discounted_rewards', self.rewards_ph) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate)) tf.summary.histogram('learning_rate', self.learning_rate) tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph)) tf.summary.histogram('advantage', self.advs_ph) tf.summary.image('last_frame', _last_frame) tf.summary.image('reconstruction', _recons_mod) if len(self.observation_space.shape) == 3: tf.summary.image('observation', train_model.obs_ph) else: tf.summary.histogram('observation', train_model.obs_ph) trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.alpha, epsilon=self.epsilon) self.apply_backprop = trainer.apply_gradients(grads) # TODO: Add self.sil = SelfImitation( model_ob=sil_model.obs_ph, model_vf=sil_model.value_fn, model_entropy=sil_model.proba_distribution.entropy(), fn_value=sil_model.value, fn_neg_log_prob=sil_model.proba_distribution.neglogp, ac_space=self.action_space, fn_reward=np.sign, n_env=self.n_envs, n_update=self.sil_update, beta=self.sil_beta) self.sil.build_train_op( params=self.params, optim=trainer, lr=self.learning_rate_ph, max_grad_norm=self.max_grad_norm) self.train_model = train_model self.step_model = step_model # self.step = step_model.step self.step = step_model.step_with_sf self.estimate_recons = step_model.estimate_recons self.proba_step = step_model.proba_step self.value = step_model.value # TODO: Add self.successor_feature = step_model.estimate_sf self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=self.sess) self.summary = tf.summary.merge_all()
def __init__(self, make_env, hps, num_timesteps, envs_per_process, logdir): self.make_env = make_env self.hps = hps self.envs_per_process = envs_per_process self.num_timesteps = num_timesteps self.logdir = logdir self._set_env_vars() self.policy = {"rnn" : RnnPolicy, "rnnerrpred" : ErrorPredRnnPolicy,}[hps['policy_mode']] self.action_policy = self.policy( ob_space=self.ob_space, ac_space=self.ac_space, hidsize=512, feat_dim=512, ob_mean=self.ob_mean, ob_std=self.ob_std, layernormalize=False, nl=tf.nn.leaky_relu, n_env=hps['envs_per_process'], n_steps=1, reuse=False, ) with tf.variable_scope("train_model", reuse=True, custom_getter=tf_util.outer_scope_getter("train_model")): self.train_policy = self.policy( ob_space=self.ob_space, ac_space=self.ac_space, hidsize=512, feat_dim=512, ob_mean=self.ob_mean, ob_std=self.ob_std, layernormalize=False, nl=tf.nn.leaky_relu, n_env=hps['envs_per_process'] // hps['nminibatches'], n_steps=hps['nsteps_per_seg'], reuse=True, ) self.feature_extractor = {"none": FeatureExtractor, "idf": InverseDynamics,}[hps['feat_learning']] self.action_feature_extractor = self.feature_extractor(policy=self.action_policy, features_shared_with_policy=hps['feat_sharedWpol'], feat_dim=512, layernormalize=hps['layernorm']) self.train_feature_extractor = self.feature_extractor(policy=self.train_policy, features_shared_with_policy=hps['feat_sharedWpol'], feat_dim=512, layernormalize=hps['layernorm'], reuse=True) self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet self.action_dynamics = self.dynamics(auxiliary_task=self.action_feature_extractor, predict_from_pixels=hps['dyn_from_pixels'], feat_dim=512) self.train_dynamics = self.dynamics(auxiliary_task=self.train_feature_extractor, predict_from_pixels=hps['dyn_from_pixels'], feat_dim=512, reuse=True) if 'e2e' in hps['policy_mode']: self.action_policy.prepare_else(self.action_dynamics) self.train_policy.prepare_else(self.train_dynamics) self.agent = RnnPpoOptimizer( scope='ppo', ob_space=self.ob_space, ac_space=self.ac_space, actionpol=self.action_policy, trainpol=self.train_policy, use_news=hps['use_news'], gamma=hps['gamma'], lam=hps["lambda"], nepochs=hps['nepochs'], nminibatches=hps['nminibatches'], lr=hps['lr'], cliprange=0.1, nsteps_per_seg=hps['nsteps_per_seg'], nsegs_per_env=hps['nsegs_per_env'], ent_coef=hps['ent_coeff'], normrew=hps['norm_rew'], normadv=hps['norm_adv'], ext_coeff=hps['ext_coeff'], int_coeff=hps['int_coeff'], action_dynamics=self.action_dynamics, train_dynamics=self.train_dynamics, policy_mode=hps['policy_mode'], logdir=logdir, full_tensorboard_log=hps['full_tensorboard_log'], tboard_period=hps['tboard_period'] ) self.agent.to_report['aux'] = tf.reduce_mean(self.train_feature_extractor.loss) self.agent.total_loss += self.agent.to_report['aux'] * self.hps['aux_coeff'] self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.train_dynamics.loss) self.agent.total_loss += self.agent.to_report['dyn_loss'] * self.hps['dyn_coeff'] self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.train_feature_extractor.features, [0, 1])[1])