def setup_model(self):
    """
    Build the TensorFlow graph, session and training operations for A2C.

    A new graph and session are created, then the policy class is
    instantiated twice: a one-step ``step_model`` and an ``n_steps``
    ``train_model`` built with ``reuse=True`` under the ``train_model``
    scope. The method then defines the loss placeholders, the combined
    policy-gradient / entropy / value-function loss, the (optionally
    norm-clipped) gradients and the RMSProp update, registers the
    TensorBoard summaries and initialises all global variables.

    :raises AssertionError: if ``self.policy`` is not a subclass of
        ``ActorCriticPolicy``
    """
    with SetVerbosity(self.verbose):
        assert issubclass(self.policy, ActorCriticPolicy), (
            "Error: the input policy for the A2C model must be an "
            "instance of common.policies.ActorCriticPolicy.")

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.set_random_seed(self.seed)
            self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

            self.n_batch = self.n_envs * self.n_steps

            # Explicit batch sizes are only handed to recurrent policies;
            # every other policy receives None.
            is_recurrent = issubclass(self.policy, RecurrentActorCriticPolicy)
            n_batch_step = self.n_envs if is_recurrent else None
            n_batch_train = self.n_envs * self.n_steps if is_recurrent else None

            # Network used to act in the environments (one step at a time).
            step_model = self.policy(self.sess, self.observation_space, self.action_space,
                                     self.n_envs, 1, n_batch_step,
                                     reuse=False, **self.policy_kwargs)

            # Network used for the gradient update (n_steps at a time).
            with tf.compat.v1.variable_scope(
                    "train_model", reuse=True,
                    custom_getter=tf_util.outer_scope_getter("train_model")):
                train_model = self.policy(self.sess, self.observation_space, self.action_space,
                                          self.n_envs, self.n_steps, n_batch_train,
                                          reuse=True, **self.policy_kwargs)

            with tf.compat.v1.variable_scope("loss", reuse=False):
                self.actions_ph = train_model.pdtype.sample_placeholder([None], name="action_ph")
                self.advs_ph = tf.compat.v1.placeholder(tf.float32, [None], name="advs_ph")
                self.rewards_ph = tf.compat.v1.placeholder(tf.float32, [None], name="rewards_ph")
                self.learning_rate_ph = tf.compat.v1.placeholder(tf.float32, [], name="learning_rate_ph")

                neg_log_prob = train_model.proba_distribution.neglogp(self.actions_ph)
                self.entropy = tf.reduce_mean(
                    input_tensor=train_model.proba_distribution.entropy())
                self.pg_loss = tf.reduce_mean(input_tensor=self.advs_ph * neg_log_prob)
                self.vf_loss = mse(tf.squeeze(train_model.value_flat), self.rewards_ph)
                # https://arxiv.org/pdf/1708.04782.pdf#page=9, https://arxiv.org/pdf/1602.01783.pdf#page=4
                # and https://github.com/dennybritz/reinforcement-learning/issues/34
                # suggest to add an entropy component in order to improve exploration.
                total_loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef

                loss_summaries = (
                    ('entropy_loss', self.entropy),
                    ('policy_gradient_loss', self.pg_loss),
                    ('value_function_loss', self.vf_loss),
                    ('loss', total_loss),
                )
                for tag, tensor in loss_summaries:
                    tf.compat.v1.summary.scalar(tag, tensor)

                self.params = tf_util.get_trainable_vars("model")
                gradients = tf.gradients(ys=total_loss, xs=self.params)
                if self.max_grad_norm is not None:
                    # Only the clipped gradients are needed, not the global norm.
                    gradients = tf.clip_by_global_norm(gradients, self.max_grad_norm)[0]
                grads_and_vars = list(zip(gradients, self.params))

            with tf.compat.v1.variable_scope("input_info", reuse=False):
                monitored_inputs = (
                    ('discounted_rewards', self.rewards_ph),
                    ('learning_rate', self.learning_rate_ph),
                    ('advantage', self.advs_ph),
                )
                for tag, tensor in monitored_inputs:
                    tf.compat.v1.summary.scalar(tag, tf.reduce_mean(input_tensor=tensor))

                if self.full_tensorboard_log:
                    for tag, tensor in monitored_inputs:
                        tf.compat.v1.summary.histogram(tag, tensor)
                    if tf_util.is_image(self.observation_space):
                        tf.compat.v1.summary.image('observation', train_model.obs_ph)
                    else:
                        tf.compat.v1.summary.histogram('observation', train_model.obs_ph)

            optimizer = tf.compat.v1.train.RMSPropOptimizer(
                learning_rate=self.learning_rate_ph, decay=self.alpha,
                epsilon=self.epsilon, momentum=self.momentum)
            self.apply_backprop = optimizer.apply_gradients(grads_and_vars)

            # Expose the networks and the acting helpers of the step model.
            self.train_model = train_model
            self.step_model = step_model
            self.step = step_model.step
            self.proba_step = step_model.proba_step
            self.value = step_model.value
            self.initial_state = step_model.initial_state
            tf.compat.v1.global_variables_initializer().run(session=self.sess)

            self.summary = tf.compat.v1.summary.merge_all()
def setup_model(self):
    """
    Build the TensorFlow graph, session and training operations for the
    PPO2 variant with an intrinsic (forward / inverse model) loss.

    The method creates a new graph and session, instantiates the acting
    and training policy networks, builds the observation encoder plus the
    inverse and forward models, defines the clipped PPO losses, combines
    them with the intrinsic loss, creates the Adam update op, registers
    the TensorBoard summaries and initialises all global variables.

    Variables and gradients are only printed when ``self.verbose >= 2``.

    NOTE(review): ``tf.one_hot(..., self.action_space.n)`` presumably
    restricts this model to discrete action spaces - confirm.

    :raises AssertionError: if ``self.policy`` is not a subclass of
        ``ActorCriticPolicy``, or if the policy is recurrent and
        ``self.n_envs`` is not a multiple of ``self.nminibatches``
    """
    with SetVerbosity(self.verbose):
        assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO2 model must be " \
                                                           "an instance of common.policies.ActorCriticPolicy."

        self.n_batch = self.n_envs * self.n_runs * self.n_steps

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.set_random_seed(self.seed)
            self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

            # Explicit batch sizes are only passed to recurrent policies.
            n_batch_step = None
            n_batch_train = None
            if issubclass(self.policy, RecurrentActorCriticPolicy):
                assert self.n_envs % self.nminibatches == 0, "For recurrent policies, "\
                    "the number of environments run in parallel should be a multiple of nminibatches."
                n_batch_step = self.n_envs
                n_batch_train = self.n_batch // self.nminibatches

            act_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                    n_batch_step, reuse=False, **self.policy_kwargs)
            with tf.compat.v1.variable_scope("train_model", reuse=True,
                                             custom_getter=tf_util.outer_scope_getter("train_model")):
                train_model = self.policy(self.sess, self.observation_space, self.action_space,
                                          self.n_envs * self.n_runs // self.nminibatches, self.n_steps,
                                          n_batch_train, reuse=True, **self.policy_kwargs)

            # Placeholders for the current / next observation fed to the
            # intrinsic models; both are cast to float32 before encoding.
            self.observation_ph = tf.compat.v1.placeholder(shape=(None,) + self.observation_space.shape,
                                                           dtype=self.observation_space.dtype, name='obs')
            self.processed_obs = tf.cast(self.observation_ph, tf.float32)
            self.observation_next_ph = tf.compat.v1.placeholder(shape=(None,) + self.observation_space.shape,
                                                                dtype=self.observation_space.dtype,
                                                                name='obs_next')
            self.processed_obs_next = tf.cast(self.observation_next_ph, tf.float32)

            # AUTO_REUSE lets the two encoder calls resolve to the same variables.
            with tf.compat.v1.variable_scope("obs_encoded", reuse=tf.compat.v1.AUTO_REUSE):
                self.obs_encoded = obs_autoencoder(self.processed_obs, self.observation_space)
                self.obs_next_encoded = obs_autoencoder(self.processed_obs_next, self.observation_space)

            # NOTE(review): the original indentation was lost; confirm that the
            # inverse model is meant to live outside the "obs_encoded" scope.
            self.act_hat = inverse_model(self.obs_encoded, self.obs_next_encoded, self.action_space)

            with tf.compat.v1.variable_scope("loss", reuse=False):
                self.action_ph = train_model.pdtype.sample_placeholder([None], name="action_ph")
                self.processed_act = tf.cast(tf.one_hot(self.action_ph, self.action_space.n), tf.float32)
                self.obs_next_hat = forward_model(self.obs_encoded, self.processed_act, self.observation_space)
                self.advs_ph = tf.compat.v1.placeholder(tf.float32, [None], name="advs_ph")
                self.rewards_ph = tf.compat.v1.placeholder(tf.float32, [None], name="rewards_ph")
                self.true_rewards_ph = tf.compat.v1.placeholder(tf.float32, [None], name="true_rewards_ph")
                self.old_neglog_pac_ph = tf.compat.v1.placeholder(tf.float32, [None], name="old_neglog_pac_ph")
                self.old_vpred_ph = tf.compat.v1.placeholder(tf.float32, [None], name="old_vpred_ph")
                self.learning_rate_ph = tf.compat.v1.placeholder(tf.float32, [], name="learning_rate_ph")
                self.clip_range_ph = tf.compat.v1.placeholder(tf.float32, [], name="clip_range_ph")

                neglogpac = train_model.proba_distribution.neglogp(self.action_ph)
                self.entropy = tf.reduce_mean(input_tensor=train_model.proba_distribution.entropy())

                vpred = train_model.value_flat

                # Value function clipping: not present in the original PPO
                if self.cliprange_vf is None:
                    # Default behavior (legacy from OpenAI baselines):
                    # use the same clipping as for the policy
                    self.clip_range_vf_ph = self.clip_range_ph
                    self.cliprange_vf = self.cliprange
                elif isinstance(self.cliprange_vf, (float, int)) and self.cliprange_vf < 0:
                    # Original PPO implementation: no value function clipping
                    self.clip_range_vf_ph = None
                else:
                    # Last possible behavior: clipping range
                    # specific to the value function
                    self.clip_range_vf_ph = tf.compat.v1.placeholder(tf.float32, [], name="clip_range_vf_ph")

                if self.clip_range_vf_ph is None:
                    # No clipping
                    vpred_clipped = train_model.value_flat
                else:
                    # Clip the difference between old and new value
                    # NOTE: this depends on the reward scaling
                    vpred_clipped = self.old_vpred_ph + \
                        tf.clip_by_value(train_model.value_flat - self.old_vpred_ph,
                                         - self.clip_range_vf_ph, self.clip_range_vf_ph)

                vf_losses1 = tf.square(vpred - self.rewards_ph)
                vf_losses2 = tf.square(vpred_clipped - self.rewards_ph)
                self.vf_loss = .5 * tf.reduce_mean(input_tensor=tf.maximum(vf_losses1, vf_losses2))

                ratio = tf.exp(self.old_neglog_pac_ph - neglogpac)
                pg_losses = -self.advs_ph * ratio
                pg_losses2 = -self.advs_ph * tf.clip_by_value(ratio, 1.0 - self.clip_range_ph,
                                                              1.0 + self.clip_range_ph)
                self.pg_loss = tf.reduce_mean(input_tensor=tf.maximum(pg_losses, pg_losses2))
                self.approxkl = .5 * tf.reduce_mean(input_tensor=tf.square(neglogpac - self.old_neglog_pac_ph))
                self.clipfrac = tf.reduce_mean(input_tensor=tf.cast(tf.greater(tf.abs(ratio - 1.0),
                                                                               self.clip_range_ph), tf.float32))

                # Every trainable variable of the graph is optimised by the
                # single loss below (policy and intrinsic models alike).
                self.params = tf.compat.v1.trainable_variables()

                # Forward-model error between the encoded next observation and
                # its prediction; the inverse model is scored with a sparse
                # softmax cross-entropy against the taken action.
                self.frw_loss = 0.5 * tf.reduce_sum(tf.math.square(self.obs_next_encoded - self.obs_next_hat))
                self.inv_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.act_hat, labels=tf.cast(self.action_ph, tf.int64)))
                self.int_loss = self.beta * self.frw_loss + (1.0 - self.beta) * self.inv_loss

                loss = self.lmd * (self.pg_loss - self.entropy * self.ent_coef
                                   + self.vf_loss * self.vf_coef) + self.int_loss

                self.int_reward = self.eta * self.frw_loss

                tf.compat.v1.summary.scalar('entropy_loss', self.entropy)
                tf.compat.v1.summary.scalar('policy_gradient_loss', self.pg_loss)
                tf.compat.v1.summary.scalar('value_function_loss', self.vf_loss)
                tf.compat.v1.summary.scalar('intrinsic_loss', self.int_loss)
                tf.compat.v1.summary.scalar('approximate_kullback-leibler', self.approxkl)
                tf.compat.v1.summary.scalar('clip_factor', self.clipfrac)
                tf.compat.v1.summary.scalar('loss', loss)

                # Debug listing of the optimised variables; only emitted at
                # the highest verbosity so normal runs keep stdout clean.
                if self.verbose >= 2:
                    for var in self.params:
                        print(var.name, var)

                with tf.compat.v1.variable_scope('model'):
                    if self.full_tensorboard_log:
                        for var in self.params:
                            tf.compat.v1.summary.histogram(var.name, var)
                grads = tf.gradients(ys=loss, xs=self.params)
                if self.max_grad_norm is not None:
                    grads, _grad_norm = tf.clip_by_global_norm(grads, self.max_grad_norm)
                grads = list(zip(grads, self.params))

                # Debug listing of the (gradient, variable) pairs.
                if self.verbose >= 2:
                    for gr in grads:
                        print(gr)

            trainer = tf.compat.v1.train.AdamOptimizer(learning_rate=self.learning_rate_ph, epsilon=1e-5)
            self._train = trainer.apply_gradients(grads)

            self.loss_names = ['policy_loss', 'value_loss', 'int_loss', 'policy_entropy', 'approxkl', 'clipfrac']

            with tf.compat.v1.variable_scope("input_info", reuse=False):
                tf.compat.v1.summary.scalar('true_rewards', tf.reduce_mean(input_tensor=self.true_rewards_ph))
                tf.compat.v1.summary.scalar('discounted_rewards', tf.reduce_mean(input_tensor=self.rewards_ph))
                tf.compat.v1.summary.scalar('learning_rate', tf.reduce_mean(input_tensor=self.learning_rate_ph))
                tf.compat.v1.summary.scalar('advantage', tf.reduce_mean(input_tensor=self.advs_ph))
                tf.compat.v1.summary.scalar('clip_range', tf.reduce_mean(input_tensor=self.clip_range_ph))
                if self.clip_range_vf_ph is not None:
                    tf.compat.v1.summary.scalar('clip_range_vf',
                                                tf.reduce_mean(input_tensor=self.clip_range_vf_ph))

                tf.compat.v1.summary.scalar('old_neglog_action_probability',
                                            tf.reduce_mean(input_tensor=self.old_neglog_pac_ph))
                tf.compat.v1.summary.scalar('old_value_pred', tf.reduce_mean(input_tensor=self.old_vpred_ph))

                if self.full_tensorboard_log:
                    tf.compat.v1.summary.histogram('discounted_rewards', self.rewards_ph)
                    tf.compat.v1.summary.histogram('learning_rate', self.learning_rate_ph)
                    tf.compat.v1.summary.histogram('advantage', self.advs_ph)
                    tf.compat.v1.summary.histogram('clip_range', self.clip_range_ph)
                    tf.compat.v1.summary.histogram('old_neglog_action_probability', self.old_neglog_pac_ph)
                    tf.compat.v1.summary.histogram('old_value_pred', self.old_vpred_ph)
                    if tf_util.is_image(self.observation_space):
                        tf.compat.v1.summary.image('observation', train_model.obs_ph)
                    else:
                        tf.compat.v1.summary.histogram('observation', train_model.obs_ph)

            # Expose the networks and the acting helpers of the act model.
            self.train_model = train_model
            self.act_model = act_model
            self.step = act_model.step
            self.proba_step = act_model.proba_step
            self.value = act_model.value
            self.initial_state = act_model.initial_state
            tf.compat.v1.global_variables_initializer().run(session=self.sess)  # pylint: disable=E1101

            self.summary = tf.compat.v1.summary.merge_all()
def setup_model(self):
    """
    Build the TensorFlow graph, session and K-FAC optimizer for ACKTR.

    A new graph and session are created, the policy class is instantiated
    as a one-step ``step_model`` and an ``n_steps`` ``train_model`` (built
    with ``reuse=True``), the training loss and the Fisher losses are
    defined, the TensorBoard summaries are registered, a
    ``kfac.KfacOptimizer`` is created and fed the joint Fisher loss
    statistics, and all global variables are initialised.

    :raises AssertionError: if ``self.policy`` is not a subclass of
        ``ActorCriticPolicy``
    """
    with SetVerbosity(self.verbose):
        assert issubclass(self.policy, ActorCriticPolicy), (
            "Error: the input policy for the ACKTR model must be "
            "an instance of common.policies.ActorCriticPolicy.")

        # Enable continuous actions tricks (normalized advantage)
        self.continuous_actions = isinstance(self.action_space, Box)

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.set_random_seed(self.seed)
            self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

            # Explicit batch sizes are only handed to recurrent policies.
            is_recurrent = issubclass(self.policy, RecurrentActorCriticPolicy)
            n_batch_step = self.n_envs if is_recurrent else None
            n_batch_train = self.n_envs * self.n_steps if is_recurrent else None

            step_model = self.policy(self.sess, self.observation_space, self.action_space,
                                     self.n_envs, 1, n_batch_step,
                                     reuse=False, **self.policy_kwargs)

            # Collected right after the step model is built, before any
            # other scope is created.
            model_vars = tf_util.get_trainable_vars("model")
            self.params = model_vars

            with tf.compat.v1.variable_scope(
                    "train_model", reuse=True,
                    custom_getter=tf_util.outer_scope_getter("train_model")):
                train_model = self.policy(self.sess, self.observation_space, self.action_space,
                                          self.n_envs, self.n_steps, n_batch_train,
                                          reuse=True, **self.policy_kwargs)

            with tf.compat.v1.variable_scope(
                    "loss", reuse=False,
                    custom_getter=tf_util.outer_scope_getter("loss")):
                self.advs_ph = tf.compat.v1.placeholder(tf.float32, [None])
                self.rewards_ph = tf.compat.v1.placeholder(tf.float32, [None])
                self.learning_rate_ph = tf.compat.v1.placeholder(tf.float32, [])
                self.actions_ph = train_model.pdtype.sample_placeholder([None])

                neg_log_prob = train_model.proba_distribution.neglogp(self.actions_ph)

                # training loss
                surrogate = tf.reduce_mean(input_tensor=self.advs_ph * neg_log_prob)
                self.entropy = tf.reduce_mean(
                    input_tensor=train_model.proba_distribution.entropy())
                # The stored policy loss already includes the entropy term.
                self.pg_loss = surrogate - self.ent_coef * self.entropy
                self.vf_loss = mse(tf.squeeze(train_model.value_fn), self.rewards_ph)
                train_loss = self.pg_loss + self.vf_coef * self.vf_loss

                # Fisher loss construction
                self.pg_fisher = -tf.reduce_mean(input_tensor=neg_log_prob)
                # Value prediction perturbed with standard normal noise.
                sample_net = train_model.value_fn + tf.random.normal(
                    tf.shape(input=train_model.value_fn))
                self.vf_fisher = -self.vf_fisher_coef * tf.reduce_mean(
                    input_tensor=tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2))
                self.joint_fisher = self.pg_fisher + self.vf_fisher

                loss_summaries = (
                    ('entropy_loss', self.entropy),
                    ('policy_gradient_loss', self.pg_loss),
                    ('policy_gradient_fisher_loss', self.pg_fisher),
                    ('value_function_loss', self.vf_loss),
                    ('value_function_fisher_loss', self.vf_fisher),
                    ('loss', train_loss),
                )
                for tag, tensor in loss_summaries:
                    tf.compat.v1.summary.scalar(tag, tensor)

                self.grads_check = tf.gradients(ys=train_loss, xs=model_vars)

            with tf.compat.v1.variable_scope("input_info", reuse=False):
                monitored_inputs = (
                    ('discounted_rewards', self.rewards_ph),
                    ('learning_rate', self.learning_rate_ph),
                    ('advantage', self.advs_ph),
                )
                for tag, tensor in monitored_inputs:
                    tf.compat.v1.summary.scalar(tag, tf.reduce_mean(input_tensor=tensor))

                if self.full_tensorboard_log:
                    for tag, tensor in monitored_inputs:
                        tf.compat.v1.summary.histogram(tag, tensor)
                    if tf_util.is_image(self.observation_space):
                        tf.compat.v1.summary.image('observation', train_model.obs_ph)
                    else:
                        tf.compat.v1.summary.histogram('observation', train_model.obs_ph)

            with tf.compat.v1.variable_scope(
                    "kfac", reuse=False,
                    custom_getter=tf_util.outer_scope_getter("kfac")):
                with tf.device('/gpu:0'):
                    kfac_optimizer = kfac.KfacOptimizer(
                        learning_rate=self.learning_rate_ph,
                        clip_kl=self.kfac_clip,
                        momentum=0.9,
                        kfac_update=self.kfac_update,
                        epsilon=0.01,
                        stats_decay=0.99,
                        async_eigen_decomp=self.async_eigen_decomp,
                        cold_iter=10,
                        max_grad_norm=self.max_grad_norm,
                        verbose=self.verbose)
                    self.optim = kfac_optimizer

                    kfac_optimizer.compute_and_apply_stats(self.joint_fisher, var_list=model_vars)

            # Expose the networks and the acting helpers of the step model.
            self.train_model = train_model
            self.step_model = step_model
            self.step = step_model.step
            self.proba_step = step_model.proba_step
            self.value = step_model.value
            self.initial_state = step_model.initial_state
            tf.compat.v1.global_variables_initializer().run(session=self.sess)

            self.summary = tf.compat.v1.summary.merge_all()
def setup_model(self):
    """
    Build the TensorFlow graph, session and helper functions for TRPO.

    The method creates a new graph and session, optionally builds the GAIL
    ``TransitionClassifier``, instantiates the new and old policy
    networks, defines the surrogate gain, KL, entropy and value-function
    error, builds the Fisher-vector-product graph, wraps the compute ops
    with ``tf_util.function``, broadcasts the initial policy parameters
    from MPI rank 0, creates the MPI Adam optimizers, registers the
    TensorBoard summaries and stores the ``timed`` / ``allmean`` helpers.

    The ``value_function_loss`` summary records the value-function error
    (``vferr``).

    :raises AssertionError: if ``self.policy`` is not a subclass of
        ``ActorCriticPolicy``
    """
    # prevent import loops
    from reinforcement_learning.gail.adversary import TransitionClassifier

    with SetVerbosity(self.verbose):
        assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \
                                                           "an instance of common.policies.ActorCriticPolicy."

        self.nworkers = MPI.COMM_WORLD.Get_size()
        self.rank = MPI.COMM_WORLD.Get_rank()
        np.set_printoptions(precision=3)

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.set_random_seed(self.seed)
            self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

            if self.using_gail:
                self.reward_giver = TransitionClassifier(self.observation_space, self.action_space,
                                                         self.hidden_size_adversary,
                                                         entcoeff=self.adversary_entcoeff)

            # Construct network for new policy
            self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                         None, reuse=False, **self.policy_kwargs)

            # Network for old policy
            with tf.compat.v1.variable_scope("oldpi", reuse=False):
                old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                         None, reuse=False, **self.policy_kwargs)

            with tf.compat.v1.variable_scope("loss", reuse=False):
                # Target advantage function (if applicable)
                atarg = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None])
                # Empirical return
                ret = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None])

                observation = self.policy_pi.obs_ph
                action = self.policy_pi.pdtype.sample_placeholder([None])

                kloldnew = old_policy.proba_distribution.kl(self.policy_pi.proba_distribution)
                ent = self.policy_pi.proba_distribution.entropy()
                meankl = tf.reduce_mean(input_tensor=kloldnew)
                meanent = tf.reduce_mean(input_tensor=ent)
                entbonus = self.entcoeff * meanent

                vferr = tf.reduce_mean(input_tensor=tf.square(self.policy_pi.value_flat - ret))

                # advantage * pnew / pold
                ratio = tf.exp(self.policy_pi.proba_distribution.logp(action) -
                               old_policy.proba_distribution.logp(action))
                surrgain = tf.reduce_mean(input_tensor=ratio * atarg)

                optimgain = surrgain + entbonus
                losses = [optimgain, meankl, entbonus, surrgain, meanent]
                self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

                dist = meankl

                # Split the model variables by name: everything that is not
                # a value / Q variable is optimised by the policy step, and
                # everything that is not a policy / log-std variable by the
                # value-function step.
                all_var_list = tf_util.get_trainable_vars("model")
                var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name]
                vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name]

                self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess)

                klgrads = tf.gradients(ys=dist, xs=var_list)
                flat_tangent = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
                shapes = [var.get_shape().as_list() for var in var_list]
                # Slice the flat tangent vector back into one tensor per variable.
                start = 0
                tangents = []
                for shape in shapes:
                    var_size = tf_util.intprod(shape)
                    tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape))
                    start += var_size
                gvp = tf.add_n([tf.reduce_sum(input_tensor=grad * tangent)
                                for (grad, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
                # Fisher vector products
                fvp = tf_util.flatgrad(gvp, var_list)

                tf.compat.v1.summary.scalar('entropy_loss', meanent)
                tf.compat.v1.summary.scalar('policy_gradient_loss', optimgain)
                # Log the value-function error here; the surrogate gain is a
                # policy quantity and is already part of 'policy_gradient_loss'.
                tf.compat.v1.summary.scalar('value_function_loss', vferr)
                tf.compat.v1.summary.scalar('approximate_kullback-leibler', meankl)
                # NOTE(review): this 'loss' scalar sums heterogeneous terms
                # (gain, KL, entropy); kept unchanged - confirm it is intended.
                tf.compat.v1.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent)

                self.assign_old_eq_new = \
                    tf_util.function([], [], updates=[tf.compat.v1.assign(oldv, newv) for (oldv, newv) in
                                                      zipsame(tf_util.get_globals_vars("oldpi"),
                                                              tf_util.get_globals_vars("model"))])
                self.compute_losses = tf_util.function([observation, old_policy.obs_ph, action, atarg], losses)
                self.compute_fvp = tf_util.function([flat_tangent, observation, old_policy.obs_ph, action, atarg],
                                                    fvp)
                self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret],
                                                              tf_util.flatgrad(vferr, vf_var_list))

                @contextmanager
                def timed(msg):
                    # Print the message and the elapsed time on rank 0 only,
                    # and only when verbosity is at least 1.
                    if self.rank == 0 and self.verbose >= 1:
                        print(colorize(msg, color='magenta'))
                        start_time = time.time()
                        yield
                        print(colorize("done in {:.3f} seconds".format((time.time() - start_time)),
                                       color='magenta'))
                    else:
                        yield

                def allmean(arr):
                    # Average a numpy array across all MPI workers.
                    assert isinstance(arr, np.ndarray)
                    out = np.empty_like(arr)
                    MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                    out /= self.nworkers
                    return out

                tf_util.initialize(sess=self.sess)

                # Make every worker start from rank 0's policy parameters.
                th_init = self.get_flat()
                MPI.COMM_WORLD.Bcast(th_init, root=0)
                self.set_from_flat(th_init)

            with tf.compat.v1.variable_scope("Adam_mpi", reuse=False):
                self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                if self.using_gail:
                    self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess)
                    self.d_adam.sync()
                self.vfadam.sync()

            with tf.compat.v1.variable_scope("input_info", reuse=False):
                tf.compat.v1.summary.scalar('discounted_rewards', tf.reduce_mean(input_tensor=ret))
                tf.compat.v1.summary.scalar('learning_rate', tf.reduce_mean(input_tensor=self.vf_stepsize))
                tf.compat.v1.summary.scalar('advantage', tf.reduce_mean(input_tensor=atarg))
                tf.compat.v1.summary.scalar('kl_clip_range', tf.reduce_mean(input_tensor=self.max_kl))

                if self.full_tensorboard_log:
                    tf.compat.v1.summary.histogram('discounted_rewards', ret)
                    tf.compat.v1.summary.histogram('learning_rate', self.vf_stepsize)
                    tf.compat.v1.summary.histogram('advantage', atarg)
                    tf.compat.v1.summary.histogram('kl_clip_range', self.max_kl)
                    if tf_util.is_image(self.observation_space):
                        tf.compat.v1.summary.image('observation', observation)
                    else:
                        tf.compat.v1.summary.histogram('observation', observation)

            self.timed = timed
            self.allmean = allmean

            self.step = self.policy_pi.step
            self.proba_step = self.policy_pi.proba_step
            self.initial_state = self.policy_pi.initial_state

            self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi")
            if self.using_gail:
                self.params.extend(self.reward_giver.get_trainable_variables())

            self.summary = tf.compat.v1.summary.merge_all()

            # Built last because it evaluates the merged summary; `ret` is an
            # input so that return-dependent summaries can be computed.
            self.compute_lossandgrad = \
                tf_util.function([observation, old_policy.obs_ph, action, atarg, ret],
                                 [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
def setup_model(self):
    """
    Build the TensorFlow graph, session and training operations for SAC.

    A new graph and session are created together with the replay buffer.
    The method then instantiates the policy and target policy objects,
    wires the input placeholders, builds the actor, the two Q-functions
    and the value function, resolves the target entropy and the (fixed or
    learned) entropy coefficient, builds the target value network, defines
    the losses and the chained train ops, registers the TensorBoard
    summaries, and finally initialises all variables and copies the source
    value-function variables into the target network.
    """
    with SetVerbosity(self.verbose):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.set_random_seed(self.seed)
            self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

            self.replay_buffer = ReplayBuffer(self.buffer_size)

            with tf.compat.v1.variable_scope("input", reuse=False):
                # Create policy and target TF objects
                self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                             **self.policy_kwargs)
                self.target_policy = self.policy(self.sess, self.observation_space, self.action_space,
                                                 **self.policy_kwargs)

                # Initialize Placeholders
                self.observations_ph = self.policy_tf.obs_ph
                # Normalized observation for pixels
                self.processed_obs_ph = self.policy_tf.processed_obs
                self.next_observations_ph = self.target_policy.obs_ph
                self.processed_next_obs_ph = self.target_policy.processed_obs
                self.action_target = self.target_policy.action_ph
                self.terminals_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='terminals')
                self.rewards_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='rewards')
                self.actions_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, ) + self.action_space.shape,
                                                           name='actions')
                self.learning_rate_ph = tf.compat.v1.placeholder(tf.float32, [], name="learning_rate_ph")

            with tf.compat.v1.variable_scope("model", reuse=False):
                # Create the policy
                # first return value corresponds to deterministic actions
                # sampled_action corresponds to stochastic actions, used for training
                # log_prob_pi is the log probability of actions taken by the policy
                self.deterministic_action, sampled_action, log_prob_pi = self.policy_tf.make_actor(
                    self.processed_obs_ph)
                # Monitor the entropy of the policy,
                # this is not used for training
                self.entropy = tf.reduce_mean(input_tensor=self.policy_tf.entropy)
                # Use two Q-functions to improve performance by reducing overestimation bias.
                q1, q2, value_fn = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph,
                                                               create_qf=True, create_vf=True)
                q1_pi, q2_pi, _ = self.policy_tf.make_critics(self.processed_obs_ph, sampled_action,
                                                              create_qf=True, create_vf=False,
                                                              reuse=True)

                # Target entropy is used when learning the entropy coefficient
                if self.target_entropy == 'auto':
                    # automatically set target entropy if needed
                    self.target_entropy = -np.prod(self.action_space.shape).astype(np.float32)
                else:
                    # Force conversion
                    # this will also throw an error for unexpected string
                    self.target_entropy = float(self.target_entropy)

                # The entropy coefficient or entropy can be learned automatically
                # see Automating Entropy Adjustment for Maximum Entropy RL section
                # of https://arxiv.org/abs/1812.05905
                learn_ent_coef = isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto')
                if learn_ent_coef:
                    # Default initial value of ent_coef when learned;
                    # an 'auto_<value>' string overrides it.
                    init_value = 1.0
                    if '_' in self.ent_coef:
                        init_value = float(self.ent_coef.split('_')[1])
                        assert init_value > 0., "The initial value of ent_coef must be greater than 0"

                    self.log_ent_coef = tf.compat.v1.get_variable(
                        'log_ent_coef', dtype=tf.float32,
                        initializer=np.log(init_value).astype(np.float32))
                    self.ent_coef = tf.exp(self.log_ent_coef)
                else:
                    # Force conversion to float
                    # this will throw an error if a malformed string (different from 'auto')
                    # is passed
                    self.ent_coef = float(self.ent_coef)

            with tf.compat.v1.variable_scope("target", reuse=False):
                # Create the value network
                _, _, target_value = self.target_policy.make_critics(self.processed_next_obs_ph,
                                                                     create_qf=False, create_vf=True)
                self.value_target = target_value

            with tf.compat.v1.variable_scope("loss", reuse=False):
                # Take the min of the two Q-Values (Double-Q Learning)
                min_q_pi = tf.minimum(q1_pi, q2_pi)

                # Target for Q value regression
                q_backup = tf.stop_gradient(
                    self.rewards_ph + (1 - self.terminals_ph) * self.gamma * self.value_target)

                # Compute Q-Function loss
                # TODO: test with huber loss (it would avoid too high values)
                q1_loss = 0.5 * tf.reduce_mean(input_tensor=(q_backup - q1)**2)
                q2_loss = 0.5 * tf.reduce_mean(input_tensor=(q_backup - q2)**2)

                # Compute the entropy temperature loss
                # it is used when the entropy coefficient is learned
                ent_coef_loss = None
                entropy_optimizer = None
                if not isinstance(self.ent_coef, float):
                    ent_coef_loss = -tf.reduce_mean(
                        input_tensor=self.log_ent_coef * tf.stop_gradient(log_prob_pi + self.target_entropy))
                    entropy_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=self.learning_rate_ph)

                # Compute the policy loss
                # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi)
                policy_kl_loss = tf.reduce_mean(input_tensor=self.ent_coef * log_prob_pi - q1_pi)

                # NOTE: in the original implementation, they have an additional
                # regularization loss for the Gaussian parameters
                # this is not used for now
                # policy_loss = (policy_kl_loss + policy_regularization_loss)
                policy_loss = policy_kl_loss

                # Target for value fn regression
                # We update the vf towards the min of two Q-functions in order to
                # reduce overestimation bias from function approximation error.
                v_backup = tf.stop_gradient(min_q_pi - self.ent_coef * log_prob_pi)
                value_loss = 0.5 * tf.reduce_mean(input_tensor=(value_fn - v_backup)**2)

                critic_loss = q1_loss + q2_loss + value_loss

                # Policy train op
                # (has to be separate from value train op, because min_qf_pi appears in policy_loss)
                policy_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                policy_train_op = policy_optimizer.minimize(
                    policy_loss, var_list=tf_util.get_trainable_vars('model/pi'))

                # Value train op
                value_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                critic_vars = tf_util.get_trainable_vars('model/values_fn')

                source_vars = tf_util.get_trainable_vars("model/values_fn")
                target_vars = tf_util.get_trainable_vars("target/values_fn")
                # NOTE(review): the pairs below are formed positionally and zip
                # stops at the shorter collection; the source collection is
                # fetched from a scope where Q-functions were also requested,
                # so this presumably relies on the value-function variables
                # being listed first - confirm against the policy implementation.

                # Polyak averaging for target variables
                self.target_update_op = [
                    tf.compat.v1.assign(target_var, (1 - self.tau) * target_var + self.tau * source_var)
                    for target_var, source_var in zip(target_vars, source_vars)
                ]
                # Initializing target to match source variables
                target_init_op = [
                    tf.compat.v1.assign(target_var, source_var)
                    for target_var, source_var in zip(target_vars, source_vars)
                ]

                # Control flow is used because sess.run otherwise evaluates in nondeterministic order
                # and we first need to compute the policy action before computing q values losses
                with tf.control_dependencies([policy_train_op]):
                    train_values_op = value_optimizer.minimize(critic_loss, var_list=critic_vars)

                    self.infos_names = ['policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy']
                    # All ops to call during one training step
                    self.step_ops = [policy_loss, q1_loss, q2_loss,
                                     value_loss, q1, q2, value_fn, log_prob_pi,
                                     self.entropy, policy_train_op, train_values_op]

                    # Add entropy coefficient optimization operation if needed
                    if ent_coef_loss is not None:
                        with tf.control_dependencies([train_values_op]):
                            ent_coef_op = entropy_optimizer.minimize(ent_coef_loss,
                                                                     var_list=self.log_ent_coef)
                            self.infos_names += ['ent_coef_loss', 'ent_coef']
                            self.step_ops += [ent_coef_op, ent_coef_loss, self.ent_coef]

                # Monitor losses and entropy in tensorboard
                monitored_scalars = (
                    ('policy_loss', policy_loss),
                    ('qf1_loss', q1_loss),
                    ('qf2_loss', q2_loss),
                    ('value_loss', value_loss),
                    ('entropy', self.entropy),
                )
                for tag, tensor in monitored_scalars:
                    tf.compat.v1.summary.scalar(tag, tensor)
                if ent_coef_loss is not None:
                    tf.compat.v1.summary.scalar('ent_coef_loss', ent_coef_loss)
                    tf.compat.v1.summary.scalar('ent_coef', self.ent_coef)

                tf.compat.v1.summary.scalar('learning_rate',
                                            tf.reduce_mean(input_tensor=self.learning_rate_ph))

            # Retrieve parameters that must be saved
            self.params = tf_util.get_trainable_vars("model")
            self.target_params = tf_util.get_trainable_vars("target/values_fn")

            # Initialize Variables and target network
            with self.sess.as_default():
                self.sess.run(tf.compat.v1.global_variables_initializer())
                self.sess.run(target_init_op)

            self.summary = tf.compat.v1.summary.merge_all()