def setup_model(self):
    """Build the TensorFlow graph, session, losses and K-FAC optimizer.

    A fresh ``tf.Graph`` and session are created, two policy objects are
    instantiated from ``self.policy`` (one built with 1 step, one built with
    ``self.n_steps`` steps and ``reuse=True``), the training and Fisher
    losses are defined on the second one, a ``kfac.KfacOptimizer`` is
    created, and all global variables are initialized.

    Nothing is returned; everything is stored on ``self`` (``graph``,
    ``sess``, the placeholders, the losses, ``params``, ``grads_check``,
    ``optim``, ``train_model``, ``step_model`` and the step model's
    ``step`` / ``proba_step`` / ``value`` / ``initial_state``).
    """
    with SetVerbosity(self.verbose):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.make_session(num_cpu=self.nprocs, graph=self.graph)

            # 1-D float placeholders for advantages and rewards, and a scalar
            # placeholder that is handed to the optimizer as its learning rate.
            self.advs_ph = advs_ph = tf.placeholder(tf.float32, [None])
            self.rewards_ph = rewards_ph = tf.placeholder(tf.float32, [None])
            self.pg_lr_ph = pg_lr_ph = tf.placeholder(tf.float32, [])

            # Batch sizes are only pinned for LstmPolicy subclasses; every
            # other policy receives None.
            n_batch_step = None
            n_batch_train = None
            if issubclass(self.policy, LstmPolicy):
                n_batch_step = self.n_envs
                n_batch_train = self.n_envs * self.n_steps

            self.model = step_model = self.policy(
                self.sess, self.observation_space, self.action_space,
                self.n_envs, 1, n_batch_step, reuse=False)
            # Built with reuse=True -- presumably so that it shares variables
            # with the step model; confirm against the policy implementation.
            self.model2 = train_model = self.policy(
                self.sess, self.observation_space, self.action_space,
                self.n_envs, self.n_steps, n_batch_train, reuse=True)

            self.action_ph = action_ph = train_model.pdtype.sample_placeholder([None])

            # Cross-entropy between the policy logits and the taken actions.
            logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=train_model.policy, labels=action_ph)
            self.logits = train_model.policy

            # training loss
            pg_loss = tf.reduce_mean(advs_ph * logpac)
            self.entropy = entropy = tf.reduce_mean(calc_entropy(train_model.policy))
            self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy
            self.vf_loss = vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph)
            train_loss = pg_loss + self.vf_coef * vf_loss

            # Fisher loss construction
            self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
            # Value output plus standard-normal noise; the noisy sample is
            # wrapped in stop_gradient below so gradients only flow through
            # the value function itself.
            sample_net = train_model.value_fn + tf.random_normal(
                tf.shape(train_model.value_fn))
            self.vf_fisher = vf_fisher_loss = -self.vf_fisher_coef * tf.reduce_mean(
                tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2))
            self.joint_fisher = pg_fisher_loss + vf_fisher_loss

            self.params = params = find_trainable_variables("model")

            self.grads_check = tf.gradients(train_loss, params)

            with tf.device('/gpu:0'):
                # `async` became a reserved keyword in Python 3.7, so passing
                # `async=1` is a SyntaxError there.
                # NOTE(review): assumes KfacOptimizer exposes this option as
                # `async_eigen_decomp` -- confirm against the kfac module.
                self.optim = optim = kfac.KfacOptimizer(
                    learning_rate=pg_lr_ph, clip_kl=self.kfac_clip,
                    momentum=0.9, kfac_update=1, epsilon=0.01,
                    stats_decay=0.99, async_eigen_decomp=True, cold_iter=10,
                    max_grad_norm=self.max_grad_norm, verbose=self.verbose)

                optim.compute_and_apply_stats(self.joint_fisher, var_list=params)

            self.train_model = train_model
            self.step_model = step_model
            self.step = step_model.step
            self.proba_step = step_model.proba_step
            self.value = step_model.value
            self.initial_state = step_model.initial_state
            tf.global_variables_initializer().run(session=self.sess)
def setup_model(self):
    """Assemble the ACKTR graph: policies, losses, summaries and optimizer.

    Validates the configured policy class and action space, then creates a
    new ``tf.Graph`` and session and, inside that graph:

    * builds a step policy and (under the "train_model" scope) a training
      policy from ``self.policy``;
    * defines the training loss and the Fisher losses under the "loss"
      scope and registers scalar summaries for them;
    * registers input summaries under the "input_info" scope (histograms
      and the observation summary only when ``self.full_tensorboard_log``);
    * creates the ``kfac.KfacOptimizer`` under the "kfac" scope;
    * initializes all global variables and merges the summaries.

    All results are stored as attributes on ``self``; nothing is returned.

    Raises:
        AssertionError: ``self.policy`` is not a subclass of
            ``ActorCriticPolicy``.
        NotImplementedError: ``self.action_space`` is a ``Box``.
    """
    with SetVerbosity(self.verbose):
        assert issubclass(self.policy, ActorCriticPolicy), (
            "Error: the input policy for the ACKTR model must be "
            "an instance of common.policies.ActorCriticPolicy."
        )

        if isinstance(self.action_space, Box):
            raise NotImplementedError(
                "WIP: ACKTR does not support Continuous actions yet.")

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.make_session(num_cpu=self.nprocs,
                                             graph=self.graph)

            # Explicit batch sizes are only supplied to recurrent policies.
            recurrent = issubclass(self.policy, RecurrentActorCriticPolicy)
            step_batch = self.n_envs if recurrent else None
            train_batch = self.n_envs * self.n_steps if recurrent else None

            acting_policy = self.policy(
                self.sess, self.observation_space, self.action_space,
                self.n_envs, 1, step_batch,
                reuse=False, **self.policy_kwargs)
            self.model = acting_policy

            trainable = tf_util.get_trainable_vars("model")
            self.params = trainable

            with tf.variable_scope(
                    "train_model", reuse=True,
                    custom_getter=tf_util.outer_scope_getter("train_model")):
                learner_policy = self.policy(
                    self.sess, self.observation_space, self.action_space,
                    self.n_envs, self.n_steps, train_batch,
                    reuse=True, **self.policy_kwargs)
                self.model2 = learner_policy

            with tf.variable_scope(
                    "loss", reuse=False,
                    custom_getter=tf_util.outer_scope_getter("loss")):
                adv_in = tf.placeholder(tf.float32, [None])
                self.advs_ph = adv_in
                reward_in = tf.placeholder(tf.float32, [None])
                self.rewards_ph = reward_in
                lr_in = tf.placeholder(tf.float32, [])
                self.pg_lr_ph = lr_in
                act_in = learner_policy.pdtype.sample_placeholder([None])
                self.action_ph = act_in

                # Cross-entropy of the policy logits against chosen actions.
                action_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=learner_policy.policy, labels=act_in)
                self.logits = learner_policy.policy

                # Training objective: advantage-weighted cross-entropy minus
                # an entropy bonus, plus the weighted value error.
                weighted_xent = tf.reduce_mean(adv_in * action_xent)
                mean_entropy = tf.reduce_mean(
                    calc_entropy(learner_policy.policy))
                self.entropy = mean_entropy
                policy_loss = weighted_xent - self.ent_coef * mean_entropy
                self.pg_loss = policy_loss
                value_loss = mse(tf.squeeze(learner_policy.value_fn),
                                 reward_in)
                self.vf_loss = value_loss
                total_loss = policy_loss + self.vf_coef * value_loss

                # Fisher objectives consumed by the K-FAC statistics.
                policy_fisher = -tf.reduce_mean(action_xent)
                self.pg_fisher = policy_fisher
                noisy_value = learner_policy.value_fn + tf.random_normal(
                    tf.shape(learner_policy.value_fn))
                value_gap = (learner_policy.value_fn
                             - tf.stop_gradient(noisy_value))
                value_fisher = -self.vf_fisher_coef * tf.reduce_mean(
                    tf.pow(value_gap, 2))
                self.vf_fisher = value_fisher
                self.joint_fisher = policy_fisher + value_fisher

                tf.summary.scalar('entropy_loss', self.entropy)
                tf.summary.scalar('policy_gradient_loss', policy_loss)
                tf.summary.scalar('policy_gradient_fisher_loss',
                                  policy_fisher)
                tf.summary.scalar('value_function_loss', self.vf_loss)
                tf.summary.scalar('value_function_fisher_loss', value_fisher)
                tf.summary.scalar('loss', total_loss)

                self.grads_check = tf.gradients(total_loss, trainable)

            with tf.variable_scope("input_info", reuse=False):
                tf.summary.scalar('discounted_rewards',
                                  tf.reduce_mean(self.rewards_ph))
                tf.summary.scalar('learning_rate',
                                  tf.reduce_mean(self.pg_lr_ph))
                tf.summary.scalar('advantage',
                                  tf.reduce_mean(self.advs_ph))

                if self.full_tensorboard_log:
                    tf.summary.histogram('discounted_rewards',
                                         self.rewards_ph)
                    tf.summary.histogram('learning_rate', self.pg_lr_ph)
                    tf.summary.histogram('advantage', self.advs_ph)
                    # Image summary for image observation spaces, histogram
                    # for everything else.
                    log_observation = (
                        tf.summary.image
                        if tf_util.is_image(self.observation_space)
                        else tf.summary.histogram
                    )
                    log_observation('observation', learner_policy.obs_ph)

            with tf.variable_scope(
                    "kfac", reuse=False,
                    custom_getter=tf_util.outer_scope_getter("kfac")), \
                    tf.device('/gpu:0'):
                kfac_optim = kfac.KfacOptimizer(
                    learning_rate=lr_in,
                    clip_kl=self.kfac_clip,
                    momentum=0.9,
                    kfac_update=1,
                    epsilon=0.01,
                    stats_decay=0.99,
                    async_eigen_decomp=self.async_eigen_decomp,
                    cold_iter=10,
                    max_grad_norm=self.max_grad_norm,
                    verbose=self.verbose)
                self.optim = kfac_optim

                kfac_optim.compute_and_apply_stats(self.joint_fisher,
                                                   var_list=trainable)

            # Expose the two policies and the step policy's entry points.
            self.train_model = learner_policy
            self.step_model = acting_policy
            self.step = acting_policy.step
            self.proba_step = acting_policy.proba_step
            self.value = acting_policy.value
            self.initial_state = acting_policy.initial_state
            tf.global_variables_initializer().run(session=self.sess)

            self.summary = tf.summary.merge_all()