    def setup_model(self):
        with SetVerbosity(self.verbose):

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(num_cpu=self.nprocs,
                                                 graph=self.graph)

                self.advs_ph = advs_ph = tf.placeholder(tf.float32, [None])
                self.rewards_ph = rewards_ph = tf.placeholder(
                    tf.float32, [None])
                self.pg_lr_ph = pg_lr_ph = tf.placeholder(tf.float32, [])

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, LstmPolicy):
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_envs * self.n_steps
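                # The step model samples one step across n_envs environments
                # during rollouts; the train model built below with reuse=True
                # shares the same weights but consumes the full
                # n_envs * n_steps batch. LSTM policies need these batch sizes
                # fixed so that hidden states line up with environments.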

                self.model = step_model = self.policy(self.sess,
                                                      self.observation_space,
                                                      self.action_space,
                                                      self.n_envs,
                                                      1,
                                                      n_batch_step,
                                                      reuse=False)
                self.model2 = train_model = self.policy(self.sess,
                                                        self.observation_space,
                                                        self.action_space,
                                                        self.n_envs,
                                                        self.n_steps,
                                                        n_batch_train,
                                                        reuse=True)

                self.action_ph = action_ph = train_model.pdtype.sample_placeholder(
                    [None])

                logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=train_model.policy, labels=action_ph)
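                # sparse softmax cross-entropy with the taken action as the
                # label equals -log pi(a|s), so logpac is the per-sample
                # negative action log-probability used by both losses below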
                self.logits = train_model.policy

                # training loss
                pg_loss = tf.reduce_mean(advs_ph * logpac)
                self.entropy = entropy = tf.reduce_mean(
                    calc_entropy(train_model.policy))
                self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy
                self.vf_loss = vf_loss = mse(tf.squeeze(train_model.value_fn),
                                             rewards_ph)
                train_loss = pg_loss + self.vf_coef * vf_loss

                # Fisher loss construction
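                # pg_fisher is the sampled-action log-likelihood; vf_fisher
                # treats the value head as a Gaussian with unit variance
                # around its prediction (hence the random_normal sample), so
                # that K-FAC can estimate the Fisher of the joint policy/value
                # output distribution, as in the ACKTR paper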
                self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
                sample_net = train_model.value_fn + tf.random_normal(
                    tf.shape(train_model.value_fn))
                self.vf_fisher = vf_fisher_loss = -self.vf_fisher_coef * tf.reduce_mean(
                    tf.pow(train_model.value_fn - tf.stop_gradient(sample_net),
                           2))
                self.joint_fisher = pg_fisher_loss + vf_fisher_loss

                self.params = params = find_trainable_variables("model")

                self.grads_check = tf.gradients(train_loss, params)

                with tf.device('/gpu:0'):
                    self.optim = optim = kfac.KfacOptimizer(
                        learning_rate=pg_lr_ph,
                        clip_kl=self.kfac_clip,
                        momentum=0.9,
                        kfac_update=1,
                        epsilon=0.01,
                        stats_decay=0.99,
                        # note: `async` became a reserved word in Python 3.7;
                        # newer versions of this optimizer rename the argument
                        # (async_eigen_decomp in Example #2 below)
                        async=1,
                        cold_iter=10,
                        max_grad_norm=self.max_grad_norm,
                        verbose=self.verbose)

                    optim.compute_and_apply_stats(self.joint_fisher,
                                                  var_list=params)
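                    # registers curvature (Fisher-factor) statistics with the
                    # K-FAC optimizer; the parameter update itself is expected
                    # to be applied elsewhere via the optimizer's
                    # apply_gradients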

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.proba_step = step_model.proba_step
                self.value = step_model.value
                self.initial_state = step_model.initial_state
                tf.global_variables_initializer().run(session=self.sess)
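
Example #2 below appears to be a later revision of the same method: it additionally checks that the policy is an ActorCriticPolicy, rejects continuous (Box) action spaces, wraps the graph construction in explicit variable scopes, and adds TensorBoard summaries.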
Example #2
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the ACKTR model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            if isinstance(self.action_space, Box):
                raise NotImplementedError(
                    "WIP: ACKTR does not support Continuous actions yet.")

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(num_cpu=self.nprocs,
                                                 graph=self.graph)

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, RecurrentActorCriticPolicy):
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_envs * self.n_steps

                self.model = step_model = self.policy(self.sess,
                                                      self.observation_space,
                                                      self.action_space,
                                                      self.n_envs,
                                                      1,
                                                      n_batch_step,
                                                      reuse=False,
                                                      **self.policy_kwargs)

                self.params = params = tf_util.get_trainable_vars("model")
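                # The second (train) policy instance is created inside an
                # extra "train_model" scope; tf_util.outer_scope_getter strips
                # that prefix during variable lookup so the weights created
                # under "model" by step_model are reused (reuse=True).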

                with tf.variable_scope(
                        "train_model",
                        reuse=True,
                        custom_getter=tf_util.outer_scope_getter(
                            "train_model")):
                    self.model2 = train_model = self.policy(
                        self.sess,
                        self.observation_space,
                        self.action_space,
                        self.n_envs,
                        self.n_steps,
                        n_batch_train,
                        reuse=True,
                        **self.policy_kwargs)

                with tf.variable_scope(
                        "loss",
                        reuse=False,
                        custom_getter=tf_util.outer_scope_getter("loss")):
                    self.advs_ph = advs_ph = tf.placeholder(tf.float32, [None])
                    self.rewards_ph = rewards_ph = tf.placeholder(
                        tf.float32, [None])
                    self.pg_lr_ph = pg_lr_ph = tf.placeholder(tf.float32, [])
                    self.action_ph = action_ph = train_model.pdtype.sample_placeholder(
                        [None])

                    logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits=train_model.policy, labels=action_ph)
                    self.logits = train_model.policy

                    # training loss
                    pg_loss = tf.reduce_mean(advs_ph * logpac)
                    self.entropy = entropy = tf.reduce_mean(
                        calc_entropy(train_model.policy))
                    self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy
                    self.vf_loss = vf_loss = mse(
                        tf.squeeze(train_model.value_fn), rewards_ph)
                    train_loss = pg_loss + self.vf_coef * vf_loss

                    # Fisher loss construction
                    self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
                    sample_net = train_model.value_fn + tf.random_normal(
                        tf.shape(train_model.value_fn))
                    self.vf_fisher = vf_fisher_loss = -self.vf_fisher_coef * tf.reduce_mean(
                        tf.pow(
                            train_model.value_fn -
                            tf.stop_gradient(sample_net), 2))
                    self.joint_fisher = pg_fisher_loss + vf_fisher_loss

                    tf.summary.scalar('entropy_loss', self.entropy)
                    tf.summary.scalar('policy_gradient_loss', pg_loss)
                    tf.summary.scalar('policy_gradient_fisher_loss',
                                      pg_fisher_loss)
                    tf.summary.scalar('value_function_loss', self.vf_loss)
                    tf.summary.scalar('value_function_fisher_loss',
                                      vf_fisher_loss)
                    tf.summary.scalar('loss', train_loss)

                    self.grads_check = tf.gradients(train_loss, params)

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(self.rewards_ph))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.pg_lr_ph))
                    tf.summary.scalar('advantage',
                                      tf.reduce_mean(self.advs_ph))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards',
                                             self.rewards_ph)
                        tf.summary.histogram('learning_rate', self.pg_lr_ph)
                        tf.summary.histogram('advantage', self.advs_ph)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', train_model.obs_ph)
                        else:
                            tf.summary.histogram('observation',
                                                 train_model.obs_ph)

                with tf.variable_scope(
                        "kfac",
                        reuse=False,
                        custom_getter=tf_util.outer_scope_getter("kfac")):
                    with tf.device('/gpu:0'):
                        self.optim = optim = kfac.KfacOptimizer(
                            learning_rate=pg_lr_ph,
                            clip_kl=self.kfac_clip,
                            momentum=0.9,
                            kfac_update=1,
                            epsilon=0.01,
                            stats_decay=0.99,
                            async_eigen_decomp=self.async_eigen_decomp,
                            cold_iter=10,
                            max_grad_norm=self.max_grad_norm,
                            verbose=self.verbose)

                        optim.compute_and_apply_stats(self.joint_fisher,
                                                      var_list=params)

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.proba_step = step_model.proba_step
                self.value = step_model.value
                self.initial_state = step_model.initial_state
                tf.global_variables_initializer().run(session=self.sess)

                self.summary = tf.summary.merge_all()
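
For context, a minimal usage sketch, assuming this setup_model belongs to the stable-baselines ACKTR class: the method is invoked from the model constructor, so building and training only requires the usual stable-baselines calls (the environment id and timestep count below are illustrative).

from stable_baselines import ACKTR
from stable_baselines.common.policies import MlpPolicy

# constructing the model builds the TensorFlow graph via setup_model()
model = ACKTR(MlpPolicy, "CartPole-v1", verbose=1)
model.learn(total_timesteps=25000)  # runs rollouts and K-FAC updates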