Code Example #1
    def setup_model(self):
        """
        Create all the functions and tensorflow graphs necessary to train the model
        """

        assert issubclass(self.policy, MetaLstmActorCriticPolicy), "Error: the input policy for the A2C model must be an " \
                                                                         "instance of MetaLstmActorCriticPolicy."

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_utils.make_session(graph=self.graph)

            # No step model is needed here because the step uses the same
            # n_batch, so a separate model would be redundant.
            policy_model = self.policy(sess=self.sess, input_length=self.input_length, output_length=self.output_length, n_steps=self.n_steps,
                                       window_size=self.window_size, layers=self.layers, lstm_units=self.lstm_units)

            with tf.variable_scope("loss", reuse=False):
                self.actions_ph = policy_model.pdtype.sample_placeholder([self.n_steps], name="action_ph")
                self.advs_ph = tf.placeholder(tf.float32, [self.n_steps], name="advs_ph")
                self.rewards_ph = tf.placeholder(tf.float32, [self.n_steps], name="rewards_ph")
                self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")

                neglogpac = policy_model.proba_distribution.neglogp(self.actions_ph)
                self.entropy = tf.reduce_mean(policy_model.proba_distribution.entropy())
                self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac)
                self.vf_loss = mse(tf.squeeze(policy_model.value_fn), self.rewards_ph)
                loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef

                self.trainable_variables = tf_utils.find_trainable_variables("model")
                grads = tf.gradients(loss, self.trainable_variables)
                if self.max_grad_norm is not None:
                    grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
                grads = list(zip(grads, self.trainable_variables))

            trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.alpha,
                                                epsilon=self.epsilon)
            self.apply_backprop = trainer.apply_gradients(grads)
            self.step = policy_model.step
            self.policy_model = policy_model
            self.value = self.policy_model.value
            tf.global_variables_initializer().run(session=self.sess)
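
All nine examples build essentially the same A2C objective: a policy-gradient term weighted by advantages, an entropy bonus scaled by ent_coef, and a value-function MSE scaled by vf_coef. The following is a minimal NumPy sketch of that loss composition; the array contents and coefficient values are made up for illustration and are not taken from any of these projects.

import numpy as np

# Hypothetical inputs shaped like the [n_steps] placeholders above.
neglogpac = np.array([1.2, 0.8, 2.1, 0.5])    # -log pi(a_t | s_t) for the sampled actions
advantages = np.array([0.3, -0.1, 1.5, 0.7])  # advs_ph
returns = np.array([1.0, 0.4, 2.2, 0.9])      # rewards_ph (discounted returns)
values = np.array([0.8, 0.5, 1.9, 1.1])       # squeezed value_fn predictions
entropy = 1.05                                # mean policy entropy
ent_coef, vf_coef = 0.01, 0.5                 # assumed coefficient values

pg_loss = np.mean(advantages * neglogpac)     # policy gradient loss
vf_loss = np.mean((values - returns) ** 2)    # value function loss (MSE)
loss = pg_loss - ent_coef * entropy + vf_coef * vf_loss
print(pg_loss, vf_loss, loss)
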
Code Example #2
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the A2C model must be an " \
                "instance of common.policies.ActorCriticPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(graph=self.graph)

                self.n_batch = self.n_envs * self.n_steps

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, LstmPolicy):
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_envs * self.n_steps

                step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                         n_batch_step, reuse=False)

                with tf.variable_scope("train_model", reuse=True,
                                       custom_getter=tf_util.outer_scope_getter("train_model")):
                    train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs,
                                              self.n_steps, n_batch_train, reuse=True)

                with tf.variable_scope("loss", reuse=False):
                    self.actions_ph = train_model.pdtype.sample_placeholder([None], name="action_ph")
                    self.advs_ph = tf.placeholder(tf.float32, [None], name="advs_ph")
                    self.rewards_ph = tf.placeholder(tf.float32, [None], name="rewards_ph")
                    self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")

                    neglogpac = train_model.proba_distribution.neglogp(self.actions_ph)
                    self.entropy = tf.reduce_mean(train_model.proba_distribution.entropy())
                    self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac)
                    self.vf_loss = mse(tf.squeeze(train_model.value_fn), self.rewards_ph)
                    loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef

                    tf.summary.scalar('entropy_loss', self.entropy)
                    tf.summary.scalar('policy_gradient_loss', self.pg_loss)
                    tf.summary.scalar('value_function_loss', self.vf_loss)
                    tf.summary.scalar('loss', loss)

                    self.params = find_trainable_variables("model")
                    grads = tf.gradients(loss, self.params)
                    if self.max_grad_norm is not None:
                        grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
                    grads = list(zip(grads, self.params))

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph))
                    tf.summary.histogram('discounted_rewards', self.rewards_ph)
                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate))
                    tf.summary.histogram('learning_rate', self.learning_rate)
                    tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph))
                    tf.summary.histogram('advantage', self.advs_ph)
                    if len(self.observation_space.shape) == 3:
                        tf.summary.image('observation', train_model.obs_ph)
                    else:
                        tf.summary.histogram('observation', train_model.obs_ph)

                trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.alpha,
                                                    epsilon=self.epsilon)
                self.apply_backprop = trainer.apply_gradients(grads)

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                # self.step_with_attention = step_model.step_with_attention
                self.proba_step = step_model.proba_step
                self.value = step_model.value
                self.initial_state = step_model.initial_state
                tf.global_variables_initializer().run(session=self.sess)

                self.summary = tf.summary.merge_all()
Code Example #3
File: a2c.py  Project: kdh0429/Relational_DRL
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the A2C model must be an " \
                                                                "instance of common.policies.ActorCriticPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                self.n_batch = self.n_envs * self.n_steps

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, RecurrentActorCriticPolicy):
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_envs * self.n_steps

                step_model = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         n_batch_step,
                                         reuse=False,
                                         **self.policy_kwargs)

                with tf.variable_scope(
                        "train_model",
                        reuse=True,
                        custom_getter=tf_util.outer_scope_getter(
                            "train_model")):
                    train_model = self.policy(self.sess,
                                              self.observation_space,
                                              self.action_space,
                                              self.n_envs,
                                              self.n_steps,
                                              n_batch_train,
                                              reuse=True,
                                              **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    self.actions_ph = train_model.pdtype.sample_placeholder(
                        [None], name="action_ph")
                    self.advs_ph = tf.placeholder(tf.float32, [None],
                                                  name="advs_ph")
                    self.rewards_ph = tf.placeholder(tf.float32, [None],
                                                     name="rewards_ph")
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                    neglogpac = train_model.proba_distribution.neglogp(
                        self.actions_ph)
                    self.entropy = tf.reduce_mean(
                        train_model.proba_distribution.entropy())
                    self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac)
                    self.vf_loss = mse(tf.squeeze(train_model.value_flat),
                                       self.rewards_ph)
                    # https://arxiv.org/pdf/1708.04782.pdf#page=9, https://arxiv.org/pdf/1602.01783.pdf#page=4
                    # and https://github.com/dennybritz/reinforcement-learning/issues/34
                    # suggest to add an entropy component in order to improve exploration.
                    loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef

                    tf.summary.scalar('entropy_loss', self.entropy)
                    tf.summary.scalar('policy_gradient_loss', self.pg_loss)
                    tf.summary.scalar('value_function_loss', self.vf_loss)
                    tf.summary.scalar('loss', loss)

                    self.params = tf_util.get_trainable_vars("model")
                    grads = tf.gradients(loss, self.params)
                    if self.max_grad_norm is not None:
                        grads, _ = tf.clip_by_global_norm(
                            grads, self.max_grad_norm)
                    grads = list(zip(grads, self.params))

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(self.rewards_ph))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))
                    tf.summary.scalar('advantage',
                                      tf.reduce_mean(self.advs_ph))
                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards',
                                             self.rewards_ph)
                        tf.summary.histogram('learning_rate',
                                             self.learning_rate_ph)
                        tf.summary.histogram('advantage', self.advs_ph)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', train_model.obs_ph)
                        else:
                            tf.summary.histogram('observation',
                                                 train_model.obs_ph)

                trainer = tf.train.RMSPropOptimizer(
                    learning_rate=self.learning_rate_ph,
                    decay=self.alpha,
                    epsilon=self.epsilon)
                self.apply_backprop = trainer.apply_gradients(grads)

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.proba_step = step_model.proba_step
                self.value = step_model.value
                self.initial_state = step_model.initial_state
                self.attention = step_model.attention
                tf.global_variables_initializer().run(session=self.sess)

                self.summary = tf.summary.merge_all()
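
Examples #2, #3, #8 and #9 finish with the same update pattern: gradients of the loss with respect to the "model" variables, optional clipping by global norm, then RMSProp applied to the (gradient, variable) pairs. A small NumPy sketch of what tf.clip_by_global_norm does to those gradients, with hypothetical values:

import numpy as np

max_grad_norm = 0.5                                 # assumed clipping threshold
grads = [np.array([0.3, -1.2]), np.array([2.0])]    # hypothetical per-variable gradients

# Global-norm clipping rescales every gradient by
# max_grad_norm / max(global_norm, max_grad_norm), so nothing changes
# when the global norm is already below the threshold.
global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
scale = max_grad_norm / max(global_norm, max_grad_norm)
clipped_grads = [g * scale for g in grads]
print(global_norm, clipped_grads)
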
Code Example #4
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the ACKTR model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            # Enable continuous actions tricks (normalized advantage)
            self.continuous_actions = isinstance(self.action_space, Box)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(num_cpu=self.nprocs,
                                                 graph=self.graph)

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, RecurrentActorCriticPolicy):
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_envs * self.n_steps

                step_model = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         n_batch_step,
                                         reuse=False,
                                         **self.policy_kwargs)

                self.params = params = tf_util.get_trainable_vars("model")

                with tf.variable_scope(
                        "train_model",
                        reuse=True,
                        custom_getter=tf_util.outer_scope_getter(
                            "train_model")):
                    train_model = self.policy(self.sess,
                                              self.observation_space,
                                              self.action_space,
                                              self.n_envs,
                                              self.n_steps,
                                              n_batch_train,
                                              reuse=True,
                                              **self.policy_kwargs)

                with tf.variable_scope(
                        "loss",
                        reuse=False,
                        custom_getter=tf_util.outer_scope_getter("loss")):
                    self.advs_ph = advs_ph = tf.placeholder(tf.float32, [None])
                    self.rewards_ph = rewards_ph = tf.placeholder(
                        tf.float32, [None])
                    self.learning_rate_ph = learning_rate_ph = tf.placeholder(
                        tf.float32, [])
                    self.actions_ph = train_model.pdtype.sample_placeholder(
                        [None])

                    neg_log_prob = train_model.proba_distribution.neglogp(
                        self.actions_ph)

                    # training loss
                    pg_loss = tf.reduce_mean(advs_ph * neg_log_prob)
                    self.entropy = entropy = tf.reduce_mean(
                        train_model.proba_distribution.entropy())
                    self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy
                    self.vf_loss = vf_loss = mse(
                        tf.squeeze(train_model.value_fn), rewards_ph)
                    train_loss = pg_loss + self.vf_coef * vf_loss

                    # Fisher loss construction
                    self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(
                        neg_log_prob)
                    sample_net = train_model.value_fn + tf.random_normal(
                        tf.shape(train_model.value_fn))
                    self.vf_fisher = vf_fisher_loss = -self.vf_fisher_coef * tf.reduce_mean(
                        tf.pow(
                            train_model.value_fn -
                            tf.stop_gradient(sample_net), 2))
                    self.joint_fisher = pg_fisher_loss + vf_fisher_loss

                    tf.summary.scalar('entropy_loss', self.entropy)
                    tf.summary.scalar('policy_gradient_loss', pg_loss)
                    tf.summary.scalar('policy_gradient_fisher_loss',
                                      pg_fisher_loss)
                    tf.summary.scalar('value_function_loss', self.vf_loss)
                    tf.summary.scalar('value_function_fisher_loss',
                                      vf_fisher_loss)
                    tf.summary.scalar('loss', train_loss)

                    self.grads_check = tf.gradients(train_loss, params)

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(self.rewards_ph))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))
                    tf.summary.scalar('advantage',
                                      tf.reduce_mean(self.advs_ph))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards',
                                             self.rewards_ph)
                        tf.summary.histogram('learning_rate',
                                             self.learning_rate_ph)
                        tf.summary.histogram('advantage', self.advs_ph)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', train_model.obs_ph)
                        else:
                            tf.summary.histogram('observation',
                                                 train_model.obs_ph)

                with tf.variable_scope(
                        "kfac",
                        reuse=False,
                        custom_getter=tf_util.outer_scope_getter("kfac")):
                    with tf.device('/gpu:0'):
                        self.optim = optim = kfac.KfacOptimizer(
                            learning_rate=learning_rate_ph,
                            clip_kl=self.kfac_clip,
                            momentum=0.9,
                            kfac_update=self.kfac_update,
                            epsilon=0.01,
                            stats_decay=0.99,
                            async_eigen_decomp=self.async_eigen_decomp,
                            cold_iter=10,
                            max_grad_norm=self.max_grad_norm,
                            verbose=self.verbose)

                        optim.compute_and_apply_stats(self.joint_fisher,
                                                      var_list=params)

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.proba_step = step_model.proba_step
                self.value = step_model.value
                self.initial_state = step_model.initial_state
                tf.global_variables_initializer().run(session=self.sess)

                self.summary = tf.summary.merge_all()
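
The ACKTR variants (examples #4, #5 and #7) additionally construct a Fisher loss for the K-FAC optimizer: the negative mean log-probability of the sampled actions, plus a value-function term measured against a noise-perturbed copy of the value prediction (the stop_gradient in the graph keeps that noisy target fixed). A rough NumPy sketch of those two terms, with hypothetical numbers:

import numpy as np

rng = np.random.default_rng(0)

neglogpac = np.array([1.2, 0.8, 2.1, 0.5])   # -log pi(a_t | s_t), hypothetical
value_fn = np.array([0.8, 0.5, 1.9, 1.1])    # value predictions, hypothetical
vf_fisher_coef = 1.0                         # assumed coefficient

pg_fisher_loss = -np.mean(neglogpac)

# sample_net = value prediction plus standard-normal noise; its squared
# distance to the prediction forms the value-function Fisher term.
sample_net = value_fn + rng.standard_normal(value_fn.shape)
vf_fisher_loss = -vf_fisher_coef * np.mean((value_fn - sample_net) ** 2)

joint_fisher = pg_fisher_loss + vf_fisher_loss
print(pg_fisher_loss, vf_fisher_loss, joint_fisher)
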
Code Example #5
    def setup_model(self):
        with SetVerbosity(self.verbose):

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(num_cpu=self.nprocs,
                                                 graph=self.graph)

                self.advs_ph = advs_ph = tf.placeholder(tf.float32, [None])
                self.rewards_ph = rewards_ph = tf.placeholder(
                    tf.float32, [None])
                self.pg_lr_ph = pg_lr_ph = tf.placeholder(tf.float32, [])

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, LstmPolicy):
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_envs * self.n_steps

                self.model = step_model = self.policy(self.sess,
                                                      self.observation_space,
                                                      self.action_space,
                                                      self.n_envs,
                                                      1,
                                                      n_batch_step,
                                                      reuse=False)
                self.model2 = train_model = self.policy(self.sess,
                                                        self.observation_space,
                                                        self.action_space,
                                                        self.n_envs,
                                                        self.n_steps,
                                                        n_batch_train,
                                                        reuse=True)

                self.action_ph = action_ph = train_model.pdtype.sample_placeholder(
                    [None])

                logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=train_model.policy, labels=action_ph)
                self.logits = train_model.policy

                # training loss
                pg_loss = tf.reduce_mean(advs_ph * logpac)
                self.entropy = entropy = tf.reduce_mean(
                    calc_entropy(train_model.policy))
                self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy
                self.vf_loss = vf_loss = mse(tf.squeeze(train_model.value_fn),
                                             rewards_ph)
                train_loss = pg_loss + self.vf_coef * vf_loss

                # Fisher loss construction
                self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
                sample_net = train_model.value_fn + tf.random_normal(
                    tf.shape(train_model.value_fn))
                self.vf_fisher = vf_fisher_loss = -self.vf_fisher_coef * tf.reduce_mean(
                    tf.pow(train_model.value_fn - tf.stop_gradient(sample_net),
                           2))
                self.joint_fisher = pg_fisher_loss + vf_fisher_loss

                self.params = params = find_trainable_variables("model")

                self.grads_check = tf.gradients(train_loss, params)

                with tf.device('/gpu:0'):
                    self.optim = optim = kfac.KfacOptimizer(
                        learning_rate=pg_lr_ph,
                        clip_kl=self.kfac_clip,
                        momentum=0.9,
                        kfac_update=1,
                        epsilon=0.01,
                        stats_decay=0.99,
                        # `async` is a reserved keyword in Python 3.7+; newer
                        # stable-baselines releases call this argument `async_eigen_decomp`.
                        async=1,
                        cold_iter=10,
                        max_grad_norm=self.max_grad_norm,
                        verbose=self.verbose)

                    optim.compute_and_apply_stats(self.joint_fisher,
                                                  var_list=params)

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.proba_step = step_model.proba_step
                self.value = step_model.value
                self.initial_state = step_model.initial_state
                tf.global_variables_initializer().run(session=self.sess)
Code Example #6
File: MultiTaskA2C.py  Project: andris955/diplomaterv
    def setup_train_model(self, transfer=False):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, MultiTaskActorCriticPolicy), "Error: the input policy for the A2C model must be an " \
                                                                        "instance of MultiTaskActorCriticPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_utils.make_session(graph=self.graph)

                self.n_batch = self.n_envs_per_task * self.n_steps

                step_model = self.policy(self.sess,
                                         self.tasks,
                                         self.observation_space_dict,
                                         self.action_space_dict,
                                         self.n_envs_per_task,
                                         n_steps=1,
                                         reuse=False)

                with tf.variable_scope(
                        "train_model",
                        reuse=True,
                        custom_getter=tf_utils.outer_scope_getter(
                            "train_model")):
                    train_model = self.policy(self.sess,
                                              self.tasks,
                                              self.observation_space_dict,
                                              self.action_space_dict,
                                              self.n_envs_per_task,
                                              self.n_steps,
                                              reuse=True)

                with tf.variable_scope("loss", reuse=False):
                    self.actions_ph = tf.placeholder(dtype=tf.int32,
                                                     shape=[None],
                                                     name="actions_ph")
                    self.advs_ph = tf.placeholder(tf.float32, [None],
                                                  name="advs_ph")  # advantages
                    self.rewards_ph = tf.placeholder(tf.float32, [None],
                                                     name="rewards_ph")
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                    neglogpac = {}
                    losses = {}
                    for task in self.tasks:
                        neglogpac[task] = train_model.proba_distribution_dict[
                            task].neglogp(self.actions_ph)
                        self.entropy[task] = tf.reduce_mean(
                            train_model.proba_distribution_dict[task].entropy(
                            ))
                        self.pg_loss[task] = tf.reduce_mean(
                            self.advs_ph *
                            neglogpac[task])  # policy gradient loss
                        self.vf_loss[task] = mse(
                            tf.squeeze(train_model.value_fn_dict[task]),
                            self.rewards_ph)
                        losses[task] = self.pg_loss[task] - self.entropy[
                            task] * self.ent_coef + self.vf_loss[
                                task] * self.vf_coef

                        tf.summary.scalar(task + '_policy_gradient_loss',
                                          self.pg_loss[task])
                        tf.summary.scalar(task + '_value_function_loss',
                                          self.vf_loss[task])

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))

                optimizers = {}
                grads_and_vars = {}
                self.apply_backprop = {}
                for task in self.tasks:
                    optimizers[task] = tf.train.RMSPropOptimizer(
                        learning_rate=self.learning_rate_ph,
                        decay=self.alpha,
                        epsilon=self.epsilon)
                    grads_and_vars[task] = optimizers[task].compute_gradients(
                        losses[task])
                    if self.max_grad_norm is not None:
                        grads = [grad for grad, var in grads_and_vars[task]]
                        variables = [var for grad, var in grads_and_vars[task]]
                        clipped_grads, _ = tf.clip_by_global_norm(
                            grads, self.max_grad_norm)
                        grads_and_vars[task] = list(zip(clipped_grads, variables))
                    self.apply_backprop[task] = optimizers[
                        task].apply_gradients(grads_and_vars[task])

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.value = step_model.value

                self.trainable_variables = tf_utils.find_trainable_variables(
                    "model")

                tf.global_variables_initializer().run(session=self.sess)

                self.summary = tf.summary.merge_all()

                if not transfer:
                    self.sess.graph.finalize()
Code Example #7
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the ACKTR model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            if isinstance(self.action_space, Box):
                raise NotImplementedError("WIP: ACKTR does not support Continuous actions yet.")

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(num_cpu=self.nprocs, graph=self.graph)

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, LstmPolicy):
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_envs * self.n_steps

                self.model = step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs,
                                                      1, n_batch_step, reuse=False)

                self.params = params = find_trainable_variables("model")

                with tf.variable_scope("train_model", reuse=True,
                                       custom_getter=tf_util.outer_scope_getter("train_model")):
                    self.model2 = train_model = self.policy(self.sess, self.observation_space, self.action_space,
                                                            self.n_envs, self.n_steps, n_batch_train,
                                                            reuse=True)

                with tf.variable_scope("loss", reuse=False, custom_getter=tf_util.outer_scope_getter("loss")):
                    self.advs_ph = advs_ph = tf.placeholder(tf.float32, [None])
                    self.rewards_ph = rewards_ph = tf.placeholder(tf.float32, [None])
                    self.pg_lr_ph = pg_lr_ph = tf.placeholder(tf.float32, [])
                    self.action_ph = action_ph = train_model.pdtype.sample_placeholder([None])

                    logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.policy, labels=action_ph)
                    self.logits = train_model.policy

                    # training loss
                    pg_loss = tf.reduce_mean(advs_ph * logpac)
                    self.entropy = entropy = tf.reduce_mean(calc_entropy(train_model.policy))
                    self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy
                    self.vf_loss = vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph)
                    train_loss = pg_loss + self.vf_coef * vf_loss

                    # Fisher loss construction
                    self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
                    sample_net = train_model.value_fn + tf.random_normal(tf.shape(train_model.value_fn))
                    self.vf_fisher = vf_fisher_loss = - self.vf_fisher_coef * tf.reduce_mean(
                        tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2))
                    self.joint_fisher = pg_fisher_loss + vf_fisher_loss

                    tf.summary.scalar('entropy_loss', self.entropy)
                    tf.summary.scalar('policy_gradient_loss', pg_loss)
                    tf.summary.scalar('policy_gradient_fisher_loss', pg_fisher_loss)
                    tf.summary.scalar('value_function_loss', self.vf_loss)
                    tf.summary.scalar('value_function_fisher_loss', vf_fisher_loss)
                    tf.summary.scalar('loss', train_loss)

                    self.grads_check = tf.gradients(train_loss, params)

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph))
                    tf.summary.histogram('discounted_rewards', self.rewards_ph)
                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.pg_lr_ph))
                    tf.summary.histogram('learning_rate', self.pg_lr_ph)
                    tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph))
                    tf.summary.histogram('advantage', self.advs_ph)
                    if len(self.observation_space.shape) == 3:
                        tf.summary.image('observation', train_model.obs_ph)
                    else:
                        tf.summary.histogram('observation', train_model.obs_ph)

                with tf.variable_scope("kfac", reuse=False, custom_getter=tf_util.outer_scope_getter("kfac")):
                    with tf.device('/gpu:0'):
                        # `async` is a reserved keyword in Python 3.7+; newer
                        # stable-baselines releases call this argument `async_eigen_decomp`.
                        self.optim = optim = kfac.KfacOptimizer(learning_rate=pg_lr_ph, clip_kl=self.kfac_clip,
                                                                momentum=0.9, kfac_update=1, epsilon=0.01,
                                                                stats_decay=0.99, async=1, cold_iter=10,
                                                                max_grad_norm=self.max_grad_norm, verbose=self.verbose)

                        optim.compute_and_apply_stats(self.joint_fisher, var_list=params)

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.proba_step = step_model.proba_step
                self.value = step_model.value
                self.initial_state = step_model.initial_state
                tf.global_variables_initializer().run(session=self.sess)

                self.summary = tf.summary.merge_all()
Code Example #8
  def setup_model(self):
    with SetVerbosity(self.verbose):

      assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the A2C model must be an " \
                                                         "instance of common.policies.ActorCriticPolicy."
      assert issubclass(self.policy, FeedForwardPolicy), "Error: the input policy for the A2C model must be an " \
                                                         "instance of common.policies.FeedFowardPolicy."

      self.graph = tf.Graph()
      with self.graph.as_default():
        self.sess = tf_util.make_session(graph=self.graph)

        self.n_batch = self.n_envs * self.n_steps

        n_batch_step = None
        n_batch_train = None
        n_batch_sil = None
        if issubclass(self.policy, LstmPolicy):
          n_batch_step = self.n_envs
          n_batch_train = self.n_envs * self.n_steps
          # TODO: Add
          n_batch_sil = 512

        step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                 n_batch_step, reuse=False)

        # TODO: Add
        with tf.variable_scope("train_model", reuse=True,
                               custom_getter=tf_util.outer_scope_getter("train_model")):
          train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs,
                                    self.n_steps, n_batch_train, reuse=True)

        with tf.variable_scope("sil_model", reuse=True,
                               custom_getter=tf_util.outer_scope_getter("sil_model")):
          sil_model = self.policy(self.sess, self.observation_space, self.action_space,
                                  self.n_envs, self.n_steps, n_batch_sil, reuse=True)

        with tf.variable_scope("loss", reuse=False):
          # self.actions_ph = train_model.pdtype.sample_placeholder([None], name="action_ph")
          self.actions_ph = train_model.action_ph
          self.advs_ph = tf.placeholder(tf.float32, [None], name="advs_ph")
          self.rewards_ph = tf.placeholder(tf.float32, [None], name="rewards_ph")
          self.successor_feature_ph = tf.placeholder(tf.float32, [None, FEATURE_SIZE], name="successor_feature_ph")
          self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")

          neglogpac = train_model.proba_distribution.neglogp(self.actions_ph)
          last_frame = tf.reshape(train_model.obs_ph[..., 3], shape=[-1, 84 * 84])
          recons_losses = tf.squared_difference(x=last_frame,
                                                y=train_model.recons_mod)

          self.recons_loss = tf.losses.mean_squared_error(labels=last_frame,
                                                          predictions=train_model.recons_mod)
          self.entropy = tf.reduce_mean(train_model.proba_distribution.entropy())
          self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac)
          if self.use_recons:
            self.vf_loss = mse(tf.squeeze(train_model.value_fn),
                               self.rewards_ph + self.recons_intri * tf.stop_gradient(self.recons_loss))
          else:
            self.vf_loss = mse(tf.squeeze(train_model.value_fn), self.rewards_ph)
          # TODO: loss of SF
          self.sf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.successor_feature),
                                            self.successor_feature_ph))
          loss = self.pg_loss - \
                 self.entropy * self.ent_coef + \
                 self.vf_loss * self.vf_coef
          if self.use_recons:
            loss += self.recons_loss * self.recons_coef
          elif self.use_sf:
            loss += self.sf_loss * self.sf_coef + \
              self.recons_loss * self.recons_coef
          tf.summary.scalar('recons_loss/max', tf.reduce_max(recons_losses))
          tf.summary.scalar('recons_loss/min', tf.reduce_min(recons_losses))
          tf.summary.scalar('recons_loss', self.recons_loss)
          tf.summary.scalar('entropy_loss', self.entropy)
          tf.summary.scalar('policy_gradient_loss', self.pg_loss)
          tf.summary.scalar('value_function_loss', self.vf_loss)
          tf.summary.scalar('successor_feature_loss', self.sf_loss)
          tf.summary.scalar('loss', loss)

          self.params = find_trainable_variables("model")
          grads = tf.gradients(loss, self.params)
          if self.max_grad_norm is not None:
            grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
          grads = list(zip(grads, self.params))

        _last_frame = tf.reshape(last_frame, [-1, 84, 84, 1])
        _recons_mod = tf.reshape(train_model.recons_mod, [-1, 84, 84, 1])
        with tf.variable_scope("input_info", reuse=False):
          tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph))
          tf.summary.histogram('discounted_rewards', self.rewards_ph)
          tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate))
          tf.summary.histogram('learning_rate', self.learning_rate)
          tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph))
          tf.summary.histogram('advantage', self.advs_ph)
          tf.summary.image('last_frame', _last_frame)
          tf.summary.image('reconstruction', _recons_mod)
          if len(self.observation_space.shape) == 3:
            tf.summary.image('observation', train_model.obs_ph)
          else:
            tf.summary.histogram('observation', train_model.obs_ph)

        trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.alpha,
                                            epsilon=self.epsilon)
        self.apply_backprop = trainer.apply_gradients(grads)

        # TODO: Add
        self.sil = SelfImitation(
          model_ob=sil_model.obs_ph,
          model_vf=sil_model.value_fn,
          model_entropy=sil_model.proba_distribution.entropy(),
          fn_value=sil_model.value,
          fn_neg_log_prob=sil_model.proba_distribution.neglogp,
          ac_space=self.action_space,
          fn_reward=np.sign,
          n_env=self.n_envs,
          n_update=self.sil_update,
          beta=self.sil_beta)

        self.sil.build_train_op(
          params=self.params,
          optim=trainer,
          lr=self.learning_rate_ph,
          max_grad_norm=self.max_grad_norm)

        self.train_model = train_model
        self.step_model = step_model
        # self.step = step_model.step
        self.step = step_model.step_with_sf
        self.estimate_recons = step_model.estimate_recons
        self.proba_step = step_model.proba_step
        self.value = step_model.value
        # TODO: Add
        self.successor_feature = step_model.estimate_sf
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=self.sess)

        self.summary = tf.summary.merge_all()
Code Example #9
File: a2c.py  Project: safrooze/stable-baselines
    def setup_model(self):
        with SetVerbosity(self.verbose):

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(graph=self.graph)

                self.n_batch = self.n_envs * self.n_steps

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, LstmPolicy):
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_envs * self.n_steps

                step_model = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         n_batch_step,
                                         reuse=False)
                train_model = self.policy(self.sess,
                                          self.observation_space,
                                          self.action_space,
                                          self.n_envs,
                                          self.n_steps,
                                          n_batch_train,
                                          reuse=True)

                self.actions_ph = train_model.pdtype.sample_placeholder([None])
                self.advs_ph = tf.placeholder(tf.float32, [None])
                self.rewards_ph = tf.placeholder(tf.float32, [None])
                self.learning_rate_ph = tf.placeholder(tf.float32, [])

                neglogpac = train_model.proba_distribution.neglogp(
                    self.actions_ph)
                self.entropy = tf.reduce_mean(
                    train_model.proba_distribution.entropy())
                self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac)
                self.vf_loss = mse(tf.squeeze(train_model.value_fn),
                                   self.rewards_ph)
                loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef

                self.params = find_trainable_variables("model")
                grads = tf.gradients(loss, self.params)
                if self.max_grad_norm is not None:
                    grads, _ = tf.clip_by_global_norm(grads,
                                                      self.max_grad_norm)
                grads = list(zip(grads, self.params))
                trainer = tf.train.RMSPropOptimizer(
                    learning_rate=self.learning_rate_ph,
                    decay=self.alpha,
                    epsilon=self.epsilon)
                self.apply_backprop = trainer.apply_gradients(grads)

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.proba_step = step_model.proba_step
                self.value = step_model.value
                self.initial_state = step_model.initial_state
                tf.global_variables_initializer().run(session=self.sess)
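
For context, none of these setup_model implementations is called directly by user code: in stable-baselines the algorithm constructor builds the TensorFlow graph via setup_model, and learn() then drives training. A minimal usage sketch, assuming stable-baselines 2.x (TensorFlow 1.x) and Gym are installed; the policy name and environment id here are just examples:

from stable_baselines import A2C

# The constructor builds the graph through setup_model().
model = A2C('MlpPolicy', 'CartPole-v1', verbose=1)
model.learn(total_timesteps=10000)
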