def setup_model(self):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO2 model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.n_batch = self.n_envs * self.n_steps

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, RecurrentActorCriticPolicy):
                    assert self.n_envs % self.nminibatches == 0, "For recurrent policies, "\
                        "the number of environments run in parallel should be a multiple of nminibatches."
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_batch // self.nminibatches

                act_model = self.policy(self.sess,
                                        self.observation_space,
                                        self.action_space,
                                        self.n_envs,
                                        1,
                                        n_batch_step,
                                        reuse=False,
                                        **self.policy_kwargs)
                with tf.variable_scope(
                        "train_model",
                        reuse=True,
                        custom_getter=tf_util.outer_scope_getter(
                            "train_model")):
                    train_model = self.policy(self.sess,
                                              self.observation_space,
                                              self.action_space,
                                              self.n_envs // self.nminibatches,
                                              self.n_steps,
                                              n_batch_train,
                                              reuse=True,
                                              **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    self.action_ph = train_model.pdtype.sample_placeholder(
                        [None], name="action_ph")
                    self.advs_ph = tf.placeholder(tf.float32, [None],
                                                  name="advs_ph")
                    self.rewards_ph = tf.placeholder(tf.float32, [None],
                                                     name="rewards_ph")
                    self.old_neglog_pac_ph = tf.placeholder(
                        tf.float32, [None], name="old_neglog_pac_ph")
                    self.old_vpred_ph = tf.placeholder(tf.float32, [None],
                                                       name="old_vpred_ph")
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")
                    self.clip_range_ph = tf.placeholder(tf.float32, [],
                                                        name="clip_range_ph")

                    neglogpac = train_model.proba_distribution.neglogp(
                        self.action_ph)
                    self.entropy = tf.reduce_mean(
                        train_model.proba_distribution.entropy())

                    vpred = train_model.value_flat

                    # Value function clipping: not present in the original PPO
                    if self.cliprange_vf is None:
                        # Default behavior (legacy from OpenAI baselines):
                        # use the same clipping as for the policy
                        self.clip_range_vf_ph = self.clip_range_ph
                        self.cliprange_vf = self.cliprange
                    elif isinstance(self.cliprange_vf,
                                    (float, int)) and self.cliprange_vf < 0:
                        # Original PPO implementation: no value function clipping
                        self.clip_range_vf_ph = None
                    else:
                        # Last possible behavior: clipping range
                        # specific to the value function
                        self.clip_range_vf_ph = tf.placeholder(
                            tf.float32, [], name="clip_range_vf_ph")

                    if self.clip_range_vf_ph is None:
                        # No clipping
                        vpred_clipped = train_model.value_flat
                    else:
                        # Clip the different between old and new value
                        # NOTE: this depends on the reward scaling
                        vpred_clipped = self.old_vpred_ph + \
                            tf.clip_by_value(train_model.value_flat - self.old_vpred_ph,
                                             - self.clip_range_vf_ph, self.clip_range_vf_ph)

                    vf_losses1 = tf.square(vpred - self.rewards_ph)
                    vf_losses2 = tf.square(vpred_clipped - self.rewards_ph)
                    self.vf_loss = .5 * tf.reduce_mean(
                        tf.maximum(vf_losses1, vf_losses2))

                    ratio = tf.exp(self.old_neglog_pac_ph - neglogpac)
                    pg_losses = -self.advs_ph * ratio
                    pg_losses2 = -self.advs_ph * tf.clip_by_value(
                        ratio, 1.0 - self.clip_range_ph,
                        1.0 + self.clip_range_ph)
                    self.pg_loss = tf.reduce_mean(
                        tf.maximum(pg_losses, pg_losses2))
                    self.approxkl = .5 * tf.reduce_mean(
                        tf.square(neglogpac - self.old_neglog_pac_ph))
                    self.clipfrac = tf.reduce_mean(
                        tf.cast(
                            tf.greater(tf.abs(ratio - 1.0),
                                       self.clip_range_ph), tf.float32))
                    loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef

                    tf.summary.scalar('entropy_loss', self.entropy)
                    tf.summary.scalar('policy_gradient_loss', self.pg_loss)
                    tf.summary.scalar('value_function_loss', self.vf_loss)
                    tf.summary.scalar('approximate_kullback-leibler',
                                      self.approxkl)
                    tf.summary.scalar('clip_factor', self.clipfrac)
                    tf.summary.scalar('loss', loss)

                    with tf.variable_scope('model'):
                        self.params = tf.trainable_variables()
                        if self.full_tensorboard_log:
                            for var in self.params:
                                tf.summary.histogram(var.name, var)
                    grads = tf.gradients(loss, self.params)
                    if self.max_grad_norm is not None:
                        grads, _grad_norm = tf.clip_by_global_norm(
                            grads, self.max_grad_norm)
                    grads = list(zip(grads, self.params))
                trainer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate_ph, epsilon=1e-5)
                self._train = trainer.apply_gradients(grads)

                self.loss_names = [
                    'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
                    'clipfrac'
                ]

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(self.rewards_ph))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))
                    tf.summary.scalar('advantage',
                                      tf.reduce_mean(self.advs_ph))
                    tf.summary.scalar('clip_range',
                                      tf.reduce_mean(self.clip_range_ph))
                    if self.clip_range_vf_ph is not None:
                        tf.summary.scalar(
                            'clip_range_vf',
                            tf.reduce_mean(self.clip_range_vf_ph))

                    tf.summary.scalar('old_neglog_action_probability',
                                      tf.reduce_mean(self.old_neglog_pac_ph))
                    tf.summary.scalar('old_value_pred',
                                      tf.reduce_mean(self.old_vpred_ph))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards',
                                             self.rewards_ph)
                        tf.summary.histogram('learning_rate',
                                             self.learning_rate_ph)
                        tf.summary.histogram('advantage', self.advs_ph)
                        tf.summary.histogram('clip_range', self.clip_range_ph)
                        tf.summary.histogram('old_neglog_action_probability',
                                             self.old_neglog_pac_ph)
                        tf.summary.histogram('old_value_pred',
                                             self.old_vpred_ph)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', train_model.obs_ph)
                        else:
                            tf.summary.histogram('observation',
                                                 train_model.obs_ph)

                self.train_model = train_model
                self.act_model = act_model
                self.step = act_model.step
                self.proba_step = act_model.proba_step
                self.value = act_model.value
                self.initial_state = act_model.initial_state
                tf.global_variables_initializer().run(session=self.sess)  # pylint: disable=E1101

                self.summary = tf.summary.merge_all()
Esempio n. 2
0
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the A2C model must be an " \
                                                                "instance of common.policies.ActorCriticPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                self.n_batch = self.n_envs * self.n_steps

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, RecurrentActorCriticPolicy):
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_envs * self.n_steps

                step_model = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         n_batch_step,
                                         reuse=False,
                                         **self.policy_kwargs)

                with tf.variable_scope(
                        "train_model",
                        reuse=True,
                        custom_getter=tf_util.outer_scope_getter(
                            "train_model")):
                    train_model = self.policy(self.sess,
                                              self.observation_space,
                                              self.action_space,
                                              self.n_envs,
                                              self.n_steps,
                                              n_batch_train,
                                              reuse=True,
                                              **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    self.actions_ph = train_model.pdtype.sample_placeholder(
                        [None], name="action_ph")
                    self.advs_ph = tf.placeholder(tf.float32, [None],
                                                  name="advs_ph")
                    self.rewards_ph = tf.placeholder(tf.float32, [None],
                                                     name="rewards_ph")
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                    neglogpac = train_model.proba_distribution.neglogp(
                        self.actions_ph)
                    self.entropy = tf.reduce_mean(
                        train_model.proba_distribution.entropy())
                    self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac)
                    self.vf_loss = mse(tf.squeeze(train_model.value_flat),
                                       self.rewards_ph)
                    # https://arxiv.org/pdf/1708.04782.pdf#page=9, https://arxiv.org/pdf/1602.01783.pdf#page=4
                    # and https://github.com/dennybritz/reinforcement-learning/issues/34
                    # suggest to add an entropy component in order to improve exploration.
                    loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef

                    tf.summary.scalar('entropy_loss', self.entropy)
                    tf.summary.scalar('policy_gradient_loss', self.pg_loss)
                    tf.summary.scalar('value_function_loss', self.vf_loss)
                    tf.summary.scalar('loss', loss)

                    self.params = tf_util.get_trainable_vars("model")
                    grads = tf.gradients(loss, self.params)
                    if self.max_grad_norm is not None:
                        grads, _ = tf.clip_by_global_norm(
                            grads, self.max_grad_norm)
                    grads = list(zip(grads, self.params))

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(self.rewards_ph))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))
                    tf.summary.scalar('advantage',
                                      tf.reduce_mean(self.advs_ph))
                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards',
                                             self.rewards_ph)
                        tf.summary.histogram('learning_rate',
                                             self.learning_rate_ph)
                        tf.summary.histogram('advantage', self.advs_ph)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', train_model.obs_ph)
                        else:
                            tf.summary.histogram('observation',
                                                 train_model.obs_ph)

                trainer = tf.train.RMSPropOptimizer(
                    learning_rate=self.learning_rate_ph,
                    decay=self.alpha,
                    epsilon=self.epsilon,
                    momentum=self.momentum)
                self.apply_backprop = trainer.apply_gradients(grads)

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.proba_step = step_model.proba_step
                self.value = step_model.value
                self.initial_state = step_model.initial_state
                tf.global_variables_initializer().run(session=self.sess)

                self.summary = tf.summary.merge_all()
Esempio n. 3
0
    def setup_entire_model(self):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, LatentSkillsPolicy), "Error: the input policy for the PlannedPPO model " \
                                                                "must be an instance of LatentSkillsPolicy."

            self.n_batch = self.n_envs * self.n_steps

            n_cpu = multiprocessing.cpu_count()
            if sys.platform == 'darwin':
                n_cpu //= 2

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(num_cpu=n_cpu,
                                                 graph=self.graph)

                n_batch_step = None
                n_batch_train = None

                act_model = self.policy(self.sess,
                                        self.observation_space,
                                        self.action_space,
                                        self.z_dim,
                                        self.goal_dim,
                                        self.traj_len,
                                        self.n_envs,
                                        1,
                                        n_batch_step,
                                        reuse=False,
                                        **self.policy_kwargs)
                with tf.variable_scope(
                        "train_model",
                        reuse=True,
                        custom_getter=tf_util.outer_scope_getter(
                            "train_model")):
                    train_model = self.policy(self.sess,
                                              self.observation_space,
                                              self.action_space,
                                              self.z_dim,
                                              self.goal_dim,
                                              self.traj_len,
                                              self.n_envs // self.nminibatches,
                                              self.n_steps,
                                              n_batch_train,
                                              reuse=True,
                                              **self.policy_kwargs)
                with tf.variable_scope("loss", reuse=False):
                    self.action_ph = train_model.pdtype.sample_placeholder(
                        [None], name="action_ph")
                    self.advs_ph = tf.placeholder(tf.float32, [None],
                                                  name="advs_ph")
                    self.rewards_ph = tf.placeholder(tf.float32, [None],
                                                     name="rewards_ph")
                    self.old_neglog_pac_ph = tf.placeholder(
                        tf.float32, [None], name="old_neglog_pac_ph")
                    self.old_vpred_ph = tf.placeholder(tf.float32, [None],
                                                       name="old_vpred_ph")
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")
                    self.clip_range_ph = tf.placeholder(tf.float32, [],
                                                        name="clip_range_ph")
                    #TODO: add placeholders for vae and cvae loss functions
                    self.trajectory_ph = tf.placeholder(tf.float32, [None],
                                                        name="trajectory_ph")
                    self.trajectory_hat_ph = tf.placeholder(
                        tf.float32, [None], name="trajectory_hat_ph")
                    self.z_mean_ph = tf.placeholder(tf.float32, [None],
                                                    name="z_mean_ph")
                    self.z_stddev_ph = tf.placeholder(tf.float32, [None],
                                                      "z_stddev_ph")

                    neglogpac = train_model.proba_distribution.neglogp(
                        self.action_ph)
                    self.entropy = tf.reduce_mean(
                        train_model.proba_distribution.entropy())

                    vpred = train_model._value
                    vpredclipped = self.old_vpred_ph + tf.clip_by_value(
                        train_model._value - self.old_vpred_ph,
                        -self.clip_range_ph, self.clip_range_ph)
                    vf_losses1 = tf.square(vpred - self.rewards_ph)
                    vf_losses2 = tf.square(vpredclipped - self.rewards_ph)
                    self.vf_loss = 0.5 * tf.reduce_mean(
                        tf.maximum(vf_losses1, vf_losses2))
                    ratio = tf.exp(self.old_neglog_pac_ph - neglogpac)
                    pg_losses = -self.advs_ph * ratio
                    pg_losses2 = -self.advs_ph * tf.clip_by_value(
                        ratio, 1.0 - self.clip_range_ph,
                        1.0 + self.clip_range_ph)
                    self.pg_loss = tf.reduce_mean(
                        tf.maximum(pg_losses, pg_losses2))
                    self.approxkl = 0.5 * tf.reduce_mean(
                        tf.square(neglogpac - self.old_neglog_pac_ph))
                    self.clipfrac = tf.reduce_mean(
                        tf.to_float(
                            tf.greater(tf.abs(ratio - 1.0),
                                       self.clip_range_ph)))
                    loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef
                    #TODO: add loss terms from vae and cvae to loss expression above
                    traj_loss = tf.reduce_sum(
                        tf.squared_difference(self.trajectory_ph,
                                              self.trajectory_hat_ph), 1)
                    latent_skill_loss = -0.5 * tf.reduce_sum(1.0 + 2.0 * self.z_stddev_ph - tf.square(self.z_mean_ph) - \
                                                             tf.exp(2.0 * self.z_stddev_ph), 1)
                    self.skill_vae_loss = tf.reduce_mean(traj_loss +
                                                         latent_skill_loss)
                    loss += self.skill_vae_loss

                    tf.summary.scalar('entropy_loss', self.entropy)
                    tf.summary.scalar('policy_gradient_loss', self.pg_loss)
                    tf.summary.scalar('value_function_loss', self.vf_loss)
                    tf.summary.scalar('approximate_kullback-leiber',
                                      self.approxkl)
                    tf.summary.scalar('clip_factor', self.clipfrac)
                    tf.summary.scalar('loss', loss)

                #TODO: confirm all trainable variables are captured in scope "model"
                with tf.variable_scope("model"):
                    self.params = tf.trainable_variables()
                    if self.full_tensorboard_log:
                        for var in self.params:
                            tf.summary.histogram(var.name, var)
                grads = tf.gradients(loss, self.params)
                if self.max_grad_norm is not None:
                    grads, _grad_norm = tf.clip_by_global_norm(
                        grads, self.max_grad_norm)
                grads = list(zip(grads, self.params))

            trainer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate_ph, epsilon=1e-5)
            self._train = trainer.apply_gradients(grads)

            #TODO: include vae and cvae losses
            self.loss_names = [
                'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
                'clipfrac'
            ]

            with tf.variable_scope("input_info", reuse=False):
                tf.summary.scalar('discounted_rewards',
                                  tf.reduce_mean(self.rewards_ph))
                tf.summary.scalar('learning_rate',
                                  tf.reduce_mean(self.learning_rate_ph))
                tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph))
                tf.summary.scalar('clip_range',
                                  tf.reduce_mean(self.clip_range_ph))
                tf.summary.scalar('old_neglog_action_probabilty',
                                  tf.reduce_mean(self.old_neglog_pac_ph))
                tf.summary.scalar('old_value_pred',
                                  tf.reduce_mean(self.old_vpred_ph))

                if self.full_tensorboard_log:
                    tf.summary.histogram('discounted_rewards', self.rewards_ph)
                    tf.summary.histogram('learning_rate',
                                         self.learning_rate_ph)
                    tf.summary.histogram('advantage', self.advs_ph)
                    tf.summary.histogram('clip_range', self.clip_range_ph)
                    tf.summary.histogram('old_neglog_action_probabilty',
                                         self.old_neglog_pac_ph)
                    tf.summary.histogram('old_value_pred', self.old_vpred_ph)
                    if tf_util.is_image(self.observation_space):
                        tf.summary.image('observation', train_model.obs_ph)
                    else:
                        tf.summary.histogram('observation', train_model.obs_ph)

            self.train_model = train_model
            self.act_model = act_model
            self.step = act_model.step
            self.proba_step = act_model.proba_step
            self.value = act_model.value
            self.initial_state = act_model.initial_state
            tf.global_variables_initializer().run(session=self.sess)
            #TODO: consider saving other quantities to "self"

            self.summary = tf.summary.merge_all()
Esempio n. 4
0
def build_train(q_func,
                ob_space,
                ac_space,
                optimizer,
                sess,
                grad_norm_clipping=None,
                gamma=1.0,
                kappa=1.0,
                double_q=True,
                scope="deepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None,
                full_tensorboard_log=False):
    """
    Creates the train function:

    :param q_func: (DQNPolicy) the policy
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param reuse: (bool) whether or not to reuse the graph variables
    :param optimizer: (tf.train.Optimizer) optimizer to use for the Q-learning objective.
    :param sess: (TensorFlow session) The current TensorFlow session
    :param grad_norm_clipping: (float) clip gradient norms to this value. If None no clipping is performed.
    :param gamma: (float) discount rate.
    :param double_q: (bool) if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a
        good idea to keep it enabled.
    :param scope: (str or VariableScope) optional scope for variable_scope.
    :param reuse: (bool) whether or not the variables should be reused. To be able to reuse the scope must be given.
    :param param_noise: (bool) whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    :param param_noise_filter_func: (function (TensorFlow Tensor): bool) function that decides whether or not a
        variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter
        is used by default.
    :param full_tensorboard_log: (bool) enable additional logging when using tensorboard
        WARNING: this logging can take a lot of space quickly

    :return: (tuple)

        act: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) function to select and action given
            observation. See the top of the file for details.
        train: (function (Any, numpy float, numpy float, Any, numpy bool, numpy float): numpy float)
            optimize the error in Bellman's equation. See the top of the file for details.
        update_target: (function) copy the parameters from optimized Q function to the target Q function.
            See the top of the file for details.
        step_model: (DQNPolicy) Policy for evaluation
    """
    n_actions = ac_space.nvec if isinstance(ac_space,
                                            MultiDiscrete) else ac_space.n
    with tf.variable_scope("input", reuse=reuse):
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

    with tf.variable_scope(scope, reuse=reuse):
        if param_noise:
            act_f, obs_phs = build_act_with_param_noise(
                q_func,
                ob_space,
                ac_space,
                stochastic_ph,
                update_eps_ph,
                sess,
                param_noise_filter_func=param_noise_filter_func)
        else:
            act_f, obs_phs = build_act(q_func, ob_space, ac_space,
                                       stochastic_ph, update_eps_ph, sess)

        # q network evaluation
        with tf.variable_scope(
                "step_model",
                reuse=True,
                custom_getter=tf_util.outer_scope_getter("step_model")):
            step_model = q_func(sess,
                                ob_space,
                                ac_space,
                                1,
                                1,
                                None,
                                reuse=True,
                                obs_phs=obs_phs)
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name +
                                        "/model")
        # target q network evaluation

        with tf.variable_scope("target_q_func", reuse=False):
            target_policy = q_func(sess,
                                   ob_space,
                                   ac_space,
                                   1,
                                   1,
                                   None,
                                   reuse=False)
        target_q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/target_q_func")

        # compute estimate of best possible value starting from state at t + 1
        double_q_values = None
        double_obs_ph = target_policy.obs_ph
        if double_q:
            with tf.variable_scope(
                    "double_q",
                    reuse=True,
                    custom_getter=tf_util.outer_scope_getter("double_q")):
                double_policy = q_func(sess,
                                       ob_space,
                                       ac_space,
                                       1,
                                       1,
                                       None,
                                       reuse=True)
                double_q_values = double_policy.q_values
                double_obs_ph = double_policy.obs_ph

    with tf.variable_scope("loss", reuse=reuse):
        # set up placeholders
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(step_model.q_values *
                                     tf.one_hot(act_t_ph, n_actions),
                                     axis=1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_best_using_online_net = tf.argmax(double_q_values, axis=1)
            q_tp1_best = tf.reduce_sum(
                target_policy.q_values *
                tf.one_hot(q_tp1_best_using_online_net, n_actions),
                axis=1)
        else:
            q_tp1_best = tf.reduce_max(target_policy.q_values, axis=1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute kappa-reward #TODO
        act_policy_t_ph = self.act()
        q_t_policy = tf.reduce_sum(step_model.q_values *
                                   tf.one_hot(act_t_ph, n_actions),
                                   axis=1)
        rew_kappa_t_ph = rew_t_ph + gamma * (1 - kappa) * q_t_policy

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = tf_util.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        tf.summary.scalar("td_error", tf.reduce_mean(td_error))
        tf.summary.scalar("loss", weighted_error)

        if full_tensorboard_log:
            tf.summary.histogram("td_error", td_error)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # compute optimization op (potentially with gradient clipping)
        gradients = optimizer.compute_gradients(weighted_error,
                                                var_list=q_func_vars)
        if grad_norm_clipping is not None:
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad,
                                                    grad_norm_clipping), var)

    with tf.variable_scope("input_info", reuse=False):
        tf.summary.scalar('rewards', tf.reduce_mean(rew_t_ph))
        tf.summary.scalar('importance_weights',
                          tf.reduce_mean(importance_weights_ph))

        if full_tensorboard_log:
            tf.summary.histogram('rewards', rew_t_ph)
            tf.summary.histogram('importance_weights', importance_weights_ph)
            if tf_util.is_image(obs_phs[0]):
                tf.summary.image('observation', obs_phs[0])
            elif len(obs_phs[0].shape) == 1:
                tf.summary.histogram('observation', obs_phs[0])

    optimize_expr = optimizer.apply_gradients(gradients)

    summary = tf.summary.merge_all()

    # Create callable functions
    train = tf_util.function(inputs=[
        obs_phs[0], act_t_ph, rew_t_ph, target_policy.obs_ph, double_obs_ph,
        done_mask_ph, importance_weights_ph
    ],
                             outputs=[summary, td_error],
                             updates=[optimize_expr])
    update_target = tf_util.function([], [], updates=[update_target_expr])

    return act_f, train, update_target, step_model
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO2 model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.n_batch = self.n_envs * self.n_steps

            n_cpu = multiprocessing.cpu_count()
            if sys.platform == 'darwin':
                n_cpu //= 2

            self.graph = tf.Graph()
            with self.graph.as_default():
                set_global_seeds(seed=self.seed)

                self.sess = tf_util.make_session(num_cpu=n_cpu,
                                                 graph=self.graph)

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, LstmPolicy):
                    assert self.n_envs % self.nminibatches == 0, "For recurrent policies, " \
                                                                 "the number of environments run in parallel should be a multiple of nminibatches."
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_batch // self.nminibatches

                act_model = self.policy(self.sess,
                                        self.observation_space,
                                        self.action_space,
                                        self.n_envs,
                                        1,
                                        n_batch_step,
                                        reuse=False,
                                        **self.policy_kwargs)
                with tf.variable_scope(
                        "train_model",
                        reuse=True,
                        custom_getter=tf_util.outer_scope_getter(
                            "train_model")):

                    train_model = self.policy(self.sess,
                                              self.observation_space,
                                              self.action_space,
                                              self.n_envs // self.nminibatches,
                                              self.n_steps,
                                              n_batch_train,
                                              reuse=True,
                                              **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    self.action_ph = train_model.pdtype.sample_placeholder(
                        [None], name="action_ph")
                    self.advs_ph = tf.placeholder(tf.float32, [None],
                                                  name="advs_ph")
                    self.rewards_ph = tf.placeholder(tf.float32, [None],
                                                     name="rewards_ph")
                    self.old_neglog_pac_ph = tf.placeholder(
                        tf.float32, [None], name="old_neglog_pac_ph")
                    self.old_vpred_ph = tf.placeholder(tf.float32, [None],
                                                       name="old_vpred_ph")
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")
                    self.clip_range_ph = tf.placeholder(tf.float32, [],
                                                        name="clip_range_ph")

                    neglogpac = train_model.proba_distribution.neglogp(
                        self.action_ph)
                    self.entropy = tf.reduce_mean(
                        train_model.proba_distribution.entropy())

                    vpred = train_model._value
                    vpredclipped = self.old_vpred_ph + tf.clip_by_value(
                        train_model._value - self.old_vpred_ph,
                        -self.clip_range_ph, self.clip_range_ph)
                    vf_losses1 = tf.square(vpred - self.rewards_ph)
                    vf_losses2 = tf.square(vpredclipped - self.rewards_ph)
                    self.vf_loss = .5 * tf.reduce_mean(
                        tf.maximum(vf_losses1, vf_losses2))
                    ratio = tf.exp(self.old_neglog_pac_ph - neglogpac)
                    pg_losses = -self.advs_ph * ratio
                    pg_losses2 = -self.advs_ph * tf.clip_by_value(
                        ratio, 1.0 - self.clip_range_ph,
                        1.0 + self.clip_range_ph)
                    self.pg_loss = tf.reduce_mean(
                        tf.maximum(pg_losses, pg_losses2))
                    self.approxkl = .5 * tf.reduce_mean(
                        tf.square(neglogpac - self.old_neglog_pac_ph))
                    self.clipfrac = tf.reduce_mean(
                        tf.to_float(
                            tf.greater(tf.abs(ratio - 1.0),
                                       self.clip_range_ph)))
                    loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef

                    tf.summary.scalar('entropy_loss', self.entropy)
                    tf.summary.scalar('policy_gradient_loss', self.pg_loss)
                    tf.summary.scalar('value_function_loss', self.vf_loss)
                    tf.summary.scalar('approximate_kullback-leiber',
                                      self.approxkl)
                    tf.summary.scalar('clip_factor', self.clipfrac)
                    tf.summary.scalar('loss', loss)

                    with tf.variable_scope('model'):
                        self.params = tf.trainable_variables()
                        self.pi_params = [
                            p for p in self.params if "model/pi" in p.name
                        ]
                        self.vf_params = [
                            p for p in self.params if "model/pi" not in p.name
                        ]
                        assert len(self.pi_params) + len(
                            self.vf_params) == len(self.params)
                        self.get_vf_flat = tf_util.GetFlat(self.vf_params,
                                                           sess=self.sess)
                        self.get_pi_flat = tf_util.GetFlat(self.pi_params,
                                                           sess=self.sess)

                        self.set_vf_from_flat = tf_util.SetFromFlat(
                            self.vf_params, sess=self.sess)
                        self.set_pi_from_flat = tf_util.SetFromFlat(
                            self.pi_params, sess=self.sess)

                        if self.full_tensorboard_log:
                            for var in self.params:
                                tf.summary.histogram(var.name, var)
                    grads = tf.gradients(loss, self.params)
                    if self.max_grad_norm is not None:
                        grads, _grad_norm = tf.clip_by_global_norm(
                            grads, self.max_grad_norm)

                    self.flat_grads = tf.concat(
                        axis=0,
                        values=[
                            tf.reshape(
                                grad if grad is not None else tf.zeros_like(v),
                                [numel(v)])
                            for (v, grad) in zip(self.params, grads)
                        ])

                    grads = list(zip(grads, self.params))

                if self.optimizer_type == 'adam':
                    trainer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph, epsilon=1e-5)
                else:
                    trainer = tf.train.GradientDescentOptimizer(
                        learning_rate=self.learning_rate_ph)
                self._train = trainer.apply_gradients(grads)

                self.loss_names = [
                    'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
                    'clipfrac'
                ]

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(self.rewards_ph))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))
                    tf.summary.scalar('advantage',
                                      tf.reduce_mean(self.advs_ph))
                    tf.summary.scalar('clip_range',
                                      tf.reduce_mean(self.clip_range_ph))
                    tf.summary.scalar('old_neglog_action_probabilty',
                                      tf.reduce_mean(self.old_neglog_pac_ph))
                    tf.summary.scalar('old_value_pred',
                                      tf.reduce_mean(self.old_vpred_ph))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards',
                                             self.rewards_ph)
                        tf.summary.histogram('learning_rate',
                                             self.learning_rate_ph)
                        tf.summary.histogram('advantage', self.advs_ph)
                        tf.summary.histogram('clip_range', self.clip_range_ph)
                        tf.summary.histogram('old_neglog_action_probabilty',
                                             self.old_neglog_pac_ph)
                        tf.summary.histogram('old_value_pred',
                                             self.old_vpred_ph)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', train_model.obs_ph)
                        else:
                            tf.summary.histogram('observation',
                                                 train_model.obs_ph)

                self.train_model = train_model
                self.act_model = act_model
                self.step = act_model.step

                self.step_with_neurons = act_model.step_with_neurons
                self.proba_step = act_model.proba_step
                self.value = act_model.value
                self.initial_state = act_model.initial_state
                self.give_neuron_values = act_model.give_neuron_values
                self.get_weight_values = act_model.get_weight_values
                self.get_all_weight_values = act_model.get_all_weight_values

                tf.global_variables_initializer().run(session=self.sess)  # pylint: disable=E1101

                self.summary = tf.summary.merge_all()
    def setup_model(self):
        # prevent import loops
        from stable_baselines.gail.adversary import TransitionClassifier

        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.nworkers = MPI.COMM_WORLD.Get_size()
            self.rank = MPI.COMM_WORLD.Get_rank()
            np.set_printoptions(precision=3)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.single_threaded_session(graph=self.graph)

                if self.using_gail:
                    self.reward_giver = TransitionClassifier(
                        self.observation_space,
                        self.action_space,
                        self.hidden_size_adversary,
                        entcoeff=self.adversary_entcoeff)

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_policy = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    atarg = tf.placeholder(dtype=tf.float32, shape=[
                        None
                    ])  # Target advantage function (if applicable)
                    ret = tf.placeholder(dtype=tf.float32,
                                         shape=[None])  # Empirical return

                    observation = self.policy_pi.obs_ph
                    action = self.policy_pi.pdtype.sample_placeholder([None])

                    kloldnew = old_policy.proba_distribution.kl(
                        self.policy_pi.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(kloldnew)
                    meanent = tf.reduce_mean(ent)
                    entbonus = self.entcoeff * meanent

                    vferr = tf.reduce_mean(
                        tf.square(self.policy_pi.value_flat - ret))

                    # advantage * pnew / pold
                    ratio = tf.exp(
                        self.policy_pi.proba_distribution.logp(action) -
                        old_policy.proba_distribution.logp(action))
                    surrgain = tf.reduce_mean(ratio * atarg)

                    optimgain = surrgain + entbonus
                    losses = [optimgain, meankl, entbonus, surrgain, meanent]
                    self.loss_names = [
                        "optimgain", "meankl", "entloss", "surrgain", "entropy"
                    ]

                    dist = meankl

                    all_var_list = tf_util.get_trainable_vars("model")
                    var_list = [
                        v for v in all_var_list
                        if "/vf" not in v.name and "/q/" not in v.name
                    ]
                    vf_var_list = [
                        v for v in all_var_list
                        if "/pi" not in v.name and "/logstd" not in v.name
                    ]

                    self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                    self.set_from_flat = tf_util.SetFromFlat(var_list,
                                                             sess=self.sess)

                    klgrads = tf.gradients(dist, var_list)
                    flat_tangent = tf.placeholder(dtype=tf.float32,
                                                  shape=[None],
                                                  name="flat_tan")
                    shapes = [var.get_shape().as_list() for var in var_list]
                    start = 0
                    tangents = []
                    for shape in shapes:
                        var_size = tf_util.intprod(shape)
                        tangents.append(
                            tf.reshape(flat_tangent[start:start + var_size],
                                       shape))
                        start += var_size
                    gvp = tf.add_n([
                        tf.reduce_sum(grad * tangent)
                        for (grad, tangent) in zipsame(klgrads, tangents)
                    ])  # pylint: disable=E1111
                    # Fisher vector products
                    fvp = tf_util.flatgrad(gvp, var_list)

                    tf.summary.scalar('entropy_loss', meanent)
                    tf.summary.scalar('policy_gradient_loss', optimgain)
                    tf.summary.scalar('value_function_loss', surrgain)
                    tf.summary.scalar('approximate_kullback-leibler', meankl)
                    tf.summary.scalar(
                        'loss',
                        optimgain + meankl + entbonus + surrgain + meanent)

                    self.assign_old_eq_new = \
                        tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                          zipsame(tf_util.get_globals_vars("oldpi"),
                                                                  tf_util.get_globals_vars("model"))])
                    self.compute_losses = tf_util.function(
                        [observation, old_policy.obs_ph, action, atarg],
                        losses)
                    self.compute_fvp = tf_util.function([
                        flat_tangent, observation, old_policy.obs_ph, action,
                        atarg
                    ], fvp)
                    self.compute_vflossandgrad = tf_util.function(
                        [observation, old_policy.obs_ph, ret],
                        tf_util.flatgrad(vferr, vf_var_list))

                    @contextmanager
                    def timed(msg):
                        if self.rank == 0 and self.verbose >= 1:
                            print(colorize(msg, color='magenta'))
                            start_time = time.time()
                            yield
                            print(
                                colorize("done in {:.3f} seconds".format(
                                    (time.time() - start_time)),
                                         color='magenta'))
                        else:
                            yield

                    def allmean(arr):
                        assert isinstance(arr, np.ndarray)
                        out = np.empty_like(arr)
                        MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                        out /= self.nworkers
                        return out

                    tf_util.initialize(sess=self.sess)

                    th_init = self.get_flat()
                    MPI.COMM_WORLD.Bcast(th_init, root=0)
                    self.set_from_flat(th_init)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                    if self.using_gail:
                        self.d_adam = MpiAdam(
                            self.reward_giver.get_trainable_variables(),
                            sess=self.sess)
                        self.d_adam.sync()
                    self.vfadam.sync()

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(ret))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.vf_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                    tf.summary.scalar('kl_clip_range',
                                      tf.reduce_mean(self.max_kl))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards', ret)
                        tf.summary.histogram('learning_rate', self.vf_stepsize)
                        tf.summary.histogram('advantage', atarg)
                        tf.summary.histogram('kl_clip_range', self.max_kl)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', observation)
                        else:
                            tf.summary.histogram('observation', observation)

                self.timed = timed
                self.allmean = allmean

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                self.params = tf_util.get_trainable_vars(
                    "model") + tf_util.get_trainable_vars("oldpi")
                if self.using_gail:
                    self.params.extend(
                        self.reward_giver.get_trainable_variables())

                self.summary = tf.summary.merge_all()

                self.compute_lossandgrad = \
                    tf_util.function([observation, old_policy.obs_ph, action, atarg, ret],
                                     [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
Esempio n. 7
0
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the ACKTR model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            # Enable continuous actions tricks (normalized advantage)
            self.continuous_actions = isinstance(self.action_space, Box)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, RecurrentActorCriticPolicy):
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_envs * self.n_steps

                step_model = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         n_batch_step,
                                         reuse=False,
                                         **self.policy_kwargs)

                self.params = params = tf_util.get_trainable_vars("model")

                with tf.variable_scope(
                        "train_model",
                        reuse=True,
                        custom_getter=tf_util.outer_scope_getter(
                            "train_model")):
                    train_model = self.policy(self.sess,
                                              self.observation_space,
                                              self.action_space,
                                              self.n_envs,
                                              self.n_steps,
                                              n_batch_train,
                                              reuse=True,
                                              **self.policy_kwargs)

                with tf.variable_scope(
                        "loss",
                        reuse=False,
                        custom_getter=tf_util.outer_scope_getter("loss")):
                    self.advs_ph = advs_ph = tf.placeholder(tf.float32, [None])
                    self.rewards_ph = rewards_ph = tf.placeholder(
                        tf.float32, [None])
                    self.learning_rate_ph = learning_rate_ph = tf.placeholder(
                        tf.float32, [])
                    self.actions_ph = train_model.pdtype.sample_placeholder(
                        [None])

                    neg_log_prob = train_model.proba_distribution.neglogp(
                        self.actions_ph)

                    # training loss
                    pg_loss = tf.reduce_mean(advs_ph * neg_log_prob)
                    self.entropy = entropy = tf.reduce_mean(
                        train_model.proba_distribution.entropy())
                    self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy
                    self.vf_loss = vf_loss = mse(
                        tf.squeeze(train_model.value_fn), rewards_ph)
                    train_loss = pg_loss + self.vf_coef * vf_loss

                    # Fisher loss construction
                    self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(
                        neg_log_prob)
                    sample_net = train_model.value_fn + tf.random_normal(
                        tf.shape(train_model.value_fn))
                    self.vf_fisher = vf_fisher_loss = -self.vf_fisher_coef * tf.reduce_mean(
                        tf.pow(
                            train_model.value_fn -
                            tf.stop_gradient(sample_net), 2))
                    self.joint_fisher = pg_fisher_loss + vf_fisher_loss

                    tf.summary.scalar('entropy_loss', self.entropy)
                    tf.summary.scalar('policy_gradient_loss', pg_loss)
                    tf.summary.scalar('policy_gradient_fisher_loss',
                                      pg_fisher_loss)
                    tf.summary.scalar('value_function_loss', self.vf_loss)
                    tf.summary.scalar('value_function_fisher_loss',
                                      vf_fisher_loss)
                    tf.summary.scalar('loss', train_loss)

                    self.grads_check = tf.gradients(train_loss, params)

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(self.rewards_ph))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))
                    tf.summary.scalar('advantage',
                                      tf.reduce_mean(self.advs_ph))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards',
                                             self.rewards_ph)
                        tf.summary.histogram('learning_rate',
                                             self.learning_rate_ph)
                        tf.summary.histogram('advantage', self.advs_ph)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', train_model.obs_ph)
                        else:
                            tf.summary.histogram('observation',
                                                 train_model.obs_ph)

                with tf.variable_scope(
                        "kfac",
                        reuse=False,
                        custom_getter=tf_util.outer_scope_getter("kfac")):
                    with tf.device('/gpu:0'):
                        self.optim = optim = kfac.KfacOptimizer(
                            learning_rate=learning_rate_ph,
                            clip_kl=self.kfac_clip,
                            momentum=0.9,
                            kfac_update=self.kfac_update,
                            epsilon=0.01,
                            stats_decay=0.99,
                            async_eigen_decomp=self.async_eigen_decomp,
                            cold_iter=10,
                            max_grad_norm=self.max_grad_norm,
                            verbose=self.verbose)

                        optim.compute_and_apply_stats(self.joint_fisher,
                                                      var_list=params)

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.proba_step = step_model.proba_step
                self.value = step_model.value
                self.initial_state = step_model.initial_state
                tf.global_variables_initializer().run(session=self.sess)

                self.summary = tf.summary.merge_all()
Esempio n. 8
0
    def setup_model(self):
        with SetVerbosity(self.verbose):

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.single_threaded_session(graph=self.graph)

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_pi = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         None,
                                         reuse=False,
                                         **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    # Target advantage function (if applicable)
                    atarg = tf.placeholder(dtype=tf.float32, shape=[None])

                    # Empirical return
                    ret = tf.placeholder(dtype=tf.float32, shape=[None])

                    # learning rate multiplier, updated with schedule
                    lrmult = tf.placeholder(name='lrmult',
                                            dtype=tf.float32,
                                            shape=[])

                    # Annealed cliping parameter epislon
                    clip_param = self.clip_param * lrmult

                    obs_ph = self.policy_pi.obs_ph
                    action_ph = self.policy_pi.pdtype.sample_placeholder(
                        [None])

                    kloldnew = old_pi.proba_distribution.kl(
                        self.policy_pi.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(kloldnew)
                    meanent = tf.reduce_mean(ent)
                    pol_entpen = (-self.entcoeff) * meanent

                    # pnew / pold
                    ratio = tf.exp(
                        self.policy_pi.proba_distribution.logp(action_ph) -
                        old_pi.proba_distribution.logp(action_ph))

                    # surrogate from conservative policy iteration
                    surr1 = ratio * atarg
                    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                                             1.0 + clip_param) * atarg

                    # PPO's pessimistic surrogate (L^CLIP)
                    pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))
                    vf_loss = tf.reduce_mean(
                        tf.square(self.policy_pi.value_flat - ret))
                    total_loss = pol_surr + pol_entpen + vf_loss
                    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
                    self.loss_names = [
                        "pol_surr", "pol_entpen", "vf_loss", "kl", "ent"
                    ]

                    tf.summary.scalar('entropy_loss', pol_entpen)
                    tf.summary.scalar('policy_gradient_loss', pol_surr)
                    tf.summary.scalar('value_function_loss', vf_loss)
                    tf.summary.scalar('approximate_kullback-leiber', meankl)
                    tf.summary.scalar('clip_factor', clip_param)
                    tf.summary.scalar('loss', total_loss)

                    self.params = tf_util.get_trainable_vars("model")

                    self.assign_old_eq_new = tf_util.function(
                        [], [],
                        updates=[
                            tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                                tf_util.get_globals_vars("oldpi"),
                                tf_util.get_globals_vars("model"))
                        ])

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.adam = MpiAdam(self.params,
                                        epsilon=self.adam_epsilon,
                                        sess=self.sess)

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(ret))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.optim_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                    tf.summary.scalar('clip_range',
                                      tf.reduce_mean(self.clip_param))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards', ret)
                        tf.summary.histogram('learning_rate',
                                             self.optim_stepsize)
                        tf.summary.histogram('advantage', atarg)
                        tf.summary.histogram('clip_range', self.clip_param)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', obs_ph)
                        else:
                            tf.summary.histogram('observation', obs_ph)

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                tf_util.initialize(sess=self.sess)

                self.summary = tf.summary.merge_all()

                self.lossandgrad = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    [self.summary,
                     tf_util.flatgrad(total_loss, self.params)] + losses)
                self.compute_losses = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    losses)
Esempio n. 9
0
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the ACER model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            if isinstance(self.action_space, Discrete):
                self.n_act = self.action_space.n
                continuous = False
            elif isinstance(self.action_space, Box):
                # self.n_act = self.action_space.shape[-1]
                # continuous = True
                raise NotImplementedError(
                    "WIP: Acer does not support Continuous actions yet.")
            else:
                raise ValueError(
                    "Error: ACER does not work with {} actions space.".format(
                        self.action_space))

            self.n_batch = self.n_envs * self.n_steps

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(num_cpu=self.num_procs,
                                                 graph=self.graph)

                n_batch_step = None
                if issubclass(self.policy, RecurrentActorCriticPolicy):
                    n_batch_step = self.n_envs
                n_batch_train = self.n_envs * (self.n_steps + 1)

                step_model = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         n_batch_step,
                                         reuse=False,
                                         **self.policy_kwargs)

                self.params = find_trainable_variables("model")

                with tf.variable_scope(
                        "train_model",
                        reuse=True,
                        custom_getter=tf_util.outer_scope_getter(
                            "train_model")):
                    train_model = self.policy(self.sess,
                                              self.observation_space,
                                              self.action_space,
                                              self.n_envs,
                                              self.n_steps + 1,
                                              n_batch_train,
                                              reuse=True,
                                              **self.policy_kwargs)

                with tf.variable_scope("moving_average"):
                    # create averaged model
                    ema = tf.train.ExponentialMovingAverage(self.alpha)
                    ema_apply_op = ema.apply(self.params)

                    def custom_getter(getter, name, *args, **kwargs):
                        name = name.replace("polyak_model/", "")
                        val = ema.average(getter(name, *args, **kwargs))
                        return val

                with tf.variable_scope("polyak_model",
                                       reuse=True,
                                       custom_getter=custom_getter):
                    self.polyak_model = polyak_model = self.policy(
                        self.sess,
                        self.observation_space,
                        self.action_space,
                        self.n_envs,
                        self.n_steps + 1,
                        self.n_envs * (self.n_steps + 1),
                        reuse=True,
                        **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    self.done_ph = tf.placeholder(tf.float32,
                                                  [self.n_batch])  # dones
                    self.reward_ph = tf.placeholder(
                        tf.float32, [self.n_batch])  # rewards, not returns
                    self.mu_ph = tf.placeholder(
                        tf.float32, [self.n_batch, self.n_act])  # mu's
                    self.action_ph = train_model.pdtype.sample_placeholder(
                        [self.n_batch])
                    self.learning_rate_ph = tf.placeholder(tf.float32, [])
                    eps = 1e-6

                    # Notation: (var) = batch variable, (var)s = sequence variable,
                    # (var)_i = variable index by action at step i
                    # shape is [n_envs * (n_steps + 1)]
                    if continuous:
                        value = train_model.value_flat
                    else:
                        value = tf.reduce_sum(train_model.policy_proba *
                                              train_model.q_value,
                                              axis=-1)

                    rho, rho_i_ = None, None
                    if continuous:
                        action_ = strip(
                            train_model.proba_distribution.sample(),
                            self.n_envs, self.n_steps)
                        distribution_f = tf.contrib.distributions.MultivariateNormalDiag(
                            loc=strip(train_model.proba_distribution.mean,
                                      self.n_envs, self.n_steps),
                            scale_diag=strip(
                                train_model.proba_distribution.logstd,
                                self.n_envs, self.n_steps))
                        f_polyak = tf.contrib.distributions.MultivariateNormalDiag(
                            loc=strip(polyak_model.proba_distribution.mean,
                                      self.n_envs, self.n_steps),
                            scale_diag=strip(
                                polyak_model.proba_distribution.logstd,
                                self.n_envs, self.n_steps))

                        f_i = distribution_f.prob(self.action_ph)
                        f_i_ = distribution_f.prob(action_)
                        f_polyak_i = f_polyak.prob(self.action_ph)
                        phi_i = strip(train_model.proba_distribution.mean,
                                      self.n_envs, self.n_steps)

                        q_value = strip(train_model.value_fn, self.n_envs,
                                        self.n_steps)
                        q_i = q_value[:, 0]

                        rho_i = tf.reshape(f_i, [-1, 1]) / (self.mu_ph + eps)
                        rho_i_ = tf.reshape(f_i_, [-1, 1]) / (self.mu_ph + eps)

                        qret = q_retrace(self.reward_ph, self.done_ph, q_i,
                                         value, tf.pow(rho_i, 1 / self.n_act),
                                         self.n_envs, self.n_steps, self.gamma)
                    else:
                        # strip off last step
                        # f is a distribution, chosen to be Gaussian distributions
                        # with fixed diagonal covariance and mean \phi(x)
                        # in the paper
                        distribution_f, f_polyak, q_value = \
                            map(lambda variables: strip(variables, self.n_envs, self.n_steps),
                                [train_model.policy_proba, polyak_model.policy_proba, train_model.q_value])

                        # Get pi and q values for actions taken
                        f_i = get_by_index(distribution_f, self.action_ph)
                        f_i_ = distribution_f
                        phi_i = distribution_f
                        f_polyak_i = f_polyak

                        q_i = get_by_index(q_value, self.action_ph)

                        # Compute ratios for importance truncation
                        rho = distribution_f / (self.mu_ph + eps)
                        rho_i = get_by_index(rho, self.action_ph)

                        # Calculate Q_retrace targets
                        qret = q_retrace(self.reward_ph, self.done_ph, q_i,
                                         value, rho_i, self.n_envs,
                                         self.n_steps, self.gamma)

                    # Calculate losses
                    # Entropy
                    entropy = tf.reduce_sum(
                        train_model.proba_distribution.entropy())

                    # Policy Gradient loss, with truncated importance sampling & bias correction
                    value = strip(value, self.n_envs, self.n_steps, True)
                    # check_shape([qret, value, rho_i, f_i], [[self.n_envs * self.n_steps]] * 4)
                    # check_shape([rho, distribution_f, q_value], [[self.n_envs * self.n_steps, self.n_act]] * 2)

                    # Truncated importance sampling
                    adv = qret - value
                    log_f = tf.log(f_i + eps)
                    # [n_envs * n_steps]
                    gain_f = log_f * tf.stop_gradient(
                        adv * tf.minimum(self.correction_term, rho_i))
                    loss_f = -tf.reduce_mean(gain_f)

                    # Bias correction for the truncation
                    adv_bc = (
                        q_value -
                        tf.reshape(value, [self.n_envs * self.n_steps, 1])
                    )  # [n_envs * n_steps, n_act]

                    # check_shape([adv_bc, log_f_bc], [[self.n_envs * self.n_steps, self.n_act]] * 2)
                    if continuous:
                        gain_bc = tf.stop_gradient(
                            adv_bc * tf.nn.relu(1.0 - (self.correction_term /
                                                       (rho_i_ + eps))) * f_i_)
                    else:
                        log_f_bc = tf.log(f_i_ + eps)  # / (f_old + eps)
                        gain_bc = tf.reduce_sum(log_f_bc * tf.stop_gradient(
                            adv_bc * tf.nn.relu(1.0 - (self.correction_term /
                                                       (rho + eps))) * f_i_),
                                                axis=1)
                    # IMP: This is sum, as expectation wrt f
                    loss_bc = -tf.reduce_mean(gain_bc)

                    loss_policy = loss_f + loss_bc

                    # Value/Q function loss, and explained variance
                    check_shape([qret, q_i],
                                [[self.n_envs * self.n_steps]] * 2)
                    explained_variance = q_explained_variance(
                        tf.reshape(q_i, [self.n_envs, self.n_steps]),
                        tf.reshape(qret, [self.n_envs, self.n_steps]))
                    loss_q = tf.reduce_mean(
                        tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

                    # Net loss
                    check_shape([loss_policy, loss_q, entropy], [[]] * 3)
                    loss = loss_policy + self.q_coef * loss_q - self.ent_coef * entropy

                    tf.summary.scalar('entropy_loss', entropy)
                    tf.summary.scalar('policy_gradient_loss', loss_policy)
                    tf.summary.scalar('value_function_loss', loss_q)
                    tf.summary.scalar('loss', loss)

                    norm_grads_q, norm_grads_policy, avg_norm_grads_f = None, None, None
                    avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj = None, None, None, None
                    if self.trust_region:
                        # [n_envs * n_steps, n_act]
                        grad = tf.gradients(
                            -(loss_policy - self.ent_coef * entropy) *
                            self.n_steps * self.n_envs, phi_i)
                        # [n_envs * n_steps, n_act] # Directly computed gradient of KL divergence wrt f
                        kl_grad = -f_polyak_i / (f_i_ + eps)
                        k_dot_g = tf.reduce_sum(kl_grad * grad, axis=-1)
                        adj = tf.maximum(
                            0.0, (tf.reduce_sum(kl_grad * grad, axis=-1) -
                                  self.delta) /
                            (tf.reduce_sum(tf.square(kl_grad), axis=-1) +
                             eps))  # [n_envs * n_steps]

                        # Calculate stats (before doing adjustment) for logging.
                        avg_norm_k = avg_norm(kl_grad)
                        avg_norm_g = avg_norm(grad)
                        avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
                        avg_norm_adj = tf.reduce_mean(tf.abs(adj))

                        grad = grad - tf.reshape(
                            adj, [self.n_envs * self.n_steps, 1]) * kl_grad
                        # These are turst region adjusted gradients wrt f ie statistics of policy pi
                        grads_f = -grad / (self.n_envs * self.n_steps)
                        grads_policy = tf.gradients(f_i_, self.params, grads_f)
                        grads_q = tf.gradients(loss_q * self.q_coef,
                                               self.params)
                        grads = [
                            gradient_add(g1, g2, param, verbose=self.verbose)
                            for (g1, g2, param
                                 ) in zip(grads_policy, grads_q, self.params)
                        ]

                        avg_norm_grads_f = avg_norm(grads_f) * (self.n_steps *
                                                                self.n_envs)
                        norm_grads_q = tf.global_norm(grads_q)
                        norm_grads_policy = tf.global_norm(grads_policy)
                    else:
                        grads = tf.gradients(loss, self.params)

                    norm_grads = None
                    if self.max_grad_norm is not None:
                        grads, norm_grads = tf.clip_by_global_norm(
                            grads, self.max_grad_norm)
                    grads = list(zip(grads, self.params))

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('rewards',
                                      tf.reduce_mean(self.reward_ph))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate))
                    tf.summary.scalar('advantage', tf.reduce_mean(adv))
                    tf.summary.scalar('action_probabilty',
                                      tf.reduce_mean(self.mu_ph))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('rewards', self.reward_ph)
                        tf.summary.histogram('learning_rate',
                                             self.learning_rate)
                        tf.summary.histogram('advantage', adv)
                        tf.summary.histogram('action_probabilty', self.mu_ph)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', train_model.obs_ph)
                        else:
                            tf.summary.histogram('observation',
                                                 train_model.obs_ph)

                trainer = tf.train.RMSPropOptimizer(
                    learning_rate=self.learning_rate_ph,
                    decay=self.rprop_alpha,
                    epsilon=self.rprop_epsilon)
                _opt_op = trainer.apply_gradients(grads)

                # so when you call _train, you first do the gradient step, then you apply ema
                with tf.control_dependencies([_opt_op]):
                    _train = tf.group(ema_apply_op)

                # Ops/Summaries to run, and their names for logging
                assert norm_grads is not None
                run_ops = [
                    _train, loss, loss_q, entropy, loss_policy, loss_f,
                    loss_bc, explained_variance, norm_grads
                ]
                names_ops = [
                    'loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f',
                    'loss_bc', 'explained_variance', 'norm_grads'
                ]
                if self.trust_region:
                    self.run_ops = run_ops + [
                        norm_grads_q, norm_grads_policy, avg_norm_grads_f,
                        avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj
                    ]
                    self.names_ops = names_ops + [
                        'norm_grads_q', 'norm_grads_policy',
                        'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g',
                        'avg_norm_k_dot_g', 'avg_norm_adj'
                    ]

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.proba_step = step_model.proba_step
                self.initial_state = step_model.initial_state

                tf.global_variables_initializer().run(session=self.sess)

                self.summary = tf.summary.merge_all()
Esempio n. 10
0
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the A2C model must be an " \
                                                                "instance of common.policies.ActorCriticPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(graph=self.graph)

                self.n_batch = self.n_envs * self.n_steps

                n_batch_step = None
                n_batch_train = None
                if issubclass(self.policy, LstmPolicy):
                    n_batch_step = self.n_envs
                    n_batch_train = self.n_envs * self.n_steps

                step_model = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         n_batch_step,
                                         reuse=False,
                                         **self.policy_kwargs)

                with tf.variable_scope(
                        "train_model",
                        reuse=True,
                        custom_getter=tf_util.outer_scope_getter(
                            "train_model")):
                    train_model = self.policy(self.sess,
                                              self.observation_space,
                                              self.action_space,
                                              self.n_envs,
                                              self.n_steps,
                                              n_batch_train,
                                              reuse=True,
                                              **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    self.actions_ph = train_model.pdtype.sample_placeholder(
                        [None], name="action_ph")
                    self.advs_ph = tf.placeholder(tf.float32, [None],
                                                  name="advs_ph")
                    self.rewards_ph = tf.placeholder(tf.float32, [None],
                                                     name="rewards_ph")
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                    neglogpac = train_model.proba_distribution.neglogp(
                        self.actions_ph)
                    self.entropy = tf.reduce_mean(
                        train_model.proba_distribution.entropy())
                    self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac)
                    self.vf_loss = mse(tf.squeeze(train_model.value_fn),
                                       self.rewards_ph)
                    loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef

                    tf.summary.scalar('entropy_loss', self.entropy)
                    tf.summary.scalar('policy_gradient_loss', self.pg_loss)
                    tf.summary.scalar('value_function_loss', self.vf_loss)
                    tf.summary.scalar('loss', loss)

                    self.params = find_trainable_variables("model")
                    grads = tf.gradients(loss, self.params)
                    if self.max_grad_norm is not None:
                        grads, _ = tf.clip_by_global_norm(
                            grads, self.max_grad_norm)
                    grads = list(zip(grads, self.params))

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(self.rewards_ph))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate))
                    tf.summary.scalar('advantage',
                                      tf.reduce_mean(self.advs_ph))
                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards',
                                             self.rewards_ph)
                        tf.summary.histogram('learning_rate',
                                             self.learning_rate)
                        tf.summary.histogram('advantage', self.advs_ph)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', train_model.obs_ph)
                        else:
                            tf.summary.histogram('observation',
                                                 train_model.obs_ph)

                trainer = tf.train.RMSPropOptimizer(
                    learning_rate=self.learning_rate_ph,
                    decay=self.alpha,
                    epsilon=self.epsilon)
                self.apply_backprop = trainer.apply_gradients(grads)

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.proba_step = step_model.proba_step
                self.value = step_model.value
                self.initial_state = step_model.initial_state
                tf.global_variables_initializer().run(session=self.sess)

                self.summary = tf.summary.merge_all()
Esempio n. 11
0
    def setup_model(self):
        with SetVerbosity(self.verbose):

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_pi = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         None,
                                         reuse=False,
                                         **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    self.grad_inverter = grad_inverter(
                        [self.action_space.high, self.action_space.low])
                    # Target advantage function (if applicable)
                    atarg = tf.placeholder(dtype=tf.float32, shape=[None])

                    # Empirical return
                    ret = tf.placeholder(dtype=tf.float32, shape=[None])

                    # learning rate multiplier, updated with schedule
                    lrmult = tf.placeholder(name='lrmult',
                                            dtype=tf.float32,
                                            shape=[])

                    # Annealed cliping parameter epislon
                    clip_param = self.clip_param * lrmult

                    obs_ph = self.policy_pi.obs_ph
                    action_ph = self.policy_pi.pdtype.sample_placeholder(
                        [None])

                    if debug:
                        action_ph_val = tf.Print(
                            action_ph, [
                                action_ph,
                            ],
                            '\n\n ====================Unclipped action in: \n',
                            summarize=-1)
                        action_ph_val = tf.Print(
                            action_ph_val, [],
                            '\n ======================================== \n',
                            summarize=-1)

                    kloldnew = old_pi.proba_distribution.kl(
                        self.policy_pi.proba_distribution)
                    # old_logstd = old_pi.proba_distribution.logstd
                    # new_logstd = self.policy_pi.proba_distribution.logstd
                    # old_std = old_pi.proba_distribution.std
                    # new_std = self.policy_pi.proba_distribution.std
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(kloldnew)
                    # meankl = tf.Print(meankl, [meankl,], "kl value: ")
                    # meankl_log = tf.Print(meankl, [old_logstd,], "high kl, old logstd value: ", summarize=-1)
                    # meankl_log = tf.Print(meankl_log, [new_logstd,], "high kl, new logstd value: ", summarize=-1)
                    # meankl_log = tf.Print(meankl_log, [old_std,], "high kl, old std value: ", summarize=-1)
                    # meankl_log = tf.Print(meankl_log, [new_std,], "high kl, new std value: ", summarize=-1)
                    # meanklvalue_ = tf.where(
                    #     tf.greater(meankl, tf.constant(1, dtype = tf.float32)),
                    #     meankl_log,
                    #     meankl)
                    meanent = tf.reduce_mean(ent)
                    pol_entpen = (-self.entcoeff) * meanent

                    # pnew / pold
                    if debug:
                        old_logp = old_pi.proba_distribution.logp(
                            action_ph_val)
                        old_mean = old_pi.proba_distribution.mode()
                        old_std = old_pi.proba_distribution.std
                        old_logp = tf.Print(old_logp, [
                            old_logp,
                        ],
                                            '======  OLD logp: \n',
                                            summarize=-1)
                        old_logp = tf.Print(old_logp, [], '\n', summarize=-1)
                        old_logp = tf.Print(old_logp, [
                            old_mean,
                        ],
                                            '======  OLD mean: \n',
                                            summarize=-1)
                        old_logp = tf.Print(old_logp, [], '\n', summarize=-1)
                        old_logp = tf.Print(old_logp, [
                            old_std,
                        ],
                                            '======  OLD std: \n',
                                            summarize=-1)
                        old_logp = tf.Print(old_logp, [], '\n', summarize=-1)
                        now_logp = self.policy_pi.proba_distribution.logp(
                            action_ph_val)
                        now_mean = self.policy_pi.proba_distribution.mode()
                        now_std = self.policy_pi.proba_distribution.std
                        now_logp = tf.Print(now_logp, [
                            now_logp,
                        ],
                                            '======  NOW logp: \n',
                                            summarize=-1)
                        now_logp = tf.Print(now_logp, [], '\n', summarize=-1)
                        now_logp = tf.Print(now_logp, [
                            now_mean,
                        ],
                                            '======  NOW mean: \n',
                                            summarize=-1)
                        now_logp = tf.Print(now_logp, [], '\n', summarize=-1)
                        now_logp = tf.Print(now_logp, [
                            now_std,
                        ],
                                            '======  NOW std: \n',
                                            summarize=-1)
                        now_logp = tf.Print(now_logp, [], '\n', summarize=-1)
                    else:
                        now_logp = self.policy_pi.proba_distribution.logp(
                            action_ph)
                        old_logp = old_pi.proba_distribution.logp(action_ph)

                    ratio = tf.exp(now_logp - old_logp)
                    if debug:
                        ratio = tf.Print(ratio, [
                            ratio,
                        ],
                                         'ratio: \n',
                                         summarize=-1)
                        ratio = tf.Print(ratio, [], '\n', summarize=-1)
                    # surrogate from conservative policy iteration
                    surr1 = ratio * atarg
                    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                                             1.0 + clip_param) * atarg

                    # PPO's pessimistic surrogate (L^CLIP)
                    pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))
                    vf_loss = tf.reduce_mean(
                        tf.square(self.policy_pi.value_flat - ret))
                    total_loss = pol_surr + pol_entpen + vf_loss
                    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
                    # losses = [pol_surr, pol_entpen, vf_loss, meanklvalue_, meanent]
                    self.loss_names = [
                        "pol_surr", "pol_entpen", "vf_loss", "kl", "ent"
                    ]

                    tf.summary.scalar('entropy_loss', pol_entpen)
                    tf.summary.scalar('policy_gradient_loss', pol_surr)
                    tf.summary.scalar('value_function_loss', vf_loss)
                    tf.summary.scalar('approximate_kullback-leibler', meankl)
                    tf.summary.scalar('clip_factor', clip_param)
                    tf.summary.scalar('loss', total_loss)

                    self.params = tf_util.get_trainable_vars("model")

                    self.assign_old_eq_new = tf_util.function(
                        [], [],
                        updates=[
                            tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                                tf_util.get_globals_vars("oldpi"),
                                tf_util.get_globals_vars("model"))
                        ])

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.adam = MpiAdam(self.params,
                                        epsilon=self.adam_epsilon,
                                        sess=self.sess)

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(ret))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.optim_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                    tf.summary.scalar('clip_range',
                                      tf.reduce_mean(self.clip_param))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards', ret)
                        tf.summary.histogram('learning_rate',
                                             self.optim_stepsize)
                        tf.summary.histogram('advantage', atarg)
                        tf.summary.histogram('clip_range', self.clip_param)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', obs_ph)
                        else:
                            tf.summary.histogram('observation', obs_ph)

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                tf_util.initialize(sess=self.sess)

                self.summary = tf.summary.merge_all()

                self.lossandgrad = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    [self.summary,
                     tf_util.flatgrad(total_loss, self.params)] + losses)
                self.compute_losses = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    losses)
Esempio n. 12
0
    def setup_model(self):
        # prevent import loops
        from stable_baselines.gail.adversary import TransitionClassifier
        from stable_baselines.mdal.adversary import TabularAdversaryTF, NeuralAdversaryTRPO


        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the MDPO model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.nworkers = MPI.COMM_WORLD.Get_size()
            self.rank = MPI.COMM_WORLD.Get_rank()
            np.set_printoptions(precision=3)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.single_threaded_session(graph=self.graph)
                # self._setup_learn(self.seed)
                self._setup_learn()

                if self.using_gail:
                    self.reward_giver = TransitionClassifier(self.observation_space, self.action_space,
                                                             self.hidden_size_adversary,
                                                             entcoeff=self.adversary_entcoeff)
                elif self.using_mdal:
                    if self.neural:
                        self.reward_giver = NeuralAdversaryTRPO(self.sess, self.observation_space, self.action_space,
                                                                self.hidden_size_adversary,
                                                                entcoeff=self.adversary_entcoeff)
                    else:
                        self.reward_giver = TabularAdversaryTF(self.sess, self.observation_space, self.action_space,
                                                                 self.hidden_size_adversary,
                                                                 entcoeff=self.adversary_entcoeff,
                                                                 expert_features=self.expert_dataset.successor_features,
                                                                 exploration_bonus=self.exploration_bonus,
                                                                 bonus_coef=self.bonus_coef,
                                                                 t_c=self.t_c,
                                                                 is_action_features=self.is_action_features)
                # Construct network for new policy
                self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    self.old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)

                # Network for fitting closed form
                with tf.variable_scope("closedpi", reuse=False):
                    self.closed_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    self.atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
                    self.vtarg = tf.placeholder(dtype=tf.float32, shape=[None])
                    self.ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
                    self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name="learning_rate_ph")
                    self.outer_learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name="outer_learning_rate_ph")
                    self.old_vpred_ph = tf.placeholder(dtype=tf.float32, shape=[None], name="old_vpred_ph")
                    self.clip_range_vf_ph = tf.placeholder(dtype=tf.float32, shape=[], name="clip_range_ph")

                    observation = self.policy_pi.obs_ph
                    self.action = self.policy_pi.pdtype.sample_placeholder([None])

                    if self.tsallis_q == 1.0:
                        kloldnew = self.policy_pi.proba_distribution.kl(self.old_policy.proba_distribution)
                        ent = self.policy_pi.proba_distribution.entropy()
                        meankl = tf.reduce_mean(kloldnew)

                    else:
                        logp_pi = self.policy_pi.proba_distribution.logp(self.action)
                        logp_pi_old =  self.old_policy.proba_distribution.logp(self.action)
                        ent = self.policy_pi.proba_distribution.entropy()
                        #kloldnew = self.policy_pi.proba_distribution.kl_tsallis(self.old_policy.proba_distribution, self.tsallis_q)
                        tsallis_q = 2.0 - self.tsallis_q
                        meankl = tf.reduce_mean(tf_log_q(tf.exp(logp_pi), tsallis_q) - tf_log_q(tf.exp(logp_pi_old), tsallis_q)) #tf.reduce_mean(kloldnew)

                    meanent = tf.reduce_mean(ent)
                    entbonus = self.entcoeff * meanent

                    if self.cliprange_vf is None:
                        vpred_clipped = self.policy_pi.value_flat
                    else:
                        vpred_clipped = self.old_vpred_ph + \
                            tf.clip_by_value(self.policy_pi.value_flat - self.old_vpred_ph,
                                             - self.clip_range_vf_ph, self.clip_range_vf_ph)

                    vf_losses1 = tf.square(self.policy_pi.value_flat - self.ret)
                    vf_losses2 = tf.square(vpred_clipped - self.ret)
                    vferr = tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

                    # advantage * pnew / pold
                    ratio = tf.exp(self.policy_pi.proba_distribution.logp(self.action) -
                                   self.old_policy.proba_distribution.logp(self.action))

                    if self.method == "multistep-SGD":
                        surrgain = tf.reduce_mean(ratio * self.atarg) - meankl / self.learning_rate_ph
                    elif self.method == "closedreverse-KL":
                        surrgain = tf.reduce_mean(tf.exp(self.atarg) * self.policy_pi.proba_distribution.logp(self.action))
                    else:
                        policygain = tf.reduce_mean(tf.exp(self.atarg) * tf.log(self.closed_policy.proba_distribution.mean))
                        surrgain = tf.reduce_mean(ratio * self.atarg) - tf.reduce_mean(self.learning_rate_ph * ratio * self.policy_pi.proba_distribution.logp(self.action))

                    optimgain = surrgain #+ entbonus - self.learning_rate_ph * meankl
                    losses = [optimgain, meankl, entbonus, surrgain, meanent]
                    self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

                    dist = meankl

                    all_var_list = tf_util.get_trainable_vars("model")
                    var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name]
                    vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name]
                    print("policy vars", var_list)

                    all_closed_var_list = tf_util.get_trainable_vars("closedpi")
                    closed_var_list = [v for v in all_closed_var_list if "/vf" not in v.name and "/q" not in v.name]

                    self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                    self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess)

                    klgrads = tf.gradients(dist, var_list)
                    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
                    shapes = [var.get_shape().as_list() for var in var_list]
                    start = 0
                    tangents = []
                    for shape in shapes:
                        var_size = tf_util.intprod(shape)
                        tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape))
                        start += var_size
                    gvp = tf.add_n([tf.reduce_sum(grad * tangent)
                                    for (grad, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
                    fvp = tf_util.flatgrad(gvp, var_list)

                    # tf.summary.scalar('entropy_loss', meanent)
                    # tf.summary.scalar('policy_gradient_loss', optimgain)
                    # tf.summary.scalar('value_function_loss', surrgain)
                    # tf.summary.scalar('approximate_kullback-leibler', meankl)
                    # tf.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent)

                    self.assign_old_eq_new = \
                        tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                          zipsame(tf_util.get_globals_vars("oldpi"),
                                                                  tf_util.get_globals_vars("model"))])
                    self.compute_losses = tf_util.function([observation, self.old_policy.obs_ph, self.action, self.atarg, self.learning_rate_ph, self.vtarg], losses)
                    self.compute_fvp = tf_util.function([flat_tangent, observation, self.old_policy.obs_ph, self.action, self.atarg],
                                                        fvp)
                    self.compute_vflossandgrad = tf_util.function([observation, self.old_policy.obs_ph, self.ret, self.old_vpred_ph, self.clip_range_vf_ph],
                                                                  tf_util.flatgrad(vferr, vf_var_list))

                    grads = tf.gradients(-optimgain, var_list)
                    grads, _grad_norm = tf.clip_by_global_norm(grads, 0.5)
                    trainer = tf.train.AdamOptimizer(learning_rate=self.outer_learning_rate_ph, epsilon=1e-5)
                    # trainer = tf.train.AdamOptimizer(learning_rate=3e-4, epsilon=1e-5)
                    grads = list(zip(grads, var_list))
                    self._train = trainer.apply_gradients(grads)

                    @contextmanager
                    def timed(msg):
                        if self.rank == 0 and self.verbose >= 1:
                            # print(colorize(msg, color='magenta'))
                            # start_time = time.time()
                            yield
                            # print(colorize("done in {:.3f} seconds".format((time.time() - start_time)),
                            #                color='magenta'))
                        else:
                            yield

                    def allmean(arr):
                        assert isinstance(arr, np.ndarray)
                        out = np.empty_like(arr)
                        MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                        out /= self.nworkers
                        return out

                    tf_util.initialize(sess=self.sess)

                    th_init = self.get_flat()
                    MPI.COMM_WORLD.Bcast(th_init, root=0)
                    self.set_from_flat(th_init)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                    if self.using_gail or self.using_mdal:
                        self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess)
                        self.d_adam.sync()
                    self.vfadam.sync()

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.ret))
                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(self.atarg))
                    tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards', self.ret)
                        tf.summary.histogram('learning_rate', self.vf_stepsize)
                        tf.summary.histogram('advantage', self.atarg)
                        tf.summary.histogram('kl_clip_range', self.max_kl)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', observation)
                        else:
                            tf.summary.histogram('observation', observation)

                self.timed = timed
                self.allmean = allmean

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi")
                if self.using_gail:
                    self.params.extend(self.reward_giver.get_trainable_variables())

                self.summary = tf.summary.merge_all()

                self.compute_lossandgrad = \
                    tf_util.function([observation, self.old_policy.obs_ph, self.action, self.atarg, self.ret, self.learning_rate_ph, self.vtarg, self.closed_policy.obs_ph],
                                     [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)