Example #1
    def update_opt(self,
                   loss,
                   target,
                   inputs,
                   extra_inputs=None,
                   name='LBFGSOptimizer',
                   **kwargs):
        """Construct operation graph for the optimizer.

        Args:
            loss (tf.Tensor): Loss objective to minimize.
            target (object): Target object to optimize. The object should
                implement `get_params()` and `get_param_values()`.
            inputs (list[tf.Tensor]): List of input placeholders.
            extra_inputs (list[tf.Tensor]): List of extra input placeholders.
            name (str): Name scope.
            kwargs (dict): Extra unused keyword arguments. Some optimizers
                have extra inputs, e.g. a KL constraint.

        """
        del kwargs
        self._target = target
        params = target.get_params()
        with tf.name_scope(name):

            def get_opt_output():
                """Helper function to construct graph.

                Returns:
                    list[tf.Tensor]: Loss and gradient tensor.

                """
                with tf.name_scope('get_opt_output'):
                    flat_grad = flatten_tensor_variables(
                        tf.gradients(loss, params))
                    return [
                        tf.cast(loss, tf.float64),
                        tf.cast(flat_grad, tf.float64)
                    ]

            if extra_inputs is None:
                extra_inputs = list()

            self._opt_fun = LazyDict(
                f_loss=lambda: compile_function(inputs + extra_inputs, loss),
                f_opt=lambda: compile_function(
                    inputs=inputs + extra_inputs,
                    outputs=get_opt_output(),
                ))
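All of these examples lean on two helpers that are not shown here: `compile_function` and `LazyDict`. As a point of reference, the following is a minimal sketch of what they are assumed to do (the real implementations may differ): `compile_function` closes over `session.run`, and `LazyDict` builds its values lazily from zero-argument callables.

import tensorflow as tf


def compile_function(inputs, outputs):
    """Assumed behaviour: bind placeholders to a session.run call."""

    def run(*input_vals):
        sess = tf.compat.v1.get_default_session()
        return sess.run(outputs, feed_dict=dict(zip(inputs, input_vals)))

    return run


class LazyDict:
    """Assumed behaviour: values are thunks, evaluated and cached on access."""

    def __init__(self, **thunks):
        self._thunks = thunks
        self._cache = {}

    def __getitem__(self, key):
        if key not in self._cache:
            self._cache[key] = self._thunks[key]()
        return self._cache[key]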
Example #2
    def update_opt(self, loss, target, inputs, extra_inputs=None, **kwargs):
        """Construct operation graph for the optimizer.

        Args:
            loss (tf.Tensor): Loss objective to minimize.
            target (object): Target object to optimize. The object should
                implement `get_params()` and `get_param_values()`.
            inputs (list[tf.Tensor]): List of input placeholders.
            extra_inputs (list[tf.Tensor]): List of extra input placeholders.
            kwargs (dict): Extra unused keyword arguments. Some optimizers
                have extra inputs, e.g. a KL constraint.

        """
        del kwargs
        with tf.name_scope(self._name):
            self._target = target
            tf_optimizer = make_optimizer(self._tf_optimizer,
                                          **self._learning_rate)
            self._train_op = tf_optimizer.minimize(
                loss, var_list=target.get_params())

            if extra_inputs is None:
                extra_inputs = list()
            self._input_vars = inputs + extra_inputs
            self._opt_fun = LazyDict(
                f_loss=lambda: compile_function(inputs + extra_inputs, loss), )
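A hedged sketch of how an optimizer wired up this way might be driven. The names `optimizer`, `obs_ph`, `returns_ph`, and the batches are hypothetical, and the class presumably exposes higher-level `loss()`/`optimize()` wrappers around these members rather than having callers touch them directly.

# Hypothetical driving code; assumes update_opt() above was called with
# inputs=[obs_ph, returns_ph] and that a default TF session is active.
loss_before = optimizer._opt_fun['f_loss'](obs_batch, returns_batch)
sess = tf.compat.v1.get_default_session()
sess.run(optimizer._train_op,
         feed_dict=dict(zip(optimizer._input_vars,
                            [obs_batch, returns_batch])))
loss_after = optimizer._opt_fun['f_loss'](obs_batch, returns_batch)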
Example #3
    def _initialize(self):
        """Build the model graph, loss, and optimizer inputs."""
        input_var = tf.compat.v1.placeholder(tf.float32,
                                             shape=(None, ) +
                                             self._input_shape)
        ys_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                          name='ys',
                                          shape=(None, self._output_dim))

        self._old_network = self._old_model.build(input_var)
        (norm_dist, norm_mean, norm_log_std, _, mean, _, self._x_mean,
         self._x_std, self._y_mean,
         self._y_std) = self.build(input_var).outputs

        normalized_ys_var = (ys_var - self._y_mean) / self._y_std
        old_normalized_dist = self._old_network.normalized_dist

        mean_kl = tf.reduce_mean(old_normalized_dist.kl_divergence(norm_dist))
        loss = -tf.reduce_mean(norm_dist.log_prob(normalized_ys_var))
        self._f_predict = compile_function([input_var], mean)
        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[norm_mean, norm_log_std],
        )
        if self._use_trust_region:
            optimizer_args['leq_constraint'] = (mean_kl, self._max_kl_step)
        optimizer_args['inputs'] = [input_var, ys_var]
        with tf.name_scope('update_opt'):
            self._optimizer.update_opt(**optimizer_args)
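The loss above is the mean negative log-likelihood of the normalized targets under the predicted distribution. For reference, a plain-TF sketch of that quantity for a diagonal Gaussian parameterized by `norm_mean` and `norm_log_std` (hypothetical stand-ins for the network outputs built above):

import numpy as np
import tensorflow as tf


def gaussian_nll(normalized_ys, norm_mean, norm_log_std):
    """Mean negative log-likelihood under a diagonal Gaussian (sketch)."""
    z = (normalized_ys - norm_mean) / tf.exp(norm_log_std)
    log_prob = -0.5 * tf.reduce_sum(
        tf.square(z) + 2. * norm_log_std + np.log(2. * np.pi), axis=-1)
    return -tf.reduce_mean(log_prob)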
Example #4
    def _build_entropy_term(self, i):
        """Build policy entropy tensor.

        Args:
            i (namedtuple): Collection of variables to compute policy loss.

        Returns:
            tf.Tensor: Policy entropy.

        """
        pol_dist = self._policy_network.dist

        with tf.name_scope('policy_entropy'):
            if self._use_neg_logli_entropy:
                policy_entropy = -pol_dist.log_prob(i.action_var,
                                                    name='policy_log_likeli')
            else:
                policy_entropy = pol_dist.entropy()

            # This prevents entropy from becoming negative for small policy std
            if self._use_softplus_entropy:
                policy_entropy = tf.nn.softplus(policy_entropy)

            if self._stop_entropy_gradient:
                policy_entropy = tf.stop_gradient(policy_entropy)

        # dense form, match the shape of advantage
        policy_entropy = tf.reshape(policy_entropy,
                                    [-1, self.max_episode_length])

        self._f_policy_entropy = compile_function(
            flatten_inputs(self._policy_opt_inputs), policy_entropy)

        return policy_entropy
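When `_use_neg_logli_entropy` is set, the entropy is estimated per sample as `-log pi(a|s)` instead of being computed analytically. A hedged sketch of the two variants for a diagonal Gaussian policy, where `log_std` and `log_prob_of_taken_actions` are hypothetical tensors:

import numpy as np
import tensorflow as tf


def entropy_terms(log_std, log_prob_of_taken_actions):
    """Sketch of the analytic vs. sample-based entropy used above."""
    # Analytic entropy of a diagonal Gaussian:
    # 0.5 * sum(log(2 * pi * e) + 2 * log_std).
    analytic = 0.5 * tf.reduce_sum(
        np.log(2. * np.pi * np.e) + 2. * log_std, axis=-1)
    # Per-sample estimate: noisier, but cheap and distribution-agnostic.
    sampled = -log_prob_of_taken_actions
    # softplus keeps small or negative estimates positive; stop_gradient
    # turns the bonus into a constant w.r.t. the policy parameters.
    return analytic, tf.stop_gradient(tf.nn.softplus(sampled))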
Example #5
    def _build_encoder_kl(self):
        """Build graph for encoder KL divergence.

        Returns:
            tf.Tensor: Encoder KL divergence.

        """
        dist = self._encoder_network.dist
        old_dist = self._old_encoder_network.dist

        with tf.name_scope('encoder_kl'):
            kl = old_dist.kl_divergence(dist)
            mean_kl = tf.reduce_mean(kl)

            # Diagnostic function
            self._f_encoder_kl = compile_function(
                flatten_inputs(self._policy_opt_inputs), mean_kl)

            return mean_kl
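`kl_divergence` is supplied by the distribution objects. For diagonal Gaussians, which is presumably what the encoder outputs, it reduces to the closed form sketched below.

import tensorflow as tf


def diag_gaussian_kl(old_mean, old_log_std, new_mean, new_log_std):
    """Closed-form KL(old || new) between diagonal Gaussians (sketch)."""
    old_var = tf.exp(2. * old_log_std)
    new_var = tf.exp(2. * new_log_std)
    kl = (new_log_std - old_log_std +
          (old_var + tf.square(old_mean - new_mean)) / (2. * new_var) - 0.5)
    return tf.reduce_sum(kl, axis=-1)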
Example #6
    def _initialize(self):
        """Build the model graph, MSE loss, and optimizer inputs."""
        input_var = tf.compat.v1.placeholder(tf.float32,
                                             shape=(None, ) +
                                             self._input_shape)

        ys_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                          name='ys',
                                          shape=(None, self._output_dim))

        (self._y_hat, self._x_mean,
         self._x_std) = self.build(input_var).outputs

        loss = tf.reduce_mean(tf.square(self._y_hat - ys_var))
        self._f_predict = compile_function([input_var], self._y_hat)
        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[ys_var],
        )
        optimizer_args['inputs'] = [input_var, ys_var]
        with tf.name_scope('update_opt'):
            self._optimizer.update_opt(**optimizer_args)
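Outside the class, the same pattern (placeholders, an MSE loss, a compiled predict function) fits in a few lines. A self-contained sketch, reusing the assumed `compile_function` sketched under Example #1:

import numpy as np
import tensorflow as tf

tf.compat.v1.disable_eager_execution()

x_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, 3), name='x')
y_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='y')
y_hat = tf.compat.v1.layers.dense(x_ph, 1)

loss = tf.reduce_mean(tf.square(y_hat - y_ph))
train_op = tf.compat.v1.train.AdamOptimizer(1e-3).minimize(loss)
f_predict = compile_function([x_ph], y_hat)

xs = np.random.randn(128, 3).astype(np.float32)
ys = xs.sum(axis=1, keepdims=True)

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    for _ in range(200):
        sess.run(train_op, feed_dict={x_ph: xs, y_ph: ys})
    print(f_predict(xs[:3]))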
Example #7
    def _init_opt(self):
        """Build the loss function and init the optimizer."""
        with tf.name_scope(self._name):
            # Create target policy (actor) and qf (critic) networks
            with tf.name_scope('inputs'):
                obs_dim = self._env_spec.observation_space.flat_dim
                y = tf.compat.v1.placeholder(tf.float32,
                                             shape=(None, 1),
                                             name='input_y')
                obs = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, obs_dim),
                                               name='input_observation')
                actions = tf.compat.v1.placeholder(
                    tf.float32,
                    shape=(None, self._env_spec.action_space.flat_dim),
                    name='input_action')

            policy_network_outputs = self._target_policy.build(obs,
                                                               name='policy')
            target_qf_outputs = self._target_qf.build(obs, actions, name='qf')
            target_qf2_outputs = self._target_qf2.build(obs,
                                                        actions,
                                                        name='qf')

            self._target_policy_f_prob_online = compile_function(
                inputs=[obs], outputs=policy_network_outputs)

            self._target_qf_f_prob_online = compile_function(
                inputs=[obs, actions], outputs=target_qf_outputs)

            self._target_qf2_f_prob_online = compile_function(
                inputs=[obs, actions], outputs=target_qf2_outputs)

            # Set up target init and update functions
            with tf.name_scope('setup_target'):
                policy_init_op, policy_update_op = get_target_ops(
                    self.policy.get_global_vars(),
                    self._target_policy.get_global_vars(), self._tau)
                qf_init_ops, qf_update_ops = get_target_ops(
                    self.qf.get_global_vars(),
                    self._target_qf.get_global_vars(), self._tau)
                qf2_init_ops, qf2_update_ops = get_target_ops(
                    self.qf2.get_global_vars(),
                    self._target_qf2.get_global_vars(), self._tau)
                target_init_op = policy_init_op + qf_init_ops + qf2_init_ops
                target_update_op = (policy_update_op + qf_update_ops +
                                    qf2_update_ops)

            f_init_target = compile_function(inputs=[], outputs=target_init_op)
            f_update_target = compile_function(inputs=[],
                                               outputs=target_update_op)

            # Set up policy training function
            next_action = self.policy.build(obs, name='policy_action')
            next_qval = self.qf.build(obs,
                                      next_action,
                                      name='policy_action_qval')
            with tf.name_scope('action_loss'):
                action_loss = -tf.reduce_mean(next_qval)

            with tf.name_scope('minimize_action_loss'):
                policy_optimizer = make_optimizer(
                    self._policy_optimizer,
                    learning_rate=self._policy_lr,
                    name='PolicyOptimizer')
                policy_train_op = policy_optimizer.minimize(
                    action_loss, var_list=self.policy.get_trainable_vars())

            f_train_policy = compile_function(
                inputs=[obs], outputs=[policy_train_op, action_loss])

            # Set up qf training function
            qval = self.qf.build(obs, actions, name='q_value')
            q2val = self.qf2.build(obs, actions, name='q2_value')
            with tf.name_scope('qval1_loss'):
                qval1_loss = tf.reduce_mean(tf.math.squared_difference(
                    y, qval))
            with tf.name_scope('qval2_loss'):
                qval2_loss = tf.reduce_mean(
                    tf.math.squared_difference(y, q2val))

            with tf.name_scope('minimize_qf_loss'):
                qf_optimizer = make_optimizer(self._qf_optimizer,
                                              learning_rate=self._qf_lr,
                                              name='QFunctionOptimizer')
                qf_train_op = qf_optimizer.minimize(
                    qval1_loss, var_list=self.qf.get_trainable_vars())
                qf2_train_op = qf_optimizer.minimize(
                    qval2_loss, var_list=self.qf2.get_trainable_vars())

            f_train_qf = compile_function(
                inputs=[y, obs, actions],
                outputs=[qf_train_op, qval1_loss, qval])
            f_train_qf2 = compile_function(
                inputs=[y, obs, actions],
                outputs=[qf2_train_op, qval2_loss, q2val])

            self._f_train_policy = f_train_policy
            self._f_train_qf = f_train_qf
            self._f_init_target = f_init_target
            self._f_update_target = f_update_target
            self._f_train_qf2 = f_train_qf2
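`get_target_ops` is not shown. Assuming the usual hard-init / Polyak-averaging pattern implied by how its two return values are used here and in Example #9, it presumably builds something like the following. Example #12 calls it without `tau` and uses the result as a single list of update ops, so that variant presumably just returns the hard-copy ops.

import tensorflow as tf


def get_target_ops(variables, target_variables, tau):
    """Sketch: hard-copy init ops and soft (Polyak) update ops."""
    init_ops = [
        tf.compat.v1.assign(t, v)
        for v, t in zip(variables, target_variables)
    ]
    update_ops = [
        tf.compat.v1.assign(t, tau * v + (1. - tau) * t)
        for v, t in zip(variables, target_variables)
    ]
    return init_ops, update_ops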
Example #8
    def _build_entropy_terms(self, i):
        """Build policy entropy tensor.

        Args:
            i (namedtuple): Collection of variables to compute policy loss.

        Returns:
            tf.Tensor: Policy entropy.

        """
        pol_dist = self._policy_network.dist
        infer_dist = self._infer_network.dist
        enc_dist = self._encoder_network.dist
        with tf.name_scope('entropy_terms'):
            # 1. Encoder distribution total entropy
            with tf.name_scope('encoder_entropy'):
                encoder_dist, _, _ = self.policy.encoder.build(
                    i.task_var, name='encoder_entropy').outputs
                encoder_all_task_entropies = -encoder_dist.log_prob(
                    i.latent_var)

                if self._use_softplus_entropy:
                    encoder_entropy = tf.nn.softplus(
                        encoder_all_task_entropies)
                else:
                    encoder_entropy = encoder_all_task_entropies

                encoder_entropy = tf.reduce_mean(encoder_entropy,
                                                 name='encoder_entropy')
                encoder_entropy = tf.stop_gradient(encoder_entropy)

            # 2. Inference distribution cross-entropy (log-likelihood)
            with tf.name_scope('inference_ce'):
                # Build inference with trajectory windows

                traj_ll = infer_dist.log_prob(
                    enc_dist.sample(seed=deterministic.get_tf_seed_stream()),
                    name='traj_ll')

                inference_ce_raw = -traj_ll
                inference_ce = tf.clip_by_value(inference_ce_raw, -3, 3)

                if self._use_softplus_entropy:
                    inference_ce = tf.nn.softplus(inference_ce)

                if self._stop_ce_gradient:
                    inference_ce = tf.stop_gradient(inference_ce)

            # 3. Policy path entropies
            with tf.name_scope('policy_entropy'):
                policy_entropy = -pol_dist.log_prob(i.action_var,
                                                    name='policy_log_likeli')

                # This prevents entropy from becoming negative
                # for small policy std
                if self._use_softplus_entropy:
                    policy_entropy = tf.nn.softplus(policy_entropy)

                policy_entropy = tf.stop_gradient(policy_entropy)

        # Diagnostic functions
        self._f_task_entropies = compile_function(
            flatten_inputs(self._policy_opt_inputs),
            encoder_all_task_entropies)
        self._f_encoder_entropy = compile_function(
            flatten_inputs(self._policy_opt_inputs), encoder_entropy)
        self._f_inference_ce = compile_function(
            flatten_inputs(self._policy_opt_inputs),
            tf.reduce_mean(inference_ce * i.valid_var))
        self._f_policy_entropy = compile_function(
            flatten_inputs(self._policy_opt_inputs), policy_entropy)

        return encoder_entropy, inference_ce, policy_entropy
Example #9
    def _init_opt(self):
        """Build the loss function and init the optimizer."""
        with tf.name_scope(self._name):
            # Create target policy and qf network
            with tf.name_scope('inputs'):
                obs_dim = self._env_spec.observation_space.flat_dim
                input_y = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, 1),
                                                   name='input_y')
                obs = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, obs_dim),
                                               name='input_observation')
                actions = tf.compat.v1.placeholder(
                    tf.float32,
                    shape=(None, self._env_spec.action_space.flat_dim),
                    name='input_action')

            policy_network_outputs = self._target_policy.build(obs,
                                                               name='policy')
            target_qf_outputs = self._target_qf.build(obs, actions, name='qf')

            self._target_policy_f_prob_online = compile_function(
                inputs=[obs], outputs=policy_network_outputs)
            self._target_qf_f_prob_online = compile_function(
                inputs=[obs, actions], outputs=target_qf_outputs)

            # Set up target init and update function
            with tf.name_scope('setup_target'):
                ops = get_target_ops(self.policy.get_global_vars(),
                                     self._target_policy.get_global_vars(),
                                     self._tau)
                policy_init_ops, policy_update_ops = ops
                qf_init_ops, qf_update_ops = get_target_ops(
                    self._qf.get_global_vars(),
                    self._target_qf.get_global_vars(), self._tau)
                target_init_op = policy_init_ops + qf_init_ops
                target_update_op = policy_update_ops + qf_update_ops

            f_init_target = compile_function(inputs=[], outputs=target_init_op)
            f_update_target = compile_function(inputs=[],
                                               outputs=target_update_op)

            # Set up policy training function
            next_action = self.policy.build(obs, name='policy_action')
            next_qval = self._qf.build(obs,
                                       next_action,
                                       name='policy_action_qval')
            with tf.name_scope('action_loss'):
                action_loss = -tf.reduce_mean(next_qval)
                if self._policy_weight_decay > 0.:
                    regularizer = tf.keras.regularizers.l2(
                        self._policy_weight_decay)
                    for var in self.policy.get_regularizable_vars():
                        policy_reg = regularizer(var)
                        action_loss += policy_reg

            with tf.name_scope('minimize_action_loss'):
                policy_optimizer = make_optimizer(
                    self._policy_optimizer,
                    learning_rate=self._policy_lr,
                    name='PolicyOptimizer')
                policy_train_op = policy_optimizer.minimize(
                    action_loss, var_list=self.policy.get_trainable_vars())

            f_train_policy = compile_function(
                inputs=[obs], outputs=[policy_train_op, action_loss])

            # Set up qf training function
            qval = self._qf.build(obs, actions, name='q_value')
            with tf.name_scope('qval_loss'):
                qval_loss = tf.reduce_mean(
                    tf.compat.v1.squared_difference(input_y, qval))
                if self._qf_weight_decay > 0.:
                    regularizer = tf.keras.regularizers.l2(
                        self._qf_weight_decay)
                    for var in self._qf.get_regularizable_vars():
                        qf_reg = regularizer(var)
                        qval_loss += qf_reg

            with tf.name_scope('minimize_qf_loss'):
                qf_optimizer = make_optimizer(self._qf_optimizer,
                                              learning_rate=self._qf_lr,
                                              name='QFunctionOptimizer')
                qf_train_op = qf_optimizer.minimize(
                    qval_loss, var_list=self._qf.get_trainable_vars())

            f_train_qf = compile_function(
                inputs=[input_y, obs, actions],
                outputs=[qf_train_op, qval_loss, qval])

            self._f_train_policy = f_train_policy
            self._f_train_qf = f_train_qf
            self._f_init_target = f_init_target
            self._f_update_target = f_update_target
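The weight-decay branches rely on `tf.keras.regularizers.l2`, which for a single variable evaluates to `decay * sum(square(var))`. Written out explicitly (a sketch; `variables` and `decay` stand in for the regularizable variables and decay coefficients above):

import tensorflow as tf


def l2_weight_decay(variables, decay):
    """Sketch of the total penalty the keras L2 regularizer adds."""
    return tf.add_n(
        [decay * tf.reduce_sum(tf.square(v)) for v in variables])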
Example #10
    def _build_policy_loss(self, i):
        """Build policy loss and other output tensors.

        Args:
            i (namedtuple): Collection of variables to compute policy loss.

        Returns:
            tf.Tensor: Policy loss.
            tf.Tensor: Mean policy KL divergence.

        Raises:
            NotImplementedError: If is_recurrent is True.

        """
        pol_dist = self._policy_network.dist
        old_pol_dist = self._old_policy_network.dist

        # Initialize dual params
        self._param_eta = 15.
        self._param_v = np.random.rand(
            self._env_spec.observation_space.flat_dim * 2 + 4)

        with tf.name_scope('bellman_error'):
            delta_v = tf.boolean_mask(i.reward_var,
                                      i.valid_var) + tf.tensordot(
                                          i.feat_diff, i.param_v, 1)

        with tf.name_scope('policy_loss'):
            ll = pol_dist.log_prob(i.action_var)
            ll = tf.boolean_mask(ll, i.valid_var)
            loss = -tf.reduce_mean(
                ll * tf.exp(delta_v / i.param_eta -
                            tf.reduce_max(delta_v / i.param_eta)))

            reg_params = self.policy.get_regularizable_vars()
            loss += self._l2_reg_loss * tf.reduce_sum(
                [tf.reduce_mean(tf.square(param))
                 for param in reg_params]) / len(reg_params)

        with tf.name_scope('kl'):
            kl = old_pol_dist.kl_divergence(pol_dist)
            pol_mean_kl = tf.reduce_mean(kl)

        with tf.name_scope('dual'):
            dual_loss = i.param_eta * self._epsilon + (
                i.param_eta * tf.math.log(
                    tf.reduce_mean(
                        tf.exp(delta_v / i.param_eta -
                               tf.reduce_max(delta_v / i.param_eta)))) +
                i.param_eta * tf.reduce_max(delta_v / i.param_eta))

            dual_loss += self._l2_reg_dual * (tf.square(i.param_eta) +
                                              tf.square(1 / i.param_eta))

            dual_grad = tf.gradients(dual_loss, [i.param_eta, i.param_v])

        self._f_dual = compile_function(
            flatten_inputs(self._dual_opt_inputs),
            dual_loss)

        self._f_dual_grad = compile_function(
            flatten_inputs(self._dual_opt_inputs),
            dual_grad)

        self._f_policy_kl = compile_function(
            flatten_inputs(self._policy_opt_inputs),
            pol_mean_kl)

        return loss
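Both the policy loss and the dual objective subtract `tf.reduce_max(delta_v / eta)` before exponentiating and add the shift back afterwards; this is the standard log-sum-exp stabilization and leaves the value unchanged. A small numpy illustration:

import numpy as np


def log_mean_exp(x):
    """Numerically stable log(mean(exp(x))): shift by max(x), add it back."""
    m = np.max(x)
    return np.log(np.mean(np.exp(x - m))) + m


# eta * log_mean_exp(delta_v / eta) is exactly the dual term built above.
delta_v_over_eta = np.array([1000., 1001., 1002.])  # naive exp() would overflow
print(log_mean_exp(delta_v_over_eta))               # ~1001.31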
Example #11
    def update_opt(self,
                   loss,
                   target,
                   leq_constraint,
                   inputs,
                   constraint_name='constraint',
                   name='PenaltyLBFGSOptimizer',
                   **kwargs):
        """Construct operation graph for the optimizer.

        Args:
            loss (tf.Tensor): Loss objective to minimize.
            target (object): Target object to optimize. The object should
                implement `get_params()` and `get_param_values()`.
            leq_constraint (tuple): It contains a tf.Tensor and a float value.
                The tf.Tensor represents the constraint term, and the float
                value is the constraint value.
            inputs (list[tf.Tensor]): List of input placeholders.
            constraint_name (str): Constraint name for logging.
            name (str): Name scope.
            kwargs (dict): Extra unused keyword arguments. Some optimizers
                have extra inputs, e.g. a KL constraint.

        """
        del kwargs
        params = target.get_params()
        with tf.name_scope(name):
            constraint_term, constraint_value = leq_constraint
            penalty_var = tf.compat.v1.placeholder(tf.float32,
                                                   tuple(),
                                                   name='penalty')
            penalized_loss = loss + penalty_var * constraint_term

            self._target = target
            self._max_constraint_val = constraint_value
            self._constraint_name = constraint_name

            def get_opt_output():
                """Helper function to construct graph.

                Returns:
                    list[tf.Tensor]: Penalized loss and gradient tensor.

                """
                with tf.name_scope('get_opt_output'):
                    grads = tf.gradients(penalized_loss, params)
                    for idx, (grad, param) in enumerate(zip(grads, params)):
                        if grad is None:
                            grads[idx] = tf.zeros_like(param)
                    flat_grad = flatten_tensor_variables(grads)
                    return [
                        tf.cast(penalized_loss, tf.float64),
                        tf.cast(flat_grad, tf.float64),
                    ]

            self._opt_fun = LazyDict(
                f_loss=lambda: compile_function(inputs, loss),
                f_constraint=lambda: compile_function(inputs, constraint_term),
                f_penalized_loss=lambda: compile_function(
                    inputs=inputs + [penalty_var],
                    outputs=[penalized_loss, loss, constraint_term],
                ),
                f_opt=lambda: compile_function(
                    inputs=inputs + [penalty_var],
                    outputs=get_opt_output(),
                ))
Example #12
    def _init_opt(self):
        """Initialize the networks and Ops.

        Assume discrete space for dqn, so action dimension
        will always be action_space.n
        """
        action_dim = self._env_spec.action_space.n

        # build q networks
        with tf.name_scope(self._name):
            action_t_ph = tf.compat.v1.placeholder(tf.int32,
                                                   None,
                                                   name='action')
            reward_t_ph = tf.compat.v1.placeholder(tf.float32,
                                                   None,
                                                   name='reward')
            done_t_ph = tf.compat.v1.placeholder(tf.float32, None, name='done')

            with tf.name_scope('update_ops'):
                target_update_op = get_target_ops(
                    self._qf.get_global_vars(),
                    self._target_qf.get_global_vars())

            self._qf_update_ops = compile_function(inputs=[],
                                                   outputs=target_update_op)

            with tf.name_scope('td_error'):
                # Q-value of the selected action
                action = tf.one_hot(action_t_ph,
                                    action_dim,
                                    on_value=1.,
                                    off_value=0.)
                q_selected = tf.reduce_sum(
                    self._qf.q_vals * action,  # yapf: disable
                    axis=1)

                # r + Q'(s', argmax_a(Q(s', _))) - Q(s, a)
                if self._double_q:
                    target_qval_with_online_q = self._qf.build(
                        self._target_qf.input, self._qf.name)
                    future_best_q_val_action = tf.argmax(
                        target_qval_with_online_q, 1)
                    future_best_q_val = tf.reduce_sum(
                        self._target_qf.q_vals *
                        tf.one_hot(future_best_q_val_action,
                                   action_dim,
                                   on_value=1.,
                                   off_value=0.),
                        axis=1)
                else:
                    # r + max_a(Q'(s', _)) - Q(s, a)
                    future_best_q_val = tf.reduce_max(self._target_qf.q_vals,
                                                      axis=1)

                q_best_masked = (1.0 - done_t_ph) * future_best_q_val
                # if done, it's just reward
                # else reward + discount * future_best_q_val
                target_q_values = (reward_t_ph +
                                   self._discount * q_best_masked)

                # td_error = q_selected - tf.stop_gradient(target_q_values)
                loss = tf.compat.v1.losses.huber_loss(
                    q_selected, tf.stop_gradient(target_q_values))
                loss = tf.reduce_mean(loss)

            with tf.name_scope('optimize_ops'):
                qf_optimizer = make_optimizer(self._qf_optimizer,
                                              learning_rate=self._qf_lr)
                if self._grad_norm_clipping is not None:
                    gradients = qf_optimizer.compute_gradients(
                        loss, var_list=self._qf.get_trainable_vars())
                    for i, (grad, var) in enumerate(gradients):
                        if grad is not None:
                            gradients[i] = (tf.clip_by_norm(
                                grad, self._grad_norm_clipping), var)
                    optimize_loss = qf_optimizer.apply_gradients(gradients)
                else:
                    optimize_loss = qf_optimizer.minimize(
                        loss, var_list=self._qf.get_trainable_vars())

            self._train_qf = compile_function(inputs=[
                self._qf.input, action_t_ph, reward_t_ph, done_t_ph,
                self._target_qf.input
            ],
                                              outputs=[loss, optimize_loss])
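The `td_error` block has two target variants. A numpy sketch of what each produces for a small batch (all arrays are hypothetical and purely illustrative):

import numpy as np

rng = np.random.default_rng(0)
q_online = rng.normal(size=(4, 3))   # Q(s', .) from the online network
q_target = rng.normal(size=(4, 3))   # Q'(s', .) from the target network
reward = rng.normal(size=4)
done = np.array([0., 0., 1., 0.])
discount = 0.99

# Double DQN: pick the action with the online net, score it with the target net.
best_action = np.argmax(q_online, axis=1)
double_q = q_target[np.arange(4), best_action]

# Vanilla DQN: max over the target net directly.
vanilla_q = np.max(q_target, axis=1)

# Either way, the bootstrap term is masked out on terminal transitions.
y = reward + discount * (1. - done) * double_q
print(y)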