Example 1
    def create_tf_operations(self, config):
        """
        Creates TRPO training operations, i.e. the natural gradient update step
        based on the KL divergence constraint between new and old policy.
        :return:
        """
        super(TRPOModel, self).create_tf_operations(config)

        with tf.variable_scope('update'):
            log_probs = list()
            prob_ratios = list()
            kl_divs = list()

            # for diagnostics
            kl_divergences = list()
            entropies = list()
            self.distribution_tensors = dict()
            self.prev_distribution_tensors = dict()

            for name, action in self.action.items():
                shape_size = util.prod(config.actions[name].shape)
                distribution = self.distribution[name]
                fixed_distribution = distribution.__class__.from_tensors(
                    tensors=[
                        tf.stop_gradient(x)
                        for x in distribution.get_tensors()
                    ],
                    deterministic=self.deterministic)

                log_prob = distribution.log_probability(action=action)
                log_prob = tf.reshape(tensor=log_prob, shape=(-1, shape_size))
                log_probs.append(log_prob)

                fixed_log_prob = fixed_distribution.log_probability(
                    action=action)
                fixed_log_prob = tf.reshape(tensor=fixed_log_prob,
                                            shape=(-1, shape_size))

                log_prob_diff = log_prob - fixed_log_prob
                prob_ratio = tf.exp(x=log_prob_diff)
                prob_ratios.append(prob_ratio)

                kl_div = fixed_distribution.kl_divergence(other=distribution)
                kl_div = tf.reshape(tensor=kl_div, shape=(-1, shape_size))
                kl_divs.append(kl_div)

                self.distribution_tensors[name] = list(
                    distribution.get_tensors())
                prev_distribution = list(
                    tf.placeholder(dtype=tf.float32,
                                   shape=util.shape(tensor, unknown=None))
                    for tensor in distribution.get_tensors())
                self.prev_distribution_tensors[name] = prev_distribution
                prev_distribution = distribution.from_tensors(
                    tensors=prev_distribution,
                    deterministic=self.deterministic)

                kl_divergence = prev_distribution.kl_divergence(
                    other=distribution)
                kl_divergence = tf.reshape(tensor=kl_divergence,
                                           shape=(-1, shape_size))
                kl_divergences.append(kl_divergence)

                entropy = distribution.entropy()
                entropy = tf.reshape(tensor=entropy, shape=(-1, shape_size))
                entropies.append(entropy)

            self.log_prob = tf.reduce_mean(input_tensor=tf.concat(
                values=log_probs, axis=1),
                                           axis=1)

            prob_ratio = tf.reduce_mean(input_tensor=tf.concat(
                values=prob_ratios, axis=1),
                                        axis=1)
            self.loss_per_instance = -prob_ratio * self.reward
            self.surrogate_loss = tf.reduce_mean(
                input_tensor=self.loss_per_instance, axis=0)

            kl_div = tf.reduce_mean(input_tensor=tf.concat(values=kl_divs,
                                                           axis=1),
                                    axis=1)

            # Get symbolic gradient expressions
            variables = list(
                tf.trainable_variables()
            )  # TODO: ideally not value function (see also for "gradients" below)
            gradients = tf.gradients(self.surrogate_loss, variables)
            # gradients[0] = tf.Print(gradients[0], (gradients[0],))
            variables = [
                var for var, grad in zip(variables, gradients)
                if grad is not None
            ]
            gradients = [grad for grad in gradients if grad is not None]
            self.policy_gradient = tf.concat(
                values=[tf.reshape(grad, (-1, )) for grad in gradients],
                axis=0)  # util.prod(util.shape(v))

            self.tangent = tf.placeholder(tf.float32, shape=(None, ))
            offset = 0
            tangents = []
            for variable in variables:
                shape = util.shape(variable)
                size = util.prod(shape)
                tangents.append(
                    tf.reshape(self.tangent[offset:offset + size], shape))
                offset += size

            gradients = tf.gradients(kl_div, variables)
            gradient_vector_product = [
                tf.reduce_sum(g * t) for (g, t) in zip(gradients, tangents)
            ]

            self.flat_variable_helper = FlatVarHelper(variables)
            gradients = tf.gradients(gradient_vector_product, variables)
            self.fisher_vector_product = tf.concat(
                values=[tf.reshape(grad, (-1, )) for grad in gradients],
                axis=0)

            self.cg_optimizer = ConjugateGradientOptimizer(
                self.logger, config.cg_iterations)

            kl_divergence = tf.reduce_mean(input_tensor=tf.concat(
                values=kl_divergences, axis=1),
                                           axis=1)
            self.kl_divergence = tf.reduce_mean(input_tensor=kl_divergence,
                                                axis=0)

            entropy = tf.reduce_mean(input_tensor=tf.concat(values=entropies,
                                                            axis=1),
                                     axis=1)
            self.entropy = tf.reduce_mean(input_tensor=entropy, axis=0)
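
The graph above only defines self.fisher_vector_product; the conjugate-gradient iteration itself runs outside TensorFlow, inside ConjugateGradientOptimizer.solve. As a reference point, here is a minimal NumPy sketch of the standard conjugate-gradient solve such an optimizer presumably performs for F x = b, using only Fisher-vector products fvp(p) and never forming F explicitly. The function name, argument names, and the residual tolerance are illustrative assumptions, not the library's actual code.

import numpy as np

def conjugate_gradient(fvp, b, iterations=20, residual_tol=1e-10):
    # Approximately solve F x = b given only the matrix-vector product fvp(p) = F p.
    x = np.zeros_like(b)
    r = b.copy()              # residual b - F x (x starts at zero)
    p = b.copy()              # current search direction
    r_dot_r = r.dot(r)
    for _ in range(iterations):
        fvp_p = fvp(p)                    # F p, evaluated through the graph above
        alpha = r_dot_r / p.dot(fvp_p)    # step length along p
        x += alpha * p
        r -= alpha * fvp_p
        new_r_dot_r = r.dot(r)
        if new_r_dot_r < residual_tol:
            break
        p = r + (new_r_dot_r / r_dot_r) * p   # next conjugate direction
        r_dot_r = new_r_dot_r
    return x

In the later, fuller examples, the callable passed in this role is compute_fvp, which wraps a session.run of self.fisher_vector_product plus a damping term.
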
Example 2
    def create_tf_operations(self, config):
        """
        Creates TRPO training operations, i.e. the natural gradient update step
        based on the KL divergence constraint between new and old policy.
        :return:
        """
        super(TRPOModel, self).create_tf_operations(config)

        with tf.variable_scope('update'):
            losses = list()
            for name, action in config.actions:
                distribution = self.distribution[name]
                previous_distribution = tuple(tf.placeholder(dtype=tf.float32, shape=util.shape(x, unknown=None)) for x in distribution)
                self.internal_inputs.extend(previous_distribution)
                self.internal_outputs.extend(distribution)
                if sum(1 for _ in distribution) == 2:
                    for n, x in enumerate(distribution):
                        if n == 0:
                            self.internal_inits.append(np.zeros(shape=util.shape(x)[1:]))
                        else:
                            self.internal_inits.append(np.ones(shape=util.shape(x)[1:]))
                else:
                    self.internal_inits.extend(np.zeros(shape=util.shape(x)[1:]) for x in distribution)
                previous_distribution = self.distribution[name].__class__(distribution=previous_distribution)

                log_prob = distribution.log_probability(action=self.action[name])
                previous_log_prob = previous_distribution.log_probability(action=self.action[name])
                prob_ratio = tf.minimum(tf.exp(log_prob - previous_log_prob), 1000)

                self.loss_per_instance = tf.multiply(x=prob_ratio, y=self.reward)
                surrogate_loss = -tf.reduce_mean(self.loss_per_instance, axis=0)
                kl_divergence = distribution.kl_divergence(previous_distribution)
                entropy = distribution.entropy()
                losses.append((surrogate_loss, kl_divergence, entropy))

            self.losses = [tf.reduce_mean(loss) for loss in zip(*losses)]

            # Get symbolic gradient expressions
            variables = list(tf.trainable_variables())  # TODO: ideally not value function (see also for "gradients" below)
            gradients = tf.gradients(self.losses, variables)
            variables = [var for var, grad in zip(variables, gradients) if grad is not None]
            gradients = [grad for grad in gradients if grad is not None]
            self.policy_gradient = tf.concat(values=[tf.reshape(grad, (-1,)) for grad in gradients], axis=0)  # util.prod(util.shape(v))

            fixed_distribution = distribution.__class__([tf.stop_gradient(x) for x in distribution])
            fixed_kl_divergence = fixed_distribution.kl_divergence(distribution)

            self.tangent = tf.placeholder(tf.float32, shape=(None,))
            offset = 0
            tangents = []
            for variable in variables:
                shape = util.shape(variable)
                size = util.prod(shape)
                tangents.append(tf.reshape(self.tangent[offset:offset + size], shape))
                offset += size

            gradients = tf.gradients(fixed_kl_divergence, variables)
            gradient_vector_product = [tf.reduce_sum(g * t) for (g, t) in zip(gradients, tangents)]

            self.flat_variable_helper = FlatVarHelper(variables)
            gradients = tf.gradients(gradient_vector_product, variables)
            self.fisher_vector_product = tf.concat(values=[tf.reshape(grad, (-1,)) for grad in gradients], axis=0)

            self.cg_optimizer = ConjugateGradientOptimizer(self.logger, config.cg_iterations)
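
FlatVarHelper is used in these examples but its implementation is not shown. Below is a rough, self-contained sketch of what it presumably provides: get() flattens the selected trainable variables into a single parameter vector, and set() scatters a flat vector back into the variables, which is exactly how the line search and the natural gradient step use it. The name FlatVarHelperSketch and all internals are assumptions, not the library's code; only the get/set/session interface mirrors the call sites.

import numpy as np
import tensorflow as tf

class FlatVarHelperSketch(object):

    def __init__(self, variables):
        self.session = None  # assigned later, as set_session does in the examples
        shapes = [variable.get_shape().as_list() for variable in variables]
        sizes = [int(np.prod(shape)) for shape in shapes]
        total_size = sum(sizes)
        # Flat parameter vector to be written back into the variables.
        self.theta = tf.placeholder(dtype=tf.float32, shape=(total_size,))
        assigns = []
        offset = 0
        for variable, shape, size in zip(variables, shapes, sizes):
            assigns.append(tf.assign(
                ref=variable,
                value=tf.reshape(self.theta[offset:offset + size], shape)))
            offset += size
        self.set_op = tf.group(*assigns)
        self.get_op = tf.concat(
            values=[tf.reshape(variable, (-1,)) for variable in variables], axis=0)

    def get(self):
        return self.session.run(self.get_op)

    def set(self, theta):
        self.session.run(self.set_op, feed_dict={self.theta: theta})
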
Example 3
class TRPOModel(PolicyGradientModel):

    allows_discrete_actions = True
    allows_continuous_actions = True

    default_config = dict(optimizer=None,
                          max_kl_divergence=0.1,
                          cg_iterations=20,
                          cg_damping=0.001,
                          ls_max_backtracks=10,
                          ls_accept_ratio=0.9,
                          ls_override=False)

    def __init__(self, config):
        config.default(TRPOModel.default_config)
        super(TRPOModel, self).__init__(config)
        self.max_kl_divergence = config.max_kl_divergence
        self.cg_damping = config.cg_damping
        self.ls_max_backtracks = config.ls_max_backtracks
        self.ls_accept_ratio = config.ls_accept_ratio
        self.ls_override = config.ls_override

    def create_tf_operations(self, config):
        """
        Creates TRPO training operations, i.e. the natural gradient update step
        based on the KL divergence constraint between new and old policy.
        :return:
        """
        super(TRPOModel, self).create_tf_operations(config)

        with tf.variable_scope('update'):
            log_probs = list()
            prob_ratios = list()
            kl_divs = list()

            # for diagnostics
            kl_divergences = list()
            entropies = list()
            self.distribution_tensors = dict()
            self.prev_distribution_tensors = dict()

            for name, action in self.action.items():
                shape_size = util.prod(config.actions[name].shape)
                distribution = self.distribution[name]
                fixed_distribution = distribution.__class__.from_tensors(
                    tensors=[
                        tf.stop_gradient(x)
                        for x in distribution.get_tensors()
                    ],
                    deterministic=self.deterministic)

                log_prob = distribution.log_probability(action=action)
                log_prob = tf.reshape(tensor=log_prob, shape=(-1, shape_size))
                log_probs.append(log_prob)

                fixed_log_prob = fixed_distribution.log_probability(
                    action=action)
                fixed_log_prob = tf.reshape(tensor=fixed_log_prob,
                                            shape=(-1, shape_size))

                log_prob_diff = log_prob - fixed_log_prob
                prob_ratio = tf.exp(x=log_prob_diff)
                prob_ratios.append(prob_ratio)

                kl_div = fixed_distribution.kl_divergence(other=distribution)
                kl_div = tf.reshape(tensor=kl_div, shape=(-1, shape_size))
                kl_divs.append(kl_div)

                self.distribution_tensors[name] = list(
                    distribution.get_tensors())
                prev_distribution = list(
                    tf.placeholder(dtype=tf.float32,
                                   shape=util.shape(tensor, unknown=None))
                    for tensor in distribution.get_tensors())
                self.prev_distribution_tensors[name] = prev_distribution
                prev_distribution = distribution.from_tensors(
                    tensors=prev_distribution,
                    deterministic=self.deterministic)

                kl_divergence = prev_distribution.kl_divergence(
                    other=distribution)
                kl_divergence = tf.reshape(tensor=kl_divergence,
                                           shape=(-1, shape_size))
                kl_divergences.append(kl_divergence)

                entropy = distribution.entropy()
                entropy = tf.reshape(tensor=entropy, shape=(-1, shape_size))
                entropies.append(entropy)

            self.log_prob = tf.reduce_mean(input_tensor=tf.concat(
                values=log_probs, axis=1),
                                           axis=1)

            prob_ratio = tf.reduce_mean(input_tensor=tf.concat(
                values=prob_ratios, axis=1),
                                        axis=1)
            self.loss_per_instance = -prob_ratio * self.reward
            self.surrogate_loss = tf.reduce_mean(
                input_tensor=self.loss_per_instance, axis=0)

            kl_div = tf.reduce_mean(input_tensor=tf.concat(values=kl_divs,
                                                           axis=1),
                                    axis=1)

            # Get symbolic gradient expressions
            variables = list(
                tf.trainable_variables()
            )  # TODO: ideally not value function (see also for "gradients" below)
            gradients = tf.gradients(self.surrogate_loss, variables)
            # gradients[0] = tf.Print(gradients[0], (gradients[0],))
            variables = [
                var for var, grad in zip(variables, gradients)
                if grad is not None
            ]
            gradients = [grad for grad in gradients if grad is not None]
            self.policy_gradient = tf.concat(
                values=[tf.reshape(grad, (-1, )) for grad in gradients],
                axis=0)  # util.prod(util.shape(v))

            self.tangent = tf.placeholder(tf.float32, shape=(None, ))
            offset = 0
            tangents = []
            for variable in variables:
                shape = util.shape(variable)
                size = util.prod(shape)
                tangents.append(
                    tf.reshape(self.tangent[offset:offset + size], shape))
                offset += size

            gradients = tf.gradients(kl_div, variables)
            gradient_vector_product = [
                tf.reduce_sum(g * t) for (g, t) in zip(gradients, tangents)
            ]

            self.flat_variable_helper = FlatVarHelper(variables)
            gradients = tf.gradients(gradient_vector_product, variables)
            self.fisher_vector_product = tf.concat(
                values=[tf.reshape(grad, (-1, )) for grad in gradients],
                axis=0)

            self.cg_optimizer = ConjugateGradientOptimizer(
                self.logger, config.cg_iterations)

            kl_divergence = tf.reduce_mean(input_tensor=tf.concat(
                values=kl_divergences, axis=1),
                                           axis=1)
            self.kl_divergence = tf.reduce_mean(input_tensor=kl_divergence,
                                                axis=0)

            entropy = tf.reduce_mean(input_tensor=tf.concat(values=entropies,
                                                            axis=1),
                                     axis=1)
            self.entropy = tf.reduce_mean(input_tensor=entropy, axis=0)

    def set_session(self, session):
        super(TRPOModel, self).set_session(session)
        self.flat_variable_helper.session = session

    def update(self, batch):
        """
        Compute the update for one batch of experiences using generalised advantage
        estimation and constrained optimisation based on the fixed KL-divergence constraint.

        :param batch:
        :return:
        """
        super(TRPOModel, self).update(batch)

        assert 'policy_gradient' not in self.distribution_tensors
        fetches = dict(policy_gradient=self.policy_gradient)
        fetches.update(self.distribution_tensors)

        self.feed_dict = {
            state: batch['states'][name]
            for name, state in self.state.items()
        }
        self.feed_dict.update({
            action: batch['actions'][name]
            for name, action in self.action.items()
        })
        self.feed_dict[self.reward] = batch['rewards']
        self.feed_dict[self.terminal] = batch['terminals']
        self.feed_dict.update({
            internal: batch['internals'][n]
            for n, internal in enumerate(self.internal_inputs)
        })

        prev_distribution_tensors = self.session.run(
            fetches=fetches, feed_dict=self.feed_dict)  # dL
        gradient = prev_distribution_tensors.pop('policy_gradient')

        if np.allclose(gradient, np.zeros_like(gradient)):
            self.logger.debug('Gradient zero, skipping update.')
            return

        # The details of the approximations used here to solve the constrained
        # optimisation can be found in Appendix C of the TRPO paper
        # Note that no subsampling is used, which would improve computational performance
        search_direction = self.cg_optimizer.solve(
            self.compute_fvp, -gradient)  # x = ddKL(=F)^(-1) * -dL

        # Search direction has now been approximated as cg-solution s= A^-1g where A is
        # Fisher matrix, which is a local approximation of the
        # KL divergence constraint
        shs = 0.5 * search_direction.dot(self.compute_fvp(
            search_direction))  # (c lambda^2) = 0.5 * xT * F * x
        if shs < 0:
            self.logger.debug(
                'Computing search direction failed, skipping update.')
            return

        lagrange_multiplier = max(np.sqrt(shs / self.max_kl_divergence),
                                  util.epsilon)
        natural_gradient_step = search_direction / lagrange_multiplier  # c
        negative_gradient_direction = -gradient.dot(
            search_direction)  # -dL * x
        estimated_improvement = negative_gradient_direction / lagrange_multiplier

        # Improve update step through simple backtracking line search
        # N.b. some implementations skip the line search
        parameters = self.flat_variable_helper.get()
        new_parameters = self.line_search(
            rewards=batch['rewards'],
            parameters=parameters,
            natural_gradient_step=natural_gradient_step,
            estimated_improvement=estimated_improvement)

        # Use line search results, otherwise take full step
        # N.B. some implementations don't use the line search
        if new_parameters is not None:
            self.logger.debug('Updating with line search result.')
            self.flat_variable_helper.set(new_parameters)
        elif self.ls_override:
            self.logger.debug('Updating with full step.')
            self.flat_variable_helper.set(parameters + natural_gradient_step)
        else:
            self.logger.debug(
                'Failed to find line search solution, skipping update.')
            self.flat_variable_helper.set(parameters)

        # Get loss values for progress monitoring
        fetches = (self.surrogate_loss, self.kl_divergence, self.entropy,
                   self.loss_per_instance)
        prev_distribution_tensors = {
            placeholder: tensor
            for name, placeholders in self.prev_distribution_tensors.items()
            for placeholder, tensor in zip(placeholders,
                                           prev_distribution_tensors[name])
        }
        self.feed_dict.update(prev_distribution_tensors)

        surrogate_loss, kl_divergence, entropy, loss_per_instance = self.session.run(
            fetches=fetches, feed_dict=self.feed_dict)

        # Sanity checks. Is entropy decreasing? Is KL divergence within reason? Is loss non-zero?
        self.logger.debug('Surrogate loss = {}'.format(surrogate_loss))
        self.logger.debug(
            'KL-divergence after update = {}'.format(kl_divergence))
        self.logger.debug('Entropy = {}'.format(entropy))

        return (surrogate_loss, kl_divergence, entropy), loss_per_instance

    def compute_fvp(self, p):
        self.feed_dict[self.tangent] = p
        return self.session.run(self.fisher_vector_product,
                                self.feed_dict) + p * self.cg_damping

    def compute_log_prob(self, theta):
        self.flat_variable_helper.set(theta)
        return self.session.run(self.log_prob, self.feed_dict)

    def line_search(self, rewards, parameters, natural_gradient_step,
                    estimated_improvement):
        """
        Line search for TRPO: a full step is taken first and then backtracked to
        find an acceptable step size.

        :param rewards:
        :param parameters:
        :param natural_gradient_step:
        :param estimated_improvement:

        :return:
        """

        log_prob = self.compute_log_prob(parameters)
        old_value = sum(rewards) / len(rewards)
        estimated_improvement = max(estimated_improvement, util.epsilon)

        step_fraction = 1.0
        for backtrack in range(self.ls_max_backtracks):
            new_parameters = parameters + step_fraction * natural_gradient_step
            new_log_prob = self.compute_log_prob(new_parameters)

            prob_ratio = np.exp(new_log_prob - log_prob)
            new_value = prob_ratio.dot(rewards) / prob_ratio.shape[0]

            improvement_ratio = (new_value - old_value) / estimated_improvement
            if improvement_ratio > self.ls_accept_ratio:
                self.logger.debug(
                    'Line search successful after {} backtracking steps.'.
                    format(backtrack))
                return new_parameters

            step_fraction /= 2.0
            estimated_improvement /= 2.0

        return None
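
The step-size rule used in update() above can be checked in isolation: with s the conjugate-gradient solution, shs = 0.5 * s^T F s and lagrange_multiplier = sqrt(shs / max_kl_divergence), the scaled step s / lagrange_multiplier satisfies 0.5 * step^T F step = max_kl_divergence under the quadratic model of the KL divergence. The snippet below verifies this algebra with a synthetic positive-definite matrix standing in for the Fisher matrix; it is purely illustrative and independent of the model code.

import numpy as np

rng = np.random.RandomState(0)
a = rng.randn(5, 5)
fisher = a.dot(a.T) + 5.0 * np.eye(5)   # symmetric positive-definite stand-in for F
gradient = rng.randn(5)                 # stand-in for the surrogate-loss gradient dL
delta = 0.1                             # plays the role of max_kl_divergence

search_direction = np.linalg.solve(fisher, -gradient)           # s = F^-1 * (-dL)
shs = 0.5 * search_direction.dot(fisher.dot(search_direction))  # 0.5 * s^T F s
lagrange_multiplier = np.sqrt(shs / delta)
step = search_direction / lagrange_multiplier

# 0.5 * step^T F step equals delta up to floating-point error.
assert np.isclose(0.5 * step.dot(fisher.dot(step)), delta)
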
Example 4
class TRPOModel(PolicyGradientModel):

    allows_discrete_actions = True
    allows_continuous_actions = True

    default_config = dict(
        optimizer=None,
        override_line_search=False,
        cg_damping=0.001,
        line_search_steps=20,
        max_kl_divergence=0.001,
        cg_iterations=20
    )

    def __init__(self, config):
        config.default(TRPOModel.default_config)
        super(TRPOModel, self).__init__(config)

        self.override_line_search = config.override_line_search
        self.cg_damping = config.cg_damping
        self.max_kl_divergence = config.max_kl_divergence
        self.line_search_steps = config.line_search_steps

    def create_tf_operations(self, config):
        """
        Creates TRPO training operations, i.e. the natural gradient update step
        based on the KL divergence constraint between new and old policy.
        :return:
        """
        super(TRPOModel, self).create_tf_operations(config)

        with tf.variable_scope('update'):
            losses = list()
            for name, action in config.actions:
                distribution = self.distribution[name]
                previous_distribution = tuple(tf.placeholder(dtype=tf.float32, shape=util.shape(x, unknown=None)) for x in distribution)
                self.internal_inputs.extend(previous_distribution)
                self.internal_outputs.extend(distribution)
                if sum(1 for _ in distribution) == 2:
                    for n, x in enumerate(distribution):
                        if n == 0:
                            self.internal_inits.append(np.zeros(shape=util.shape(x)[1:]))
                        else:
                            self.internal_inits.append(np.ones(shape=util.shape(x)[1:]))
                else:
                    self.internal_inits.extend(np.zeros(shape=util.shape(x)[1:]) for x in distribution)
                previous_distribution = self.distribution[name].__class__(distribution=previous_distribution)

                log_prob = distribution.log_probability(action=self.action[name])
                previous_log_prob = previous_distribution.log_probability(action=self.action[name])
                prob_ratio = tf.minimum(tf.exp(log_prob - previous_log_prob), 1000)

                self.loss_per_instance = tf.multiply(x=prob_ratio, y=self.reward)
                surrogate_loss = -tf.reduce_mean(self.loss_per_instance, axis=0)
                kl_divergence = distribution.kl_divergence(previous_distribution)
                entropy = distribution.entropy()
                losses.append((surrogate_loss, kl_divergence, entropy))

            self.losses = [tf.reduce_mean(loss) for loss in zip(*losses)]

            # Get symbolic gradient expressions
            variables = list(tf.trainable_variables())  # TODO: ideally not value function (see also for "gradients" below)
            gradients = tf.gradients(self.losses, variables)
            variables = [var for var, grad in zip(variables, gradients) if grad is not None]
            gradients = [grad for grad in gradients if grad is not None]
            self.policy_gradient = tf.concat(values=[tf.reshape(grad, (-1,)) for grad in gradients], axis=0)  # util.prod(util.shape(v))

            fixed_distribution = distribution.__class__([tf.stop_gradient(x) for x in distribution])
            fixed_kl_divergence = fixed_distribution.kl_divergence(distribution)

            self.tangent = tf.placeholder(tf.float32, shape=(None,))
            offset = 0
            tangents = []
            for variable in variables:
                shape = util.shape(variable)
                size = util.prod(shape)
                tangents.append(tf.reshape(self.tangent[offset:offset + size], shape))
                offset += size

            gradients = tf.gradients(fixed_kl_divergence, variables)
            gradient_vector_product = [tf.reduce_sum(g * t) for (g, t) in zip(gradients, tangents)]

            self.flat_variable_helper = FlatVarHelper(variables)
            gradients = tf.gradients(gradient_vector_product, variables)
            self.fisher_vector_product = tf.concat(values=[tf.reshape(grad, (-1,)) for grad in gradients], axis=0)

            self.cg_optimizer = ConjugateGradientOptimizer(self.logger, config.cg_iterations)

    def set_session(self, session):
        super(TRPOModel, self).set_session(session)
        self.flat_variable_helper.session = session

    def update(self, batch):
        """
        Compute the update for one batch of experiences using generalised advantage
        estimation and constrained optimisation based on the fixed KL-divergence constraint.

        :param batch:
        :return:
        """
        super(TRPOModel, self).update(batch)

        self.feed_dict = {state: batch['states'][name] for name, state in self.state.items()}
        self.feed_dict.update({action: batch['actions'][name] for name, action in self.action.items()})
        self.feed_dict[self.reward] = batch['rewards']
        self.feed_dict[self.terminal] = batch['terminals']
        self.feed_dict.update({internal: batch['internals'][n] for n, internal in enumerate(self.internal_inputs)})

        gradient = self.session.run(self.policy_gradient, self.feed_dict)

        if np.allclose(gradient, np.zeros_like(gradient)):
            self.logger.debug('Gradient zero, skipping update.')
            return

        # The details of the approximations used here to solve the constrained
        # optimisation can be found in Appendix C of the TRPO paper
        # Note that no subsampling is used, which would improve computational performance
        search_direction = self.cg_optimizer.solve(self.compute_fvp, -gradient)

        # Search direction has now been approximated as cg-solution s= A^-1g where A is
        # Fisher matrix, which is a local approximation of the
        # KL divergence constraint
        shs = 0.5 * search_direction.dot(self.compute_fvp(search_direction))
        if shs < 0:
            self.logger.debug('Computing search direction failed, skipping update.')
            return

        lagrange_multiplier = np.sqrt(shs / self.max_kl_divergence)
        update_step = search_direction / (lagrange_multiplier + util.epsilon)
        negative_gradient_direction = -gradient.dot(search_direction)

        # Improve update step through simple backtracking line search
        # N.b. some implementations skip the line search
        previous_theta = self.flat_variable_helper.get()
        improved, theta = line_search(self.compute_surrogate_loss, previous_theta, update_step, negative_gradient_direction / (lagrange_multiplier + util.epsilon), self.line_search_steps)

        # Use line search results, otherwise take full step
        # N.B. some implementations don't use the line search
        if improved:
            self.logger.debug('Updating with line search result.')
            self.flat_variable_helper.set(theta)
        elif self.override_line_search:
            self.logger.debug('Updating with full step.')
            self.flat_variable_helper.set(previous_theta + update_step)
        else:
            self.logger.debug('Failed to find line search solution, skipping update.')

        # Get loss values for progress monitoring
        surrogate_loss, kl_divergence, entropy, loss_per_instance = self.session.run(self.losses + [self.loss_per_instance], self.feed_dict)

        # Sanity checks. Is entropy decreasing? Is KL divergence within reason? Is loss non-zero?
        self.logger.debug('Surrogate loss = ' + str(surrogate_loss))
        self.logger.debug('KL-divergence after update = ' + str(kl_divergence))
        self.logger.debug('Entropy = ' + str(entropy))
        return (surrogate_loss, kl_divergence, entropy), loss_per_instance

    def compute_fvp(self, p):
        self.feed_dict[self.tangent] = p

        return self.session.run(self.fisher_vector_product, self.feed_dict) + p * self.cg_damping

    def compute_surrogate_loss(self, theta):
        self.flat_variable_helper.set(theta)

        # Losses[0] = surrogate_loss
        return self.session.run(self.losses[0], self.feed_dict)
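
This variant calls a module-level line_search(...) that is not shown in the snippet (unlike the previous example, which defines line search as a method). A plausible minimal sketch is given below, assuming the usual backtracking scheme: evaluate the surrogate loss at progressively halved fractions of the full step and accept the first point whose actual improvement is a sufficient fraction of the linearly expected improvement. The signature mirrors the call sites here and in the later examples (which additionally pass an accept ratio); the default values are assumptions.

def line_search(f, theta, full_step, expected_improvement,
                max_backtracks=10, accept_ratio=0.1):
    # f: callable returning the scalar surrogate loss at a flat parameter vector.
    loss = f(theta)
    for exponent in range(max_backtracks):
        step_fraction = 0.5 ** exponent
        new_theta = theta + step_fraction * full_step
        new_loss = f(new_theta)
        actual_improvement = loss - new_loss            # the loss is being minimised
        expected = expected_improvement * step_fraction
        if expected > 0 and actual_improvement / expected > accept_ratio:
            return True, new_theta
        # otherwise try a smaller fraction of the full step
    return False, theta

Returning (False, theta) when no fraction is accepted matches how update() then falls back to the override flag or skips the update.
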
Example 5
    def create_tf_operations(self, config):
        """
        Creates TRPO training operations, i.e. the natural gradient update step
        based on the KL divergence constraint between new and old policy.
        :return:
        """
        super(TRPOModel, self).create_tf_operations(config)

        with tf.variable_scope('update'):
            losses = list()
            for name, action in config.actions:
                distribution = self.distribution[name]
                previous_distribution = tuple(
                    tf.placeholder(dtype=tf.float32,
                                   shape=util.shape(x, unknown=None))
                    for x in distribution)
                self.internal_inputs.extend(previous_distribution)
                self.internal_outputs.extend(distribution)
                if sum(1 for _ in distribution) == 2:
                    for n, x in enumerate(distribution):
                        if n == 0:
                            self.internal_inits.append(
                                np.zeros(shape=util.shape(x)[1:]))
                        else:
                            self.internal_inits.append(
                                np.ones(shape=util.shape(x)[1:]))
                else:
                    self.internal_inits.extend(
                        np.zeros(shape=util.shape(x)[1:])
                        for x in distribution)
                previous_distribution = self.distribution[name].__class__(
                    distribution=previous_distribution)

                log_prob = distribution.log_probability(
                    action=self.action[name])
                previous_log_prob = previous_distribution.log_probability(
                    action=self.action[name])
                prob_ratio = tf.minimum(tf.exp(log_prob - previous_log_prob),
                                        1000)

                self.loss_per_instance = tf.multiply(x=prob_ratio,
                                                     y=self.reward)
                surrogate_loss = -tf.reduce_mean(self.loss_per_instance,
                                                 axis=0)
                kl_divergence = distribution.kl_divergence(
                    previous_distribution)
                entropy = distribution.entropy()
                losses.append((surrogate_loss, kl_divergence, entropy))

            self.losses = [tf.reduce_mean(loss) for loss in zip(*losses)]

            # Get symbolic gradient expressions
            variables = list(tf.trainable_variables())
            gradients = tf.gradients(self.losses, variables)
            self.policy_gradient = tf.concat(
                values=[tf.reshape(grad, (-1, )) for grad in gradients],
                axis=0)  # util.prod(util.shape(v))

            fixed_distribution = distribution.__class__(
                [tf.stop_gradient(x) for x in distribution])
            fixed_kl_divergence = fixed_distribution.kl_divergence(
                distribution)

            self.tangent = tf.placeholder(tf.float32, shape=(None, ))
            offset = 0
            tangents = []
            for variable in variables:
                shape = util.shape(variable)
                size = util.prod(shape)
                tangents.append(
                    tf.reshape(self.tangent[offset:offset + size], shape))
                offset += size

            gradients = tf.gradients(fixed_kl_divergence, variables)
            gradient_vector_product = [
                tf.reduce_sum(g * t) for (g, t) in zip(gradients, tangents)
            ]

            self.flat_variable_helper = FlatVarHelper(variables)
            gradients = tf.gradients(gradient_vector_product, variables)
            self.fisher_vector_product = tf.concat(
                values=[tf.reshape(grad, (-1, )) for grad in gradients],
                axis=0)

            self.cg_optimizer = ConjugateGradientOptimizer(
                self.logger, config.cg_iterations)
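
The gradient_vector_product / fisher_vector_product construction above is the usual double-backprop trick: since the Hessian of KL(pi_fixed || pi_theta) evaluated at theta = theta_fixed equals the Fisher information matrix, differentiating the fixed KL once, contracting the gradient with a tangent vector, and differentiating the resulting scalar again yields F p without ever materialising F. A tiny self-contained TensorFlow 1.x illustration follows, with a toy quadratic in place of the KL term so the expected answer is known; all names here are illustrative.

import numpy as np
import tensorflow as tf

theta = tf.Variable(np.array([1.0, 2.0, 3.0], dtype=np.float32))
coefficients = tf.constant([1.0, 2.0, 3.0])
kl = tf.reduce_sum(coefficients * theta * theta)   # toy scalar; its Hessian is diag(2, 4, 6)

tangent = tf.placeholder(dtype=tf.float32, shape=(3,))
kl_gradient = tf.gradients(kl, [theta])[0]                          # first derivative
gradient_vector_product = tf.reduce_sum(kl_gradient * tangent)      # (dKL/dtheta) . p
hessian_vector_product = tf.gradients(gradient_vector_product, [theta])[0]  # H p

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    print(session.run(hessian_vector_product,
                      feed_dict={tangent: np.ones(3, dtype=np.float32)}))
    # -> [2. 4. 6.], i.e. the Hessian-vector product for p = (1, 1, 1), without forming H
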
Example 6
class TRPOModel(PolicyGradientModel):

    allows_discrete_actions = True
    allows_continuous_actions = True

    default_config = dict(optimizer=None,
                          learning_rate=None,
                          cg_damping=0.001,
                          line_search_steps=20,
                          max_kl_divergence=0.001,
                          cg_iterations=20)

    def __init__(self, config):
        config.default(TRPOModel.default_config)
        super(TRPOModel, self).__init__(config)

        self.cg_damping = config.cg_damping
        self.max_kl_divergence = config.max_kl_divergence
        self.line_search_steps = config.line_search_steps

    def create_tf_operations(self, config):
        """
        Creates TRPO training operations, i.e. the natural gradient update step
        based on the KL divergence constraint between new and old policy.
        :return:
        """
        super(TRPOModel, self).create_tf_operations(config)

        with tf.variable_scope('update'):
            losses = list()
            for name, action in config.actions:
                distribution = self.distribution[name]
                previous_distribution = tuple(
                    tf.placeholder(dtype=tf.float32,
                                   shape=util.shape(x, unknown=None))
                    for x in distribution)
                self.internal_inputs.extend(previous_distribution)
                self.internal_outputs.extend(distribution)
                if sum(1 for _ in distribution) == 2:
                    for n, x in enumerate(distribution):
                        if n == 0:
                            self.internal_inits.append(
                                np.zeros(shape=util.shape(x)[1:]))
                        else:
                            self.internal_inits.append(
                                np.ones(shape=util.shape(x)[1:]))
                else:
                    self.internal_inits.extend(
                        np.zeros(shape=util.shape(x)[1:])
                        for x in distribution)
                previous_distribution = self.distribution[name].__class__(
                    distribution=previous_distribution)

                log_prob = distribution.log_probability(
                    action=self.action[name])
                previous_log_prob = previous_distribution.log_probability(
                    action=self.action[name])
                prob_ratio = tf.minimum(tf.exp(log_prob - previous_log_prob),
                                        1000)

                self.loss_per_instance = tf.multiply(x=prob_ratio,
                                                     y=self.reward)
                surrogate_loss = -tf.reduce_mean(self.loss_per_instance,
                                                 axis=0)
                kl_divergence = distribution.kl_divergence(
                    previous_distribution)
                entropy = distribution.entropy()
                losses.append((surrogate_loss, kl_divergence, entropy))

            self.losses = [tf.reduce_mean(loss) for loss in zip(*losses)]

            # Get symbolic gradient expressions
            variables = list(tf.trainable_variables())
            gradients = tf.gradients(self.losses, variables)
            self.policy_gradient = tf.concat(
                values=[tf.reshape(grad, (-1, )) for grad in gradients],
                axis=0)  # util.prod(util.shape(v))

            fixed_distribution = distribution.__class__(
                [tf.stop_gradient(x) for x in distribution])
            fixed_kl_divergence = fixed_distribution.kl_divergence(
                distribution)

            self.tangent = tf.placeholder(tf.float32, shape=(None, ))
            offset = 0
            tangents = []
            for variable in variables:
                shape = util.shape(variable)
                size = util.prod(shape)
                tangents.append(
                    tf.reshape(self.tangent[offset:offset + size], shape))
                offset += size

            gradients = tf.gradients(fixed_kl_divergence, variables)
            gradient_vector_product = [
                tf.reduce_sum(g * t) for (g, t) in zip(gradients, tangents)
            ]

            self.flat_variable_helper = FlatVarHelper(variables)
            gradients = tf.gradients(gradient_vector_product, variables)
            self.fisher_vector_product = tf.concat(
                values=[tf.reshape(grad, (-1, )) for grad in gradients],
                axis=0)

            self.cg_optimizer = ConjugateGradientOptimizer(
                self.logger, config.cg_iterations)

    def set_session(self, session):
        super(TRPOModel, self).set_session(session)
        self.flat_variable_helper.session = session

    def update(self, batch):
        """
        Compute the update for one batch of experiences using generalised advantage
        estimation and constrained optimisation based on the fixed KL-divergence constraint.

        :param batch:
        :return:
        """
        self.feed_dict = {
            state: batch['states'][name]
            for name, state in self.state.items()
        }
        self.feed_dict.update({
            action: batch['actions'][name]
            for name, action in self.action.items()
        })
        self.feed_dict[self.reward] = batch['rewards']
        self.feed_dict[self.terminal] = batch['terminals']
        self.feed_dict.update({
            internal: batch['internals'][n]
            for n, internal in enumerate(self.internal_inputs)
        })

        gradient = self.session.run(self.policy_gradient, self.feed_dict)

        if np.allclose(gradient, np.zeros_like(gradient)):
            self.logger.debug('Gradient zero, skipping update')
            return

        # The details of the approximations used here to solve the constrained
        # optimisation can be found in Appendix C of the TRPO paper
        # Note that no subsampling is used, which would improve computational performance
        search_direction = self.cg_optimizer.solve(self.compute_fvp, -gradient)

        # Search direction has now been approximated as cg-solution s= A^-1g where A is
        # Fisher matrix, which is a local approximation of the
        # KL divergence constraint
        shs = 0.5 * search_direction.dot(self.compute_fvp(search_direction))
        lagrange_multiplier = np.sqrt(shs / self.max_kl_divergence)
        update_step = search_direction / (lagrange_multiplier + util.epsilon)
        negative_gradient_direction = -gradient.dot(search_direction)

        # Improve update step through simple backtracking line search
        # N.b. some implementations skip the line search
        previous_theta = self.flat_variable_helper.get()
        improved, theta = line_search(
            self.compute_surrogate_loss, previous_theta, update_step,
            negative_gradient_direction / (lagrange_multiplier + util.epsilon),
            self.line_search_steps)

        # Use line search results, otherwise take full step
        # N.B. some implementations don't use the line search
        if improved:
            self.logger.debug('Updating with line search result.')
            self.flat_variable_helper.set(theta)
        else:
            self.logger.debug('Updating with full step.')
            self.flat_variable_helper.set(previous_theta + update_step)

        # Get loss values for progress monitoring
        surrogate_loss, kl_divergence, entropy, loss_per_instance = self.session.run(
            self.losses + [self.loss_per_instance], self.feed_dict)

        # Sanity checks. Is entropy decreasing? Is KL divergence within reason? Is loss non-zero?
        self.logger.debug('Surrogate loss = ' + str(surrogate_loss))
        self.logger.debug('KL-divergence after update = ' + str(kl_divergence))
        self.logger.debug('Entropy = ' + str(entropy))
        return (surrogate_loss, kl_divergence, entropy), loss_per_instance

    def compute_fvp(self, p):
        self.feed_dict[self.tangent] = p

        return self.session.run(self.fisher_vector_product,
                                self.feed_dict) + p * self.cg_damping

    def compute_surrogate_loss(self, theta):
        self.flat_variable_helper.set(theta)

        # Losses[0] = surrogate_loss
        return self.session.run(self.losses[0], self.feed_dict)
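
One detail of compute_fvp above worth spelling out: the value returned is the Fisher-vector product plus p * self.cg_damping, so the conjugate-gradient call effectively solves a damped, better-conditioned system rather than the raw Fisher system. In symbols, with lambda denoting cg_damping and g the surrogate-loss gradient:

\mathrm{compute\_fvp}(p) = (F + \lambda I)\, p
\quad\Longrightarrow\quad
\mathrm{search\_direction} \;\approx\; (F + \lambda I)^{-1}\,(-g).
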
Example 7
class TRPOModel(PolicyGradientModel):

    allows_discrete_actions = True
    allows_continuous_actions = True

    default_config = dict(optimizer=None,
                          max_kl_divergence=0.001,
                          cg_iterations=20,
                          cg_damping=0.001,
                          ls_max_backtracks=20,
                          ls_accept_ratio=0.01,
                          ls_override=False)

    def __init__(self, config):
        config.default(TRPOModel.default_config)
        super(TRPOModel, self).__init__(config)
        self.max_kl_divergence = config.max_kl_divergence
        self.cg_damping = config.cg_damping
        self.ls_max_backtracks = config.ls_max_backtracks
        self.ls_accept_ratio = config.ls_accept_ratio
        self.ls_override = config.ls_override

    def create_tf_operations(self, config):
        """
        Creates TRPO training operations, i.e. the natural gradient update step
        based on the KL divergence constraint between new and old policy.
        :return:
        """
        super(TRPOModel, self).create_tf_operations(config)

        with tf.variable_scope('update'):
            prob_ratios = list()
            kl_divergences = list()
            entropies = list()
            fixed_kl_divergences = list()

            for name, action in self.action.items():
                distribution = self.distribution[name]
                prev_distribution = tuple(
                    tf.placeholder(dtype=tf.float32,
                                   shape=util.shape(x, unknown=None))
                    for x in distribution)
                self.internal_inputs.extend(prev_distribution)
                self.internal_outputs.extend(distribution)
                self.internal_inits.extend(
                    np.zeros(shape=util.shape(x)[1:]) for x in distribution)
                prev_distribution = distribution.from_tensors(
                    parameters=prev_distribution,
                    deterministic=self.deterministic)

                shape_size = util.prod(config.actions[name].shape)

                log_prob = distribution.log_probability(action=action)
                prev_log_prob = prev_distribution.log_probability(
                    action=action)
                log_prob_diff = tf.minimum(x=(log_prob - prev_log_prob),
                                           y=10.0)
                prob_ratio = tf.exp(x=log_prob_diff)
                prob_ratio = tf.reshape(tensor=prob_ratio,
                                        shape=(-1, shape_size))
                prob_ratios.append(prob_ratio)

                kl_divergence = distribution.kl_divergence(
                    other=prev_distribution)
                kl_divergence = tf.reshape(tensor=kl_divergence,
                                           shape=(-1, shape_size))
                kl_divergences.append(kl_divergence)

                entropy = distribution.entropy()
                entropy = tf.reshape(tensor=entropy, shape=(-1, shape_size))
                entropies.append(entropy)

                fixed_distribution = distribution.__class__.from_tensors(
                    parameters=[tf.stop_gradient(x) for x in distribution],
                    deterministic=self.deterministic)
                fixed_kl_divergence = fixed_distribution.kl_divergence(
                    distribution)
                fixed_kl_divergence = tf.reshape(tensor=fixed_kl_divergence,
                                                 shape=(-1, shape_size))
                fixed_kl_divergences.append(fixed_kl_divergence)

            prob_ratio = tf.reduce_mean(input_tensor=tf.concat(
                values=prob_ratios, axis=1),
                                        axis=1)
            self.loss_per_instance = -prob_ratio * self.reward
            surrogate_loss = tf.reduce_mean(
                input_tensor=self.loss_per_instance, axis=0)

            kl_divergence = tf.reduce_mean(input_tensor=tf.concat(
                values=kl_divergences, axis=1),
                                           axis=1)
            kl_divergence = tf.reduce_mean(input_tensor=kl_divergence, axis=0)
            entropy = tf.reduce_mean(input_tensor=tf.concat(values=entropies,
                                                            axis=1),
                                     axis=1)
            entropy = tf.reduce_mean(input_tensor=entropy, axis=0)
            self.losses = (surrogate_loss, kl_divergence, entropy,
                           self.loss_per_instance)

            fixed_kl_divergence = tf.reduce_mean(input_tensor=tf.concat(
                values=fixed_kl_divergences, axis=1),
                                                 axis=1)

            # Get symbolic gradient expressions
            variables = list(
                tf.trainable_variables()
            )  # TODO: ideally not value function (see also for "gradients" below)
            gradients = tf.gradients(self.losses[0], variables)
            variables = [
                var for var, grad in zip(variables, gradients)
                if grad is not None
            ]
            gradients = [grad for grad in gradients if grad is not None]
            self.policy_gradient = tf.concat(
                values=[tf.reshape(grad, (-1, )) for grad in gradients],
                axis=0)  # util.prod(util.shape(v))

            self.tangent = tf.placeholder(tf.float32, shape=(None, ))
            offset = 0
            tangents = []
            for variable in variables:
                shape = util.shape(variable)
                size = util.prod(shape)
                tangents.append(
                    tf.reshape(self.tangent[offset:offset + size], shape))
                offset += size

            gradients = tf.gradients(fixed_kl_divergence, variables)
            gradient_vector_product = [
                tf.reduce_sum(g * t) for (g, t) in zip(gradients, tangents)
            ]

            self.flat_variable_helper = FlatVarHelper(variables)
            gradients = tf.gradients(gradient_vector_product, variables)
            self.fisher_vector_product = tf.concat(
                values=[tf.reshape(grad, (-1, )) for grad in gradients],
                axis=0)

            self.cg_optimizer = ConjugateGradientOptimizer(
                self.logger, config.cg_iterations)

    def set_session(self, session):
        super(TRPOModel, self).set_session(session)
        self.flat_variable_helper.session = session

    def update(self, batch):
        """
        Compute the update for one batch of experiences using generalised advantage
        estimation and constrained optimisation based on the fixed KL-divergence constraint.

        :param batch:
        :return:
        """
        super(TRPOModel, self).update(batch)

        self.feed_dict = {
            state: batch['states'][name]
            for name, state in self.state.items()
        }
        self.feed_dict.update({
            action: batch['actions'][name]
            for name, action in self.action.items()
        })
        self.feed_dict[self.reward] = batch['rewards']
        self.feed_dict[self.terminal] = batch['terminals']
        self.feed_dict.update({
            internal: batch['internals'][n]
            for n, internal in enumerate(self.internal_inputs)
        })

        gradient = self.session.run(self.policy_gradient, self.feed_dict)  # dL

        if np.allclose(gradient, np.zeros_like(gradient)):
            self.logger.debug('Gradient zero, skipping update.')
            return

        # The details of the approximations used here to solve the constrained
        # optimisation can be found in Appendix C of the TRPO paper
        # Note that no subsampling is used, which would improve computational performance
        search_direction = self.cg_optimizer.solve(
            self.compute_fvp, -gradient)  # x = ddKL(=F)^(-1) * -dL

        # Search direction has now been approximated as cg-solution s= A^-1g where A is
        # Fisher matrix, which is a local approximation of the
        # KL divergence constraint
        shs = 0.5 * search_direction.dot(self.compute_fvp(
            search_direction))  # (c lambda^2) = 0.5 * xT * F * x
        if shs < 0:
            self.logger.debug(
                'Computing search direction failed, skipping update.')
            return

        lagrange_multiplier = np.sqrt(shs / self.max_kl_divergence)
        update_step = search_direction / (lagrange_multiplier + util.epsilon)  # c
        negative_gradient_direction = -gradient.dot(
            search_direction)  # -dL * x

        # Improve update step through simple backtracking line search
        # N.b. some implementations skip the line search
        previous_theta = self.flat_variable_helper.get()
        improved, theta = line_search(
            self.compute_surrogate_loss, previous_theta, update_step,
            negative_gradient_direction / (lagrange_multiplier + util.epsilon),
            self.ls_max_backtracks, self.ls_accept_ratio)

        # Use line search results, otherwise take full step
        # N.B. some implementations don't use the line search
        if improved:
            self.logger.debug('Updating with line search result.')
            self.flat_variable_helper.set(theta)
        elif self.ls_override:
            self.logger.debug('Updating with full step.')
            self.flat_variable_helper.set(previous_theta + update_step)
        else:
            self.logger.debug(
                'Failed to find line search solution, skipping update.')

        # Get loss values for progress monitoring
        surrogate_loss, kl_divergence, entropy, loss_per_instance = self.session.run(
            self.losses, self.feed_dict)

        # Sanity checks. Is entropy decreasing? Is KL divergence within reason? Is loss non-zero?
        self.logger.debug('Surrogate loss = {}'.format(surrogate_loss))
        self.logger.debug(
            'KL-divergence after update = {}'.format(kl_divergence))
        self.logger.debug('Entropy = {}'.format(entropy))

        return (surrogate_loss, kl_divergence, entropy), loss_per_instance

    def compute_fvp(self, p):
        self.feed_dict[self.tangent] = p

        return self.session.run(self.fisher_vector_product,
                                self.feed_dict) + p * self.cg_damping

    def compute_surrogate_loss(self, theta):
        self.flat_variable_helper.set(theta)

        # Losses[0] = surrogate_loss
        return self.session.run(self.losses[0], self.feed_dict)

    def create_tf_operations(self, config):
        """
        Creates TRPO training operations, i.e. the natural gradient update step
        based on the KL divergence constraint between new and old policy.
        :return:
        """
        super(TRPOModel, self).create_tf_operations(config)

        with tf.variable_scope('update'):
            prob_ratios = list()
            kl_divergences = list()
            entropies = list()
            fixed_kl_divergences = list()
            for name, action in self.action.items():
                distribution = self.distribution[name]
                prev_distribution = tuple(
                    tf.placeholder(dtype=tf.float32,
                                   shape=util.shape(x, unknown=None))
                    for x in distribution)
                self.internal_inputs.extend(prev_distribution)
                self.internal_outputs.extend(distribution)
                self.internal_inits.extend(
                    np.zeros(shape=util.shape(x)[1:]) for x in distribution)
                prev_distribution = distribution.from_tensors(
                    parameters=prev_distribution,
                    deterministic=self.deterministic)

                log_prob = distribution.log_probability(action=action)
                prev_log_prob = prev_distribution.log_probability(
                    action=action)
                log_prob_diff = tf.minimum(x=(log_prob - prev_log_prob),
                                           y=10.0)
                prob_ratio = tf.exp(x=log_prob_diff)

                kl_divergence = distribution.kl_divergence(
                    other=prev_distribution)
                entropy = distribution.entropy()

                fixed_distribution = distribution.__class__.from_tensors(
                    parameters=[tf.stop_gradient(x) for x in distribution],
                    deterministic=self.deterministic)
                fixed_kl_divergence = fixed_distribution.kl_divergence(
                    distribution)

                prs_list = [prob_ratio]
                kds_list = [kl_divergence]
                es_list = [entropy]
                fkds_list = [fixed_kl_divergence]
                for _ in range(len(config.actions[name].shape)):
                    prs_list = [
                        pr for prs in prs_list
                        for pr in tf.unstack(value=prs, axis=1)
                    ]
                    kds_list = [
                        kd for kds in kds_list
                        for kd in tf.unstack(value=kds, axis=1)
                    ]
                    es_list = [
                        e for es in es_list
                        for e in tf.unstack(value=es, axis=1)
                    ]
                    fkds_list = [
                        fkd for fkds in fkds_list
                        for fkd in tf.unstack(value=fkds, axis=1)
                    ]
                prob_ratios.extend(prs_list)
                kl_divergences.extend(kds_list)
                entropies.extend(es_list)
                fixed_kl_divergences.extend(fkds_list)

            prob_ratio = tf.add_n(inputs=prob_ratios) / len(prob_ratios)
            self.loss_per_instance = -prob_ratio * self.reward
            surrogate_loss = tf.reduce_mean(
                input_tensor=self.loss_per_instance, axis=0)

            kl_divergence = tf.reduce_mean(
                input_tensor=(tf.add_n(inputs=kl_divergences) /
                              len(kl_divergences)),
                axis=0)
            entropy = tf.reduce_mean(input_tensor=(tf.add_n(inputs=entropies) /
                                                   len(entropies)),
                                     axis=0)
            self.losses = (surrogate_loss, kl_divergence, entropy,
                           self.loss_per_instance)

            fixed_kl_divergence = tf.add_n(
                inputs=fixed_kl_divergences) / len(fixed_kl_divergences)

            # Get symbolic gradient expressions
            variables = list(
                tf.trainable_variables()
            )  # TODO: ideally not value function (see also for "gradients" below)
            gradients = tf.gradients(self.losses[0], variables)
            variables = [
                var for var, grad in zip(variables, gradients)
                if grad is not None
            ]
            gradients = [grad for grad in gradients if grad is not None]
            self.policy_gradient = tf.concat(
                values=[tf.reshape(grad, (-1, )) for grad in gradients],
                axis=0)  # util.prod(util.shape(v))

            self.tangent = tf.placeholder(tf.float32, shape=(None, ))
            offset = 0
            tangents = []
            for variable in variables:
                shape = util.shape(variable)
                size = util.prod(shape)
                tangents.append(
                    tf.reshape(self.tangent[offset:offset + size], shape))
                offset += size

            gradients = tf.gradients(fixed_kl_divergence, variables)
            gradient_vector_product = [
                tf.reduce_sum(g * t) for (g, t) in zip(gradients, tangents)
            ]

            self.flat_variable_helper = FlatVarHelper(variables)
            gradients = tf.gradients(gradient_vector_product, variables)
            self.fisher_vector_product = tf.concat(
                values=[tf.reshape(grad, (-1, )) for grad in gradients],
                axis=0)

            self.cg_optimizer = ConjugateGradientOptimizer(
                self.logger, config.cg_iterations)
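
For reference, the constrained problem that every variant above sets up is the standard TRPO objective, with self.reward carrying the advantage estimate \hat{A} and delta = max_kl_divergence:

\min_{\theta}\; L_{\mathrm{surr}}(\theta) = -\,\mathbb{E}\big[\, r_\theta\, \hat{A} \,\big],
\qquad
r_\theta = \frac{\pi_\theta(a \mid s)}{\pi_{\theta_{\mathrm{old}}}(a \mid s)}
         = \exp\big(\log \pi_\theta - \log \pi_{\theta_{\mathrm{old}}}\big),
\qquad
\text{subject to}\;\; \mathbb{E}\big[\mathrm{KL}(\pi_{\theta_{\mathrm{old}}} \,\Vert\, \pi_\theta)\big] \le \delta .

The two caps seen in the code, tf.minimum(tf.exp(log_prob - previous_log_prob), 1000) in some variants and tf.minimum(x=(log_prob - prev_log_prob), y=10.0) before the exponential in this one, are numerical guards on the probability ratio, not part of the objective.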