Example #1
    def __init__(self, variables):
        self.session = None
        shapes = [util.shape(variable) for variable in variables]
        total_size = sum(util.prod(shape) for shape in shapes)
        self.theta = tf.placeholder(tf.float32, [total_size])
        start = 0
        assigns = []

        for (shape, variable) in zip(shapes, variables):
            size = util.prod(shape)
            assigns.append(tf.assign(variable, tf.reshape(self.theta[start:start + size], shape)))
            start += size

        self.set_op = tf.group(*assigns)
        self.get_op = tf.concat(axis=0, values=[tf.reshape(variable, (-1,)) for variable in variables])
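
The constructor above builds two graph ops: get_op concatenates every variable into a single flat vector, and set_op scatters a flat theta placeholder back into the individual variables. A minimal TF 1.x usage sketch, assuming the class is named FlatVariableHelper (a stand-in name) and that self.session is assigned elsewhere:

import numpy as np
import tensorflow as tf

# FlatVariableHelper is a placeholder name for the class whose __init__ is shown above.
helper = FlatVariableHelper(variables=tf.trainable_variables())
helper.session = tf.Session()
helper.session.run(tf.global_variables_initializer())

# Read all variables as one flat float32 vector.
theta = helper.session.run(helper.get_op)

# Perturb the vector and write it back through the single grouped assign op.
perturbed = theta + 1e-2 * np.random.randn(theta.size).astype(np.float32)
helper.session.run(helper.set_op, feed_dict={helper.theta: perturbed})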
Example #2
    def tf_q_value(self, embedding, distr_params, action, name):
        num_action = util.prod(self.actions_spec[name]['shape'])

        mean, stddev, _ = distr_params
        flat_mean = tf.reshape(tensor=mean, shape=(-1, num_action))
        flat_stddev = tf.reshape(tensor=stddev, shape=(-1, num_action))

        # Advantage computation
        # Network outputs entries of lower triangular matrix L
        if self.l_entries[name] is None:
            l_matrix = flat_stddev

        else:
            l_matrix = tf.map_fn(fn=tf.diag, elems=flat_stddev)

            l_entries = self.l_entries[name].apply(x=embedding)

            offset = 0
            columns = list()
            for zeros, size in enumerate(xrange(num_action - 1, -1, -1), 1):
                column = tf.pad(tensor=l_entries[:, offset: offset + size], paddings=((0, 0), (zeros, 0)))
                columns.append(column)
                offset += size

            l_matrix += tf.stack(values=columns, axis=1)

        # P = LL^T
        p_matrix = tf.matmul(a=l_matrix, b=tf.transpose(a=l_matrix, perm=(0, 2, 1)))

        # A = -0.5 (a - mean)^T P (a - mean)
        flat_action = tf.reshape(tensor=action, shape=(-1, num_action))
        difference = flat_action - flat_mean
        advantage = tf.matmul(a=p_matrix, b=tf.expand_dims(input=difference, axis=2))
        advantage = tf.matmul(a=tf.expand_dims(input=difference, axis=1), b=advantage)
        advantage = tf.squeeze(input=(-advantage / 2.0), axis=2)

        # Q = A + V
        # State-value function
        state_value = self.state_values[name].apply(x=embedding)
        q_value = state_value + advantage

        return tf.reshape(tensor=q_value, shape=((-1,) + self.actions_spec[name]['shape']))
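
The loop above slices the flat l_entries vector into pieces of decreasing length and left-pads each piece with zeros before stacking, so that the stacked pieces fill one triangle of a num_action x num_action matrix while the diagonal comes from flat_stddev. A small NumPy sketch of the padding, with made-up entries and the batch dimension dropped:

import numpy as np

num_action = 3
l_entries = np.array([1.0, 2.0, 3.0])  # num_action * (num_action - 1) // 2 entries

offset = 0
pieces = []
for zeros, size in enumerate(range(num_action - 1, -1, -1), 1):
    # slice of length `size`, left-padded with `zeros` zeros to length num_action
    pieces.append(np.pad(l_entries[offset:offset + size], (zeros, 0)))
    offset += size

triangle = np.stack(pieces)
# triangle == [[0., 1., 2.],
#              [0., 0., 3.],
#              [0., 0., 0.]]

Whichever triangle the entries end up in, the subsequent P = L L^T is symmetric positive semidefinite by construction, which is all the quadratic advantage term requires.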
Example #3
    def create_tf_operations(self, config):
        if len(config.states) > 1:
            raise Exception()

        with tf.variable_scope('mlp_value_function'):
            self.state = tf.placeholder(dtype=tf.float32, shape=(None, util.prod(next(iter(config.states))[1].shape)))
            self.returns = tf.placeholder(dtype=tf.float32, shape=(None,))

            network_builder = layered_network_builder((
                {'type': 'dense', 'size': self.size},
                {'type': 'dense', 'size': 1})
            )

            network = NeuralNetwork(network_builder=network_builder, inputs=dict(state=self.state))

            self.prediction = network.output
            loss = tf.nn.l2_loss(self.prediction - self.returns)

            optimizer = tf.train.AdamOptimizer(learning_rate=config.learning_rate)
            self.optimize = optimizer.minimize(loss)
Example #4
    def create_training_operations(self, config):
        self.training_output = dict()
        q_values = dict()
        for name, action in self.action.items():
            flat_size = util.prod(config.actions[name].shape)
            num_actions = config.actions[name].num_actions
            shape = (-1, ) + config.actions[name].shape + (num_actions, )

            output = layers['linear'](x=self.training_network.output,
                                      size=(flat_size * num_actions))
            output = tf.reshape(tensor=output, shape=shape)

            self.training_output[name] = output
            self.action_taken[name] = tf.argmax(input=output, axis=-1)

            one_hot = tf.one_hot(indices=action, depth=num_actions)
            q_values[name] = tf.reduce_sum(input_tensor=(output * one_hot),
                                           axis=-1)

        return q_values
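
The one_hot multiplication followed by reduce_sum simply picks out Q(s, a) for the action that was actually taken. A NumPy equivalent with made-up values:

import numpy as np

output = np.array([[1.0, 2.0, 3.0],
                   [0.5, 0.1, 0.2]])        # Q-values, 2 states x 3 actions
action = np.array([2, 0])                   # actions taken

one_hot = np.eye(3)[action]                 # [[0., 0., 1.], [1., 0., 0.]]
q_taken = np.sum(output * one_hot, axis=-1) # [3.0, 0.5]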
Example #5
    def tf_regularization_losses(self, states, internals, update):
        losses = super(DistributionModel,
                       self).tf_regularization_losses(states=states,
                                                      internals=internals,
                                                      update=update)

        network_loss = self.network.regularization_loss()
        if network_loss is not None:
            losses['network'] = network_loss

        for distribution in self.distributions.values():
            regularization_loss = distribution.regularization_loss()
            if regularization_loss is not None:
                if 'distributions' in losses:
                    losses['distributions'] += regularization_loss
                else:
                    losses['distributions'] = regularization_loss

        if self.entropy_regularization is not None and self.entropy_regularization > 0.0:
            entropies = list()
            embedding = self.network.apply(x=states,
                                           internals=internals,
                                           update=update)
            for name, distribution in self.distributions.items():
                distr_params = distribution.parameterize(x=embedding)
                entropy = distribution.entropy(distr_params=distr_params)
                collapsed_size = util.prod(util.shape(entropy)[1:])
                entropy = tf.reshape(tensor=entropy,
                                     shape=(-1, collapsed_size))
                entropies.append(entropy)

            entropy_per_instance = tf.reduce_mean(input_tensor=tf.concat(
                values=entropies, axis=1),
                                                  axis=1)
            entropy = tf.reduce_mean(input_tensor=entropy_per_instance, axis=0)
            if 'entropy' in self.summary_labels:
                summary = tf.summary.scalar(name='entropy', tensor=entropy)
                self.summaries.append(summary)
            losses['entropy'] = -self.entropy_regularization * entropy

        return losses
Example #6
 def create_q_deltas(self, config):
     """
     Creates the deltas (or advantage) of the Q values
     :return: A list of deltas per action
     """
     deltas = list()
     terminal_float = tf.cast(x=self.terminal, dtype=tf.float32)
     for name, action in self.action.items():
         reward = self.reward
         terminal = terminal_float
         for _ in range(len(config.actions[name].shape)):
             reward = tf.expand_dims(input=reward, axis=1)
             terminal = tf.expand_dims(input=terminal, axis=1)
         q_target = reward + (
             1.0 - terminal) * config.discount * self.target_values[name]
         delta = tf.stop_gradient(q_target) - self.q_values[name]
         delta = tf.reshape(tensor=delta,
                            shape=(-1,
                                   util.prod(config.actions[name].shape)))
         deltas.append(delta)
     return deltas
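
The target inside the loop is the usual one-step bootstrap: reward plus discounted target value, with the bootstrap switched off on terminal transitions. With made-up numbers:

import numpy as np

reward = np.array([1.0, 0.0])
terminal = np.array([0.0, 1.0])        # terminal_float above
discount = 0.99
target_value = np.array([2.0, 5.0])    # target-network estimate for the next state

q_target = reward + (1.0 - terminal) * discount * target_value
# q_target == [2.98, 0.0]: the terminal transition contributes no bootstrap term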
Example #7
    def __init__(self, states_spec, actions_spec, network_spec, config):
        if any(action['type'] != 'float' or 'min_value' in action
               or 'max_value' in action for action in actions_spec.values()):
            raise TensorForceError(
                "Only unconstrained float actions valid for NAFModel.")

        with tf.name_scope(name=config.scope):
            self.state_values = dict()
            self.l_entries = dict()
            for name, action in actions_spec.items():
                num_action = util.prod(action['shape'])
                self.state_values[name] = Linear(size=num_action,
                                                 scope=(name + 'state-value'))
                self.l_entries[name] = Linear(size=(num_action *
                                                    (num_action - 1) // 2),
                                              scope=(name + '-l-entries'))

        super(QNAFModel, self).__init__(states_spec=states_spec,
                                        actions_spec=actions_spec,
                                        network_spec=network_spec,
                                        config=config)
Example #8
    def create_target_operations(self, config):
        target_values = dict()
        for name, action in self.action_taken.items():
            flat_size = util.prod(config.actions[name].shape)
            num_actions = config.actions[name].num_actions
            shape = (-1, ) + config.actions[name].shape + (num_actions, )

            output = layers['linear'](x=self.target_network.output,
                                      size=(flat_size * num_actions))
            output = tf.reshape(tensor=output, shape=shape)

            if config.double_dqn:
                one_hot = tf.one_hot(indices=action, depth=num_actions)
                target_values[name] = tf.reduce_sum(input_tensor=(output *
                                                                  one_hot),
                                                    axis=-1)
            else:
                target_values[name] = tf.reduce_max(input_tensor=output,
                                                    axis=-1)

        return target_values
Example #9
 def create_tf_operations(self, x, deterministic):
     flat_size = util.prod(self.shape)
     if isinstance(self.mean, float):
         bias = [self.mean for _ in range(flat_size)]
     else:
         bias = self.mean
     self.mean = layers['linear'](x=x, size=flat_size, bias=bias)
     self.mean = tf.reshape(tensor=self.mean, shape=((-1, ) + self.shape))
     # self.mean = tf.squeeze(input=self.mean, axis=1)
     if isinstance(self.log_stddev, float):
         bias = [self.log_stddev for _ in range(flat_size)]
     else:
         bias = self.log_stddev
     self.log_stddev = layers['linear'](x=x, size=flat_size, bias=bias)
     self.log_stddev = tf.reshape(tensor=self.log_stddev,
                                  shape=((-1, ) + self.shape))
     # self.log_stddev = tf.squeeze(input=self.log_stddev, axis=1)
     self.log_stddev = tf.minimum(x=self.log_stddev,
                                  y=10.0)  # prevent infinity when exp
     self.distribution = (self.mean, self.log_stddev)
     self.deterministic = deterministic
Example #10
    def create_tf_operations(self, x, deterministic):
        # Flat mean and log standard deviation
        flat_size = util.prod(self.shape)

        # Softplus to ensure alpha and beta >= 1
        self.alpha = layers['linear'](x=x, size=flat_size, bias=self.alpha)
        self.alpha = tf.nn.softplus(features=self.alpha)
        shape = (-1, ) + self.shape
        self.alpha = tf.reshape(tensor=self.alpha, shape=shape)

        self.beta = layers['linear'](x=x, size=flat_size, bias=self.beta)
        self.beta = tf.nn.softplus(features=self.beta)
        self.beta = tf.reshape(tensor=self.beta, shape=shape)

        self.sum = self.alpha + self.beta
        self.mean = self.alpha / tf.maximum(x=self.sum, y=util.epsilon)

        self.log_norm = tf.lgamma(self.alpha) + tf.lgamma(
            self.beta) - tf.lgamma(self.sum)

        self.deterministic = deterministic
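
The mean computed above is the standard Beta mean alpha / (alpha + beta), and log_norm is the log of the Beta function B(alpha, beta), which SciPy exposes directly. A quick sanity check with fixed values:

import numpy as np
from scipy.special import betaln, gammaln

alpha, beta = 2.0, 3.0
log_norm = gammaln(alpha) + gammaln(beta) - gammaln(alpha + beta)
print(np.isclose(log_norm, betaln(alpha, beta)))  # True; both equal log(1/12)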
Example #11
    def tf_kl_divergence(self, states, internals, update):
        embedding = self.network.apply(x=states,
                                       internals=internals,
                                       update=update)
        kl_divergences = list()

        for name, distribution in self.distributions.items():
            distr_params = distribution.parameterize(x=embedding)
            fixed_distr_params = tuple(
                tf.stop_gradient(input=value) for value in distr_params)
            kl_divergence = distribution.kl_divergence(
                distr_params1=fixed_distr_params, distr_params2=distr_params)
            collapsed_size = util.prod(util.shape(kl_divergence)[1:])
            kl_divergence = tf.reshape(tensor=kl_divergence,
                                       shape=(-1, collapsed_size))
            kl_divergences.append(kl_divergence)

        kl_divergence_per_instance = tf.reduce_mean(input_tensor=tf.concat(
            values=kl_divergences, axis=1),
                                                    axis=1)
        return tf.reduce_mean(input_tensor=kl_divergence_per_instance, axis=0)
Example #12
    def __init__(self, shape, min_value, max_value, alpha=0.0, beta=0.0, scope='beta', summary_labels=()):
        """
        Beta distribution.

        Args:
            shape: Action shape.
            min_value: Minimum value of continuous actions.
            max_value: Maximum value of continuous actions.
            alpha: Optional distribution bias for the alpha value.
            beta: Optional distribution bias for the beta value.
        """
        assert min_value is None or max_value > min_value
        self.shape = shape
        self.min_value = min_value
        self.max_value = max_value
        action_size = util.prod(self.shape)

        self.alpha = Linear(size=action_size, bias=alpha, scope='alpha', summary_labels=summary_labels)
        self.beta = Linear(size=action_size, bias=beta, scope='beta', summary_labels=summary_labels)

        super(Beta, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels)
Example #13
    def tf_demo_loss(self, states, actions, terminal, reward, internals, update, reference=None):
        """
        Extends the q-model loss via the dqfd large-margin loss.
        """
        embedding = self.network.apply(x=states, internals=internals, update=update)
        deltas = list()

        for name in sorted(actions):
            action = actions[name]
            distr_params = self.distributions[name].parameterize(x=embedding)
            state_action_value = self.distributions[name].state_action_value(distr_params=distr_params, action=action)

            # Create the supervised margin loss
            # Zero for the action taken, one for all other actions, now multiply by expert margin
            if self.actions_spec[name]['type'] == 'bool':
                num_actions = 2
                action = tf.cast(x=action, dtype=util.tf_dtype('int'))
            else:
                num_actions = self.actions_spec[name]['num_actions']

            one_hot = tf.one_hot(indices=action, depth=num_actions)
            ones = tf.ones_like(tensor=one_hot, dtype=tf.float32)
            inverted_one_hot = ones - one_hot

            # max_a([Q(s,a) + l(s,a_E,a)]), where l(s,a_E,a) is 0 for the expert action and the margin value for all others
            state_action_values = self.distributions[name].state_action_value(distr_params=distr_params)
            state_action_values = state_action_values + inverted_one_hot * self.expert_margin
            supervised_selector = tf.reduce_max(input_tensor=state_action_values, axis=-1)

            # J_E(Q) = max_a([Q(s,a) + l(s,a_E,a)]) - Q(s,a_E)
            delta = supervised_selector - state_action_value

            action_size = util.prod(self.actions_spec[name]['shape'])
            delta = tf.reshape(tensor=delta, shape=(-1, action_size))
            deltas.append(delta)

        loss_per_instance = tf.reduce_mean(input_tensor=tf.concat(values=deltas, axis=1), axis=1)
        loss_per_instance = tf.square(x=loss_per_instance)

        return tf.reduce_mean(input_tensor=loss_per_instance, axis=0)
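
Concretely, the large-margin term penalises any non-expert action whose Q-value comes within expert_margin of the expert action's Q-value. A tiny NumPy illustration for a single state with four discrete actions, expert action 2 and a margin of 0.5 (all values made up):

import numpy as np

q = np.array([1.0, 2.0, 0.5, 1.5])        # Q(s, a) for each action
expert_action = 2
margin = 0.5

l = np.full(4, margin)
l[expert_action] = 0.0                    # l(s, a_E, a)
delta = np.max(q + l) - q[expert_action]  # max_a[Q(s,a) + l(s,a_E,a)] - Q(s,a_E)
# delta == 2.5 - 0.5 == 2.0; the deltas are then squared and averaged over the batch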
Example #14
    def create_tf_operations(self, x, deterministic):
        self.min_value = tf.constant(value=self.min_value, dtype=tf.float32)
        self.max_value = tf.constant(value=self.max_value, dtype=tf.float32)

        # Flat mean and log standard deviation
        flat_size = util.prod(self.shape)
        log_eps = log(util.epsilon)

        # Softplus to ensure alpha and beta >= 1
        self.alpha = layers['linear'](x=x,
                                      size=flat_size,
                                      bias=self.alpha,
                                      scope='alpha')
        self.alpha = tf.clip_by_value(t=self.alpha,
                                      clip_value_min=log_eps,
                                      clip_value_max=-log_eps)
        self.alpha = tf.log(x=(tf.exp(x=self.alpha) +
                               1.0))  # tf.nn.softplus(features=self.alpha)

        self.beta = layers['linear'](x=x,
                                     size=flat_size,
                                     bias=self.beta,
                                     scope='beta')
        self.beta = tf.clip_by_value(t=self.beta,
                                     clip_value_min=log_eps,
                                     clip_value_max=-log_eps)
        self.beta = tf.log(x=(tf.exp(x=self.beta) +
                              1.0))  # tf.nn.softplus(features=self.beta)

        shape = (-1, ) + self.shape
        self.alpha = tf.reshape(tensor=self.alpha, shape=shape)
        self.beta = tf.reshape(tensor=self.beta, shape=shape)

        self.sum = tf.maximum(x=(self.alpha + self.beta), y=util.epsilon)
        self.mean = self.beta / self.sum

        self.log_norm = tf.lgamma(self.alpha) + tf.lgamma(
            self.beta) - tf.lgamma(self.sum)

        self.deterministic = deterministic
Example #15
    def create_tf_operations(self, x, deterministic):
        self.deterministic = deterministic

        # Flat logits
        flat_size = util.prod(self.shape) * self.num_actions
        self.logits = layers['linear'](x=x,
                                       size=flat_size,
                                       bias=self.logits,
                                       scope='logits')

        # Reshape logits to action shape
        shape = (-1, ) + self.shape + (self.num_actions, )
        self.logits = tf.reshape(tensor=self.logits, shape=shape)

        # Softmax for corresponding probabilities
        self.probabilities = tf.nn.softmax(logits=self.logits, dim=-1)

        # Min epsilon probability for numerical stability
        self.probabilities = tf.maximum(x=self.probabilities, y=util.epsilon)

        # "Normalized" logits
        self.logits = tf.log(x=self.probabilities)
Example #16
    def tf_demo_loss(self, states, actions, terminal, reward, internals,
                     update):
        embedding = self.network.apply(x=states,
                                       internals=internals,
                                       update=update)
        deltas = list()

        for name, distribution in self.distributions.items():
            distr_params = distribution.parameters(x=embedding)
            state_action_values = distribution.state_action_values(
                distr_params=distr_params)

            # Create the supervised margin loss
            # Zero for the action taken, one for all other actions, now multiply by expert margin
            if self.actions_spec[name]['type'] == 'bool':
                num_actions = 2
            else:
                num_actions = self.actions_spec[name]['num_actions']
            one_hot = tf.one_hot(indices=actions[name], depth=num_actions)
            ones = tf.ones_like(tensor=one_hot, dtype=tf.float32)
            inverted_one_hot = ones - one_hot

            # max_a([Q(s,a) + l(s,a_E,a)]), where l(s,a_E,a) is 0 for the expert action and the margin value for all others
            expert_margin = distr_params + inverted_one_hot * self.expert_margin

            # J_E(Q) = max_a([Q(s,a) + l(s,a_E,a)]) - Q(s,a_E)
            supervised_selector = tf.reduce_max(input_tensor=expert_margin,
                                                axis=-1)
            delta = supervised_selector - state_action_values
            delta = tf.reshape(
                tensor=delta,
                shape=(-1, util.prod(self.actions_spec[name]['shape'])))
            deltas.append(delta)

        loss_per_instance = tf.reduce_mean(input_tensor=tf.concat(
            values=deltas, axis=1),
                                           axis=1)
        loss_per_instance = tf.square(x=loss_per_instance)
        return tf.reduce_mean(input_tensor=loss_per_instance, axis=0)
Example #17
    def __init__(self,
                 shape,
                 probability=0.5,
                 scope='bernoulli',
                 summary_labels=()):
        """
        Bernoulli distribution.

        Args:
            shape: Action shape.
            probability: Optional distribution bias.
        """
        self.shape = shape
        action_size = util.prod(self.shape)

        self.logit = Linear(size=action_size,
                            bias=log(probability),
                            scope='logit')

        super(Bernoulli, self).__init__(shape=shape,
                                        scope=scope,
                                        summary_labels=summary_labels)
Example #18
 def tf_compare(self, states, internals, actions, terminal, reward, update,
                reference):
     reward = self.fn_reward_estimation(states=states,
                                        internals=internals,
                                        terminal=terminal,
                                        reward=reward,
                                        update=update)
     embedding = self.network.apply(x=states,
                                    internals=internals,
                                    update=update)
     log_probs = list()
     for name in sorted(self.distributions):
         distribution = self.distributions[name]
         distr_params = distribution.parameterize(x=embedding)
         log_prob = distribution.log_probability(distr_params=distr_params,
                                                 action=actions[name])
         collapsed_size = util.prod(util.shape(log_prob)[1:])
         log_prob = tf.reshape(tensor=log_prob, shape=(-1, collapsed_size))
         log_probs.append(log_prob)
     log_prob = tf.reduce_mean(input_tensor=tf.concat(values=log_probs,
                                                      axis=1),
                               axis=1)
     prob_ratio = tf.exp(x=(log_prob - reference))
     if self.likelihood_ratio_clipping is None:
         gain_per_instance = prob_ratio * reward
     else:
         clipped_prob_ratio = tf.clip_by_value(
             t=prob_ratio,
             clip_value_min=(1.0 / (1.0 + self.likelihood_ratio_clipping)),
             clip_value_max=(1.0 + self.likelihood_ratio_clipping))
         gain_per_instance = tf.minimum(x=(prob_ratio * reward),
                                        y=(clipped_prob_ratio * reward))
     gain = tf.reduce_mean(input_tensor=gain_per_instance, axis=0)
     losses = self.fn_regularization_losses(states=states,
                                            internals=internals,
                                            update=update)
     if len(losses) > 0:
         gain -= tf.add_n(inputs=list(losses.values()))
     return gain
Example #19
    def __init__(self, shape, min_value, max_value, alpha=0.0, beta=0.0, scope='beta', summary_labels=()):
        """
        Beta distribution used for continuous actions. In particular, the Beta distribution
        allows bounding action values between the given min and max values.

        Args:
            shape: Shape of actions
            min_value: Min value of all actions for the given shape
            max_value: Max value of all actions for the given shape
            alpha: Concentration parameter of the Beta distribution
            beta: Concentration parameter of the Beta distribution
        """
        assert min_value is None or max_value > min_value
        self.shape = shape
        self.min_value = min_value
        self.max_value = max_value
        action_size = util.prod(self.shape)

        with tf.name_scope(name=scope):
            self.alpha = Linear(size=action_size, bias=alpha, scope='alpha')
            self.beta = Linear(size=action_size, bias=beta, scope='beta')

        super(Beta, self).__init__(scope, summary_labels)
Example #20
    def create_tf_operations(self, x, deterministic):
        # Flat logits
        flat_size = util.prod(self.shape) * self.num_actions
        self.logits = layers['linear'](x=x, size=flat_size, bias=self.logits)

        # Reshape logits to action shape
        shape = (-1, ) + self.shape + (self.num_actions, )
        self.logits = tf.reshape(tensor=self.logits, shape=shape)

        # Linearly shift logits for numerical stability
        self.logits -= tf.reduce_max(input_tensor=self.logits,
                                     axis=-1,
                                     keep_dims=True)

        # Softmax for corresponding probabilities
        self.probabilities = tf.nn.softmax(logits=self.logits, dim=-1)

        # "normalized" logits
        self.logits = tf.log(x=self.probabilities)

        # General distribution values
        self.distribution = (self.logits, )
        self.deterministic = deterministic
Example #21
    def tf_loss_per_instance(
        self,
        states,
        internals,
        actions,
        terminal,
        reward,
        next_states,
        next_internals,
        update,
        reference=None
    ):
        embedding = self.network.apply(x=states, internals=internals, update=update)
        log_probs = list()

        for name, distribution in self.distributions.items():
            distr_params = distribution.parameterize(x=embedding)
            log_prob = distribution.log_probability(distr_params=distr_params, action=actions[name])
            collapsed_size = util.prod(util.shape(log_prob)[1:])
            log_prob = tf.reshape(tensor=log_prob, shape=(-1, collapsed_size))
            log_probs.append(log_prob)
        log_prob = tf.reduce_mean(input_tensor=tf.concat(values=log_probs, axis=1), axis=1)
        return -log_prob * reward
Example #22
    def create_tf_operations(self, config):
        """Create training graph. For DQFD, we build the double-dqn training graph and
        modify the double_q_loss function according to eq. 5

        Args:
            config: Config dict.

        Returns:

        """
        super(DQFDModel, self).create_tf_operations(config)

        with tf.name_scope('supervised-update'):
            deltas = list()
            for name, action in self.action.items():
                # Create the supervised margin loss
                # Zero for the action taken, one for all other actions, now multiply by expert margin
                one_hot = tf.one_hot(indices=action, depth=config.actions[name].num_actions)
                ones = tf.ones_like(tensor=one_hot, dtype=tf.float32)
                inverted_one_hot = ones - one_hot

                # max_a([Q(s,a) + l(s,a_E,a)]), where l(s,a_E,a) is 0 for the expert action and the margin value for all others
                expert_margin = self.training_output[name] + inverted_one_hot * config.expert_margin

                # J_E(Q) = max_a([Q(s,a) + l(s,a_E,a)]) - Q(s,a_E)
                supervised_selector = tf.reduce_max(input_tensor=expert_margin, axis=-1)
                delta = supervised_selector - self.q_values[name]
                delta = tf.reshape(tensor=delta, shape=(-1, util.prod(config.actions[name].shape)))
                deltas.append(delta)

            delta = tf.reduce_mean(input_tensor=tf.concat(values=deltas, axis=1), axis=1)
            supervised_loss_per_instance = tf.square(delta)
            supervised_loss = tf.reduce_mean(input_tensor=supervised_loss_per_instance)

            # Combining double q loss with supervised loss
            dqfd_loss = self.q_loss + supervised_loss * config.supervised_weight
            self.dqfd_optimize = self.optimizer.minimize(dqfd_loss)
Example #23
 def tf_pg_loss_per_instance(self, states, internals, actions, terminal, reward, update):
     embedding = self.network.apply(x=states, internals=internals, update=update)
     prob_ratios = list()
     for name, distribution in self.distributions.items():
         distr_params = distribution.parameterize(x=embedding)
         log_prob = distribution.log_probability(distr_params=distr_params, action=actions[name])
         # works the same?
         # fixed_distr_params = tuple(tf.stop_gradient(input=x) for x in distr_params)
         # fixed_log_prob = distribution.log_probability(distr_params=fixed_distr_params, action=actions[name])
         fixed_log_prob = tf.stop_gradient(input=log_prob)
         prob_ratio = tf.exp(x=(log_prob - fixed_log_prob))
         collapsed_size = util.prod(util.shape(prob_ratio)[1:])
         prob_ratio = tf.reshape(tensor=prob_ratio, shape=(-1, collapsed_size))
         prob_ratios.append(prob_ratio)
     prob_ratio = tf.reduce_mean(input_tensor=tf.concat(values=prob_ratios, axis=1), axis=1)
     if self.likelihood_ratio_clipping is None:
         return -prob_ratio * reward
     else:
         clipped_prob_ratio = tf.clip_by_value(
             t=prob_ratio,
             clip_value_min=(1.0 / (1.0 + self.likelihood_ratio_clipping)),
             clip_value_max=(1.0 + self.likelihood_ratio_clipping)
         )
         return -tf.minimum(x=(prob_ratio * reward), y=(clipped_prob_ratio * reward))
Example #24
    def create_tf_operations(self, state, scope='mlp_baseline'):
        with tf.variable_scope(scope) as scope:
            self.state = tf.placeholder(dtype=tf.float32,
                                        shape=(None, util.prod(state.shape)))
            self.returns = tf.placeholder(dtype=tf.float32, shape=(None, ))

            layers = []
            for size in self.sizes:
                layers.append({'type': 'dense', 'size': size})

            layers.append({'type': 'linear', 'size': 1})

            network = NeuralNetwork(
                network_builder=layered_network_builder(layers),
                inputs=dict(state=self.state))

            self.prediction = tf.squeeze(input=network.output, axis=1)
            loss = tf.nn.l2_loss(self.prediction - self.returns)

            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate)

            variables = tf.contrib.framework.get_variables(scope=scope)
            self.optimize = optimizer.minimize(loss, var_list=variables)
Example #25
 def tf_apply(self, x, update):
     return tf.reshape(tensor=x, shape=(-1, util.prod(util.shape(x)[1:])))
Example #26
def flatten(x):
    with tf.variable_scope('flatten'):
        x = tf.reshape(tensor=x, shape=(-1, util.prod(x.get_shape().as_list()[1:])))
    return x
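
A quick shape check for the helper above (TF 1.x), assuming util.prod is a plain product over a list of ints:

import tensorflow as tf

x = tf.placeholder(dtype=tf.float32, shape=(None, 4, 4, 3))
y = flatten(x)
print(y.get_shape().as_list())  # [None, 48]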
Example #27
    def create_tf_operations(self, config):
        super(CategoricalDQNModel, self).create_tf_operations(config)

        # Placeholders
        with tf.variable_scope('placeholder'):
            self.next_state = dict()
            for name, state in config.states.items():
                self.next_state[name] = tf.placeholder(
                    dtype=util.tf_dtype(state.type),
                    shape=(None, ) + tuple(state.shape),
                    name=name)

        # setup constants delta_z and z. z represents the discretized scaling over vmin -> vmax
        scaling_increment = (self.distribution_max - self.distribution_min) / (
            self.num_atoms - 1)  # delta_z in the paper
        quantized_steps = self.distribution_min + np.arange(
            self.num_atoms) * scaling_increment  # z in the paper

        num_actions = {
            name: action.num_actions
            for name, action in config.actions
        }

        # creating networks
        network_builder = util.get_function(fct=config.network)

        # Training network
        with tf.variable_scope('training') as training_scope:
            self.training_network = NeuralNetwork(
                network_builder=network_builder,
                inputs=self.state,
                summary_level=config.tf_summary_level)
            self.network_internal_index = len(self.internal_inputs)
            self.internal_inputs.extend(self.training_network.internal_inputs)
            self.internal_outputs.extend(
                self.training_network.internal_outputs)
            self.internal_inits.extend(self.training_network.internal_inits)
            training_output_logits, training_output_probabilities, training_qval, action_taken = self._create_action_outputs(
                self.training_network.output, quantized_steps, self.num_atoms,
                config, self.action, num_actions)
            # stack to preserve action_taken shape like (batch_size, num_actions)
            for action in self.action:
                if len(action_taken[action]) > 1:
                    self.action_taken[action] = tf.stack(action_taken[action],
                                                         axis=1)
                else:
                    self.action_taken[action] = action_taken[action][0]

                # summarize expected reward histogram
                if config.tf_summary_level >= 1:
                    for action_shaped in range(len(action_taken[action])):
                        for action_ind in range(num_actions[action]):
                            tf.summary.histogram(
                                '{}-{}-{}-output-distribution'.format(
                                    action, action_shaped, action_ind),
                                training_output_probabilities[action]
                                [action_shaped][:, action_ind] *
                                quantized_steps)

            self.training_variables = tf.contrib.framework.get_variables(
                scope=training_scope)

        # Target network
        with tf.variable_scope('target') as target_scope:
            self.target_network = NeuralNetwork(
                network_builder=network_builder, inputs=self.next_state)
            self.next_internal_inputs = list(
                self.target_network.internal_inputs)
            _, target_output_probabilities, target_qval, target_action = self._create_action_outputs(
                self.target_network.output, quantized_steps, self.num_atoms,
                config, self.action, num_actions)

            self.target_variables = tf.contrib.framework.get_variables(
                scope=target_scope)

        with tf.name_scope('update'):
            # broadcast rewards and discounted quantization. Shape (batchsize, num_atoms). T_z_j in the paper
            reward = tf.expand_dims(self.reward, axis=1)
            terminal = tf.expand_dims(tf.cast(x=self.terminal,
                                              dtype=tf.float32),
                                      axis=1)
            broadcasted_rewards = reward + (1.0 - terminal) * (
                quantized_steps * self.discount)
            # clip into distribution_min, distribution_max
            quantized_discounted_reward = tf.clip_by_value(
                broadcasted_rewards, self.distribution_min,
                self.distribution_max)
            # compute quantization indices: b, l, u in the paper
            closest_quantization = (quantized_discounted_reward -
                                    self.distribution_min) / scaling_increment
            lower_ind = tf.floor(closest_quantization)
            upper_ind = tf.ceil(closest_quantization)

            # create shared selections for later use
            dynamic_batch_size = tf.shape(self.reward)[0]
            batch_selection = tf.range(0, dynamic_batch_size)
            # tile expects a tensor of same shape, we are just repeating the selection num_atoms times across the last dimension
            batch_tiled_selection = tf.reshape(
                tf.tile(tf.reshape(batch_selection, (-1, 1)),
                        [1, self.num_atoms]), [-1])
            # combine with lower and upper ind, same as zip(flatten(batch_tiled_selection), flatten(lower_ind))
            # also cast to int32 to use as index
            batch_lower_inds = tf.stack(
                (batch_tiled_selection,
                 tf.reshape(tf.cast(lower_ind, tf.int32), [-1])),
                axis=1)
            batch_upper_inds = tf.stack(
                (batch_tiled_selection,
                 tf.reshape(tf.cast(upper_ind, tf.int32), [-1])),
                axis=1)

            # create loss for each action
            for action in self.action:
                # if shape of action != () we need to process each action head separately
                for action_ind in range(
                        max([util.prod(config.actions[action].shape), 1])):
                    # project onto the supports
                    # tensorflow indexing is still not great, we stack these two and use gather_nd later
                    target_batch_action_selection = tf.stack(
                        (batch_selection, target_action[action][action_ind]),
                        axis=1)

                    # distribute probability scaled by distance
                    # in numpy the equivalent is target_output_probabilities[action][batch_selection, target_action]
                    target_probabilities_of_action = tf.gather_nd(
                        target_output_probabilities[action][action_ind],
                        target_batch_action_selection)
                    distance_lower = target_probabilities_of_action * (
                        closest_quantization - lower_ind)
                    distance_upper = target_probabilities_of_action * (
                        upper_ind - closest_quantization)

                    # sum distances aligned into quantized bins. m in the paper
                    # scatter_nd actually sums the values into a zeros tensor instead of overwriting
                    # this is pretty much a huge hack refer to https://github.com/tensorflow/tensorflow/issues/8102
                    target_quantized_probabilities_lower = tf.scatter_nd(
                        batch_lower_inds, tf.reshape(distance_lower, [-1]),
                        (dynamic_batch_size, self.num_atoms))
                    target_quantized_probabilities_upper = tf.scatter_nd(
                        batch_upper_inds, tf.reshape(distance_upper, [-1]),
                        (dynamic_batch_size, self.num_atoms))
                    # no gradient should flow back to the target network
                    target_quantized_probabilities = tf.stop_gradient(
                        target_quantized_probabilities_lower +
                        target_quantized_probabilities_upper)

                    # we must check if input action has shape
                    if len(self.action[action].shape) > 1:
                        input_action = self.action[action][:, action_ind]
                    else:
                        input_action = self.action[action]
                    # now that we have the target probabilities, the loss is the categorical
                    # cross-entropy against the actions we actually took
                    training_action_selection = tf.stack(
                        (batch_selection, input_action), axis=1)
                    probabilities_for_action = tf.gather_nd(
                        training_output_probabilities[action][action_ind],
                        training_action_selection)
                    self.loss_per_instance = -tf.reduce_sum(
                        target_quantized_probabilities *
                        tf.log(probabilities_for_action + util.epsilon),
                        axis=-1)
                    loss = tf.reduce_mean(self.loss_per_instance)
                    tf.losses.add_loss(loss)

                    tf.summary.scalar(
                        'cce-loss-{}-{}'.format(action, action_ind), loss)

        # Update target network
        with tf.name_scope("update_target"):
            self.target_network_update = list()
            for v_source, v_target in zip(self.training_variables,
                                          self.target_variables):
                update = v_target.assign_sub(config.update_target_weight *
                                             (v_target - v_source))
                self.target_network_update.append(update)
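
For reference, the projection step computed in the update scope (b, l, u and m in the paper) looks as follows for a single transition, written as a NumPy sketch that follows the formulation of the distributional RL paper; the support, reward and target probabilities are made up, and the degenerate case where lower == upper is ignored:

import numpy as np

v_min, v_max, num_atoms = -1.0, 1.0, 5
delta_z = (v_max - v_min) / (num_atoms - 1)     # scaling_increment above
z = v_min + np.arange(num_atoms) * delta_z      # quantized_steps above

reward, terminal, discount = 0.05, 0.0, 0.8
p_target = np.array([0.1, 0.2, 0.4, 0.2, 0.1])  # target-network probabilities

t_z = np.clip(reward + (1.0 - terminal) * discount * z, v_min, v_max)
b = (t_z - v_min) / delta_z                     # closest_quantization above
lower = np.floor(b).astype(int)
upper = np.ceil(b).astype(int)

m = np.zeros(num_atoms)                         # projected target distribution
np.add.at(m, lower, p_target * (upper - b))
np.add.at(m, upper, p_target * (b - lower))
# m is approximately [0.05, 0.19, 0.44, 0.25, 0.07], sums to 1, and serves as the
# cross-entropy target for the training network's probabilities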
Example #28
    def create_tf_operations(self, config):
        super(NAFModel, self).create_tf_operations(config)
        num_actions = sum(
            util.prod(config.actions[name].shape)
            for name in sorted(self.action))

        # Get hidden layers from network generator, then add NAF outputs, same for target network
        with tf.variable_scope('training'):
            network_builder = util.get_function(fct=config.network)
            self.training_network = NeuralNetwork(
                network_builder=network_builder, inputs=self.state)
            self.internal_inputs.extend(self.training_network.internal_inputs)
            self.internal_outputs.extend(
                self.training_network.internal_outputs)
            self.internal_inits.extend(self.training_network.internal_inits)

        with tf.variable_scope('training_outputs') as scope:
            # Action outputs
            flat_mean = layers['linear'](x=self.training_network.output,
                                         size=num_actions)
            n = 0
            for name in sorted(self.action):
                shape = config.actions[name].shape
                self.action_taken[name] = tf.reshape(
                    tensor=flat_mean[:, n:n + util.prod(shape)],
                    shape=((-1, ) + shape))
                n += util.prod(shape)

            # Advantage computation
            # Network outputs entries of lower triangular matrix L
            lower_triangular_size = num_actions * (num_actions + 1) // 2
            l_entries = layers['linear'](x=self.training_network.output,
                                         size=lower_triangular_size)

            l_matrix = tf.exp(
                x=tf.map_fn(fn=tf.diag, elems=l_entries[:, :num_actions]))

            if num_actions > 1:
                offset = num_actions
                l_columns = list()
                for zeros, size in enumerate(xrange(num_actions - 1, -1, -1),
                                             1):
                    column = tf.pad(tensor=l_entries[:, offset:offset + size],
                                    paddings=((0, 0), (zeros, 0)))
                    l_columns.append(column)
                    offset += size
                l_matrix += tf.stack(values=l_columns, axis=1)

            # P = LL^T
            p_matrix = tf.matmul(a=l_matrix,
                                 b=tf.transpose(a=l_matrix, perm=(0, 2, 1)))

            flat_action = list()
            for name in sorted(self.action):
                shape = config.actions[name].shape
                flat_action.append(
                    tf.reshape(tensor=self.action[name],
                               shape=(-1, util.prod(shape))))
            flat_action = tf.concat(values=flat_action, axis=1)
            difference = flat_action - flat_mean

            # A = -0.5 (a - mean)^T P (a - mean)
            advantage = tf.matmul(a=p_matrix,
                                  b=tf.expand_dims(input=difference, axis=2))
            advantage = tf.matmul(a=tf.expand_dims(input=difference, axis=1),
                                  b=advantage)
            advantage = tf.squeeze(input=(-advantage / 2.0), axis=2)

            # Q = A + V
            # State-value function
            value = layers['linear'](x=self.training_network.output,
                                     size=num_actions)
            q_value = value + advantage
            training_output_vars = tf.contrib.framework.get_variables(
                scope=scope)

        with tf.variable_scope('target'):
            network_builder = util.get_function(fct=config.network)
            self.target_network = NeuralNetwork(
                network_builder=network_builder, inputs=self.state)
            self.internal_inputs.extend(self.target_network.internal_inputs)
            self.internal_outputs.extend(self.target_network.internal_outputs)
            self.internal_inits.extend(self.target_network.internal_inits)

        with tf.variable_scope('target_outputs') as scope:
            # State-value function
            target_value = layers['linear'](x=self.target_network.output,
                                            size=num_actions)
            target_output_vars = tf.contrib.framework.get_variables(
                scope=scope)

        with tf.name_scope('update'):
            reward = tf.expand_dims(input=self.reward[:-1], axis=1)
            terminal = tf.expand_dims(input=tf.cast(x=self.terminal[:-1],
                                                    dtype=tf.float32),
                                      axis=1)
            q_target = reward + (1.0 -
                                 terminal) * config.discount * target_value[1:]
            delta = q_target - q_value[:-1]
            delta = tf.reduce_mean(input_tensor=delta, axis=1)
            self.loss_per_instance = tf.square(x=delta)

            # We observe issues with numerical stability in some tests, gradient clipping can help
            if config.clip_gradients > 0.0:
                huber_loss = tf.where(
                    condition=(tf.abs(delta) < config.clip_gradients),
                    x=(0.5 * self.loss_per_instance),
                    y=(tf.abs(delta) - 0.5))
                loss = tf.reduce_mean(input_tensor=huber_loss, axis=0)
            else:
                loss = tf.reduce_mean(input_tensor=self.loss_per_instance,
                                      axis=0)
            tf.losses.add_loss(loss)

        with tf.name_scope('update_target'):
            # Combine hidden layer variables and output layer variables
            training_vars = self.training_network.variables + training_output_vars
            target_vars = self.target_network.variables + target_output_vars

            self.target_network_update = list()
            for v_source, v_target in zip(training_vars, target_vars):
                update = v_target.assign_sub(config.update_target_weight *
                                             (v_target - v_source))
                self.target_network_update.append(update)
Example #29
    def act(self, states, deterministic=False):
        """
        Return action(s) for given state(s). First, the states are preprocessed using the given preprocessing
        configuration. Then, the states are passed to the model to calculate the desired action(s) to execute.

        After obtaining the actions, exploration might be added by the agent, depending on the exploration
        configuration.

        Args:
            states: One state (usually a value tuple) or dict of states if multiple states are expected.
            deterministic: If true, no exploration and sampling is applied.
        Returns:
            Scalar value of the action or dict of multiple actions the agent wants to execute.

        """

        self.current_internals = self.next_internals

        if self.unique_state:
            self.current_states = dict(state=np.asarray(states))
        else:
            self.current_states = {
                name: np.asarray(state)
                for name, state in states.items()
            }

        # Preprocessing
        for name, preprocessing in self.preprocessing.items():
            self.current_states[name] = preprocessing.process(
                state=self.current_states[name])

        # Retrieve action
        self.current_actions, self.next_internals, self.timestep = self.model.act(
            states=self.current_states,
            internals=self.current_internals,
            deterministic=deterministic)

        # Exploration
        if not deterministic:
            for name, exploration in self.exploration.items():

                if self.actions_spec[name]['type'] == 'bool':
                    if random() < exploration(episode=self.episode,
                                              timestep=self.timestep):
                        shape = self.actions_spec[name]['shape']
                        self.current_actions[name] = (
                            np.random.random_sample(size=shape) < 0.5)

                elif self.actions_spec[name]['type'] == 'int':
                    if random() < exploration(episode=self.episode,
                                              timestep=self.timestep):
                        shape = self.actions_spec[name]['shape']
                        num_actions = self.actions_spec[name]['num_actions']
                        self.current_actions[name] = np.random.randint(
                            low=num_actions, size=shape)

                elif self.actions_spec[name]['type'] == 'float':
                    explore = (lambda: exploration(episode=self.episode,
                                                   timestep=self.timestep))
                    shape = self.actions_spec[name]['shape']
                    exploration = np.array(
                        [explore() for _ in xrange(util.prod(shape))])

                    if 'min_value' in self.actions_spec[name]:
                        exploration = np.clip(
                            a=exploration,
                            a_min=self.actions_spec[name]['min_value'],
                            a_max=self.actions_spec[name]['max_value'])

                    self.current_actions[name] += np.reshape(
                        exploration, shape)

        if self.unique_action:
            return self.current_actions['action']
        else:
            return self.current_actions
Example #30
 def processed_shape(self, shape):
     if shape[0] == -1:
         return -1, util.prod(shape[1:])
     return util.prod(shape),
Example #31
    def create_tf_operations(self, config):
        """
        Creates PPO training operations, i.e. the SGD update
        based on the trust region loss.
        :return:
        """
        super(PPOModel, self).create_tf_operations(config)

        with tf.variable_scope('update'):
            prob_ratios = list()
            entropy_penalties = list()

            # for diagnostics
            kl_divergences = list()
            entropies = list()
            self.distribution_tensors = dict()
            self.prev_distribution_tensors = dict()

            for name, action in self.action.items():
                shape_size = util.prod(config.actions[name].shape)
                distribution = self.distribution[name]
                fixed_distribution = distribution.__class__.from_tensors(
                    tensors=[
                        tf.stop_gradient(x)
                        for x in distribution.get_tensors()
                    ],
                    deterministic=self.deterministic)

                # Standard policy gradient log likelihood computation
                log_prob = distribution.log_probability(action=action)
                fixed_log_prob = fixed_distribution.log_probability(
                    action=action)
                log_prob_diff = log_prob - fixed_log_prob
                prob_ratio = tf.exp(x=log_prob_diff)
                prob_ratio = tf.reshape(tensor=prob_ratio,
                                        shape=(-1, shape_size))
                prob_ratios.append(prob_ratio)

                entropy = distribution.entropy()
                entropy_penalty = -config.entropy_penalty * entropy
                entropy_penalty = tf.reshape(tensor=entropy_penalty,
                                             shape=(-1, shape_size))
                entropy_penalties.append(entropy_penalty)

                self.distribution_tensors[name] = list(
                    distribution.get_tensors())
                prev_distribution = list(
                    tf.placeholder(dtype=tf.float32,
                                   shape=util.shape(tensor, unknown=None))
                    for tensor in distribution.get_tensors())
                self.prev_distribution_tensors[name] = prev_distribution
                prev_distribution = distribution.from_tensors(
                    tensors=prev_distribution,
                    deterministic=self.deterministic)

                kl_divergence = prev_distribution.kl_divergence(
                    other=distribution)
                kl_divergence = tf.reshape(tensor=kl_divergence,
                                           shape=(-1, shape_size))
                kl_divergences.append(kl_divergence)

                entropy = tf.reshape(tensor=entropy, shape=(-1, shape_size))
                entropies.append(entropy)

            # The surrogate loss in PPO is the minimum of clipped loss and
            # target advantage * prob_ratio, which is the CPO loss
            # Presentation on conservative policy iteration:
            # https://www.cs.cmu.edu/~jcl/presentation/RL/RL.ps
            prob_ratio = tf.reduce_mean(input_tensor=tf.concat(
                values=prob_ratios, axis=1),
                                        axis=1)
            tf.summary.histogram('prob_ratio', prob_ratio)
            tf.summary.scalar('mean_prob_ratio',
                              tf.reduce_mean(input_tensor=prob_ratio, axis=0))

            clipped_prob_ratio = tf.clip_by_value(prob_ratio,
                                                  1.0 - config.loss_clipping,
                                                  1.0 + config.loss_clipping)
            self.loss_per_instance = -tf.minimum(
                x=(prob_ratio * self.reward),
                y=(clipped_prob_ratio * self.reward))
            self.surrogate_loss = tf.reduce_mean(
                input_tensor=self.loss_per_instance,
                axis=0,
                name='surrogate_loss')
            tf.losses.add_loss(self.surrogate_loss)

            # Mean over actions, mean over batch
            entropy_penalty = tf.reduce_mean(input_tensor=tf.concat(
                values=entropy_penalties, axis=1),
                                             axis=1)
            self.entropy_penalty = tf.reduce_mean(input_tensor=entropy_penalty,
                                                  axis=0,
                                                  name='entropy_penalty')
            tf.losses.add_loss(self.entropy_penalty)

            kl_divergence = tf.reduce_mean(input_tensor=tf.concat(
                values=kl_divergences, axis=1),
                                           axis=1)
            self.kl_divergence = tf.reduce_mean(input_tensor=kl_divergence,
                                                axis=0)
            tf.summary.scalar('kl_divergence', self.kl_divergence)

            entropy = tf.reduce_mean(input_tensor=tf.concat(values=entropies,
                                                            axis=1),
                                     axis=1)
            self.entropy = tf.reduce_mean(input_tensor=entropy, axis=0)
            tf.summary.scalar('entropy', self.entropy)
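
The clipped surrogate can be illustrated in isolation. With made-up ratios and advantages and a clipping range of 0.2 (config.loss_clipping above):

import numpy as np

prob_ratio = np.array([0.5, 1.0, 1.5])
advantage = np.array([1.0, -2.0, 0.5])    # plays the role of self.reward above
eps = 0.2

clipped = np.clip(prob_ratio, 1.0 - eps, 1.0 + eps)
loss_per_instance = -np.minimum(prob_ratio * advantage, clipped * advantage)
surrogate_loss = loss_per_instance.mean()
# loss_per_instance == [-0.5, 2.0, -0.6]: for the last instance the clipped term is
# active, so pushing the ratio further above 1.2 yields no additional gain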
Example #32
    def tf_loss_per_instance(self, states, internals, actions, terminal,
                             reward, update):
        # TEMP: Random sampling fix
        if self.random_sampling_fix:
            next_states = self.get_states(states=self.next_state_inputs)
            next_states = {
                name: tf.stop_gradient(input=state)
                for name, state in next_states.items()
            }

            embedding, next_internals = self.network.apply(
                x=states,
                internals=internals,
                update=update,
                return_internals=True)

            # Both networks can use the same internals, could that be a problem?
            # Otherwise need to handle internals indices correctly everywhere
            target_embedding = self.target_network.apply(
                x=next_states, internals=next_internals, update=update)

        else:
            embedding = self.network.apply(
                x={name: state[:-1]
                   for name, state in states.items()},
                internals=[internal[:-1] for internal in internals],
                update=update)

            # Both networks can use the same internals, could that be a problem?
            # Otherwise need to handle internals indices correctly everywhere
            target_embedding = self.target_network.apply(
                x={name: state[1:]
                   for name, state in states.items()},
                internals=[internal[1:] for internal in internals],
                update=update)

            actions = {name: action[:-1] for name, action in actions.items()}
            terminal = terminal[:-1]
            reward = reward[:-1]

        deltas = list()
        for name, distribution in self.distributions.items():
            target_distribution = self.target_distributions[name]

            distr_params = distribution.parameterize(x=embedding)
            target_distr_params = target_distribution.parameterize(
                x=target_embedding)

            q_value = self.tf_q_value(embedding=embedding,
                                      distr_params=distr_params,
                                      action=actions[name],
                                      name=name)

            if self.double_q_model:
                action_taken = distribution.sample(distr_params=distr_params,
                                                   deterministic=True)
            else:
                action_taken = target_distribution.sample(
                    distr_params=target_distr_params, deterministic=True)

            next_q_value = target_distribution.state_action_value(
                distr_params=target_distr_params, action=action_taken)

            delta = self.tf_q_delta(q_value=q_value,
                                    next_q_value=next_q_value,
                                    terminal=terminal,
                                    reward=reward)

            collapsed_size = util.prod(util.shape(delta)[1:])
            delta = tf.reshape(tensor=delta, shape=(-1, collapsed_size))

            deltas.append(delta)

        # Average the deltas over the flattened action components; the squared
        # or Huber loss per instance is applied below
        loss_per_instance = tf.reduce_mean(
            input_tensor=tf.concat(values=deltas, axis=1),
            axis=1)

        # Optional Huber loss
        if self.huber_loss is not None and self.huber_loss > 0.0:
            return tf.where(
                condition=(tf.abs(x=loss_per_instance) <= self.huber_loss),
                x=(0.5 * tf.square(x=loss_per_instance)),
                y=(self.huber_loss *
                   (tf.abs(x=loss_per_instance) - 0.5 * self.huber_loss)))
        else:
            return tf.square(x=loss_per_instance)
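
The optional Huber branch above is the standard piecewise loss: quadratic inside the threshold, linear outside. A minimal NumPy sketch, with self.huber_loss replaced by a plain threshold argument for illustration:

import numpy as np

def huber(delta, threshold=1.0):
    # Quadratic for |delta| <= threshold, linear beyond it, matching the
    # tf.where branches above.
    abs_delta = np.abs(delta)
    return np.where(abs_delta <= threshold,
                    0.5 * np.square(delta),
                    threshold * (abs_delta - 0.5 * threshold))

print(huber(np.array([0.5, 2.0])))  # 0.5 -> 0.125 (quadratic), 2.0 -> 1.5 (linear)
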
Example #33
    def create_tf_operations(self, config):
        """
        Creates TRPO training operations, i.e. the natural gradient update step
        based on the KL divergence constraint between new and old policy.
        :return:
        """
        super(TRPOModel, self).create_tf_operations(config)

        with tf.variable_scope('update'):
            losses = list()
            for name, action in config.actions:
                distribution = self.distribution[name]
                previous_distribution = tuple(
                    tf.placeholder(dtype=tf.float32, shape=util.shape(x, unknown=None))
                    for x in distribution)
                self.internal_inputs.extend(previous_distribution)
                self.internal_outputs.extend(distribution)
                if sum(1 for _ in distribution) == 2:
                    for n, x in enumerate(distribution):
                        if n == 0:
                            self.internal_inits.append(np.zeros(shape=util.shape(x)[1:]))
                        else:
                            self.internal_inits.append(np.ones(shape=util.shape(x)[1:]))
                else:
                    self.internal_inits.extend(np.zeros(shape=util.shape(x)[1:]) for x in distribution)
                previous_distribution = self.distribution[name].__class__(distribution=previous_distribution)

                log_prob = distribution.log_probability(action=self.action[name])
                previous_log_prob = previous_distribution.log_probability(action=self.action[name])
                prob_ratio = tf.minimum(tf.exp(log_prob - previous_log_prob), 1000)

                self.loss_per_instance = tf.multiply(x=prob_ratio, y=self.reward)
                surrogate_loss = -tf.reduce_mean(self.loss_per_instance, axis=0)
                kl_divergence = distribution.kl_divergence(previous_distribution)
                entropy = distribution.entropy()
                losses.append((surrogate_loss, kl_divergence, entropy))

            self.losses = [tf.reduce_mean(loss) for loss in zip(*losses)]

            # Get symbolic gradient expressions
            variables = list(tf.trainable_variables())  # TODO: ideally not value function (see also for "gradients" below)
            gradients = tf.gradients(self.losses, variables)
            variables = [var for var, grad in zip(variables, gradients) if grad is not None]
            gradients = [grad for grad in gradients if grad is not None]
            self.policy_gradient = tf.concat(values=[tf.reshape(grad, (-1,)) for grad in gradients], axis=0)  # util.prod(util.shape(v))

            fixed_distribution = distribution.__class__([tf.stop_gradient(x) for x in distribution])
            fixed_kl_divergence = fixed_distribution.kl_divergence(distribution)

            self.tangent = tf.placeholder(tf.float32, shape=(None,))
            offset = 0
            tangents = []
            for variable in variables:
                shape = util.shape(variable)
                size = util.prod(shape)
                tangents.append(tf.reshape(self.tangent[offset:offset + size], shape))
                offset += size

            gradients = tf.gradients(fixed_kl_divergence, variables)
            gradient_vector_product = [tf.reduce_sum(g * t) for (g, t) in zip(gradients, tangents)]

            self.flat_variable_helper = FlatVarHelper(variables)
            gradients = tf.gradients(gradient_vector_product, variables)
            self.fisher_vector_product = tf.concat(values=[tf.reshape(grad, (-1,)) for grad in gradients], axis=0)

            self.cg_optimizer = ConjugateGradientOptimizer(self.logger, config.cg_iterations)
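
The natural gradient step itself happens outside this method. The sketch below is a generic conjugate-gradient solver in NumPy, not the library's ConjugateGradientOptimizer; it only illustrates how self.policy_gradient (g) and self.fisher_vector_product (F·v) are typically combined to approximate the natural gradient direction F^-1 g. The 2x2 Fisher matrix at the end is a hypothetical stand-in for the TensorFlow product.

import numpy as np

def conjugate_gradient(fisher_vector_product, g, iterations=10, damping=1e-8):
    # Solve F x = g for the natural gradient direction using only
    # Fisher-vector products, never the full Fisher matrix.
    x = np.zeros_like(g)
    r = g.copy()
    p = g.copy()
    r_dot_r = r.dot(r)
    for _ in range(iterations):
        f_p = fisher_vector_product(p) + damping * p
        alpha = r_dot_r / p.dot(f_p)
        x += alpha * p
        r -= alpha * f_p
        new_r_dot_r = r.dot(r)
        if new_r_dot_r < 1e-10:
            break
        p = r + (new_r_dot_r / r_dot_r) * p
        r_dot_r = new_r_dot_r
    return x

F = np.array([[2.0, 0.0], [0.0, 4.0]])
g = np.array([1.0, 1.0])
print(conjugate_gradient(lambda v: F.dot(v), g))  # ~[0.5 0.25], i.e. F^-1 g
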
Example #34
    def create_tf_operations(self, config):
        super(QModel, self).create_tf_operations(config)

        # Placeholders
        with tf.variable_scope('placeholder'):
            self.next_state = dict()
            for name, state in config.states.items():
                self.next_state[name] = tf.placeholder(
                    dtype=util.tf_dtype(state.type),
                    shape=(None, ) + tuple(state.shape),
                    name=name)

        network_builder = util.get_function(fct=config.network)

        # Training network
        with tf.variable_scope('training') as training_scope:
            self.training_network = NeuralNetwork(
                network_builder=network_builder, inputs=self.state)
            self.internal_inputs.extend(self.training_network.internal_inputs)
            self.internal_outputs.extend(
                self.training_network.internal_outputs)
            self.internal_inits.extend(self.training_network.internal_inits)
            self.q_values = self.create_training_operations(config)
            self.training_variables = tf.contrib.framework.get_variables(
                scope=training_scope)

        # Target network
        with tf.variable_scope('target') as target_scope:
            self.target_network = NeuralNetwork(
                network_builder=network_builder, inputs=self.next_state)
            self.internal_inputs.extend(self.target_network.internal_inputs)
            self.internal_outputs.extend(self.target_network.internal_outputs)
            self.internal_inits.extend(self.target_network.internal_inits)
            self.target_values = self.create_target_operations(config)
            self.target_variables = tf.contrib.framework.get_variables(
                scope=target_scope)

        with tf.name_scope('update'):
            deltas = list()
            terminal_float = tf.cast(x=self.terminal, dtype=tf.float32)
            for name, action in self.action.items():
                reward = self.reward
                terminal = terminal_float
                for _ in range(len(config.actions[name].shape)):
                    reward = tf.expand_dims(input=reward, axis=1)
                    terminal = tf.expand_dims(input=terminal, axis=1)
                q_target = reward + (
                    1.0 -
                    terminal) * config.discount * self.target_values[name]
                delta = tf.stop_gradient(q_target) - self.q_values[name]
                delta = tf.reshape(
                    tensor=delta,
                    shape=(-1, util.prod(config.actions[name].shape)))
                deltas.append(delta)

            # Average the deltas over the action components; the per-instance
            # loss is the squared (or Huber) delta computed below
            delta = tf.reduce_mean(
                input_tensor=tf.concat(values=deltas, axis=1),
                axis=1)
            self.loss_per_instance = tf.square(delta)

            # If loss clipping is used, calculate the Huber loss instead:
            # quadratic within config.clip_loss, linear beyond it
            if config.clip_loss > 0.0:
                huber_loss = tf.where(
                    condition=(tf.abs(delta) < config.clip_loss),
                    x=(0.5 * self.loss_per_instance),
                    y=(config.clip_loss * (tf.abs(delta) - 0.5 * config.clip_loss)))
                self.q_loss = tf.reduce_mean(input_tensor=huber_loss, axis=0)
            else:
                self.q_loss = tf.reduce_mean(
                    input_tensor=self.loss_per_instance, axis=0)
            tf.losses.add_loss(self.q_loss)

        # Update target network
        with tf.name_scope('update-target'):
            self.target_network_update = list()
            for v_source, v_target in zip(self.training_variables,
                                          self.target_variables):
                update = v_target.assign_sub(config.update_target_weight *
                                             (v_target - v_source))
                self.target_network_update.append(update)
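
The assign_sub in the final loop is a soft ("Polyak") target update. A short numeric illustration with hypothetical values, where tau stands in for config.update_target_weight:

import numpy as np

# v_target <- v_target - tau * (v_target - v_source): the target network
# drifts slowly toward the training network on every update.
v_source = np.array([1.0])
v_target = np.array([0.0])
tau = 0.01  # stands in for config.update_target_weight
v_target -= tau * (v_target - v_source)
print(v_target)  # [0.01]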