class SimpleQModel(Model):
    # Default config values
    default_config = {
        "alpha": 0.01,
        "gamma": 0.99,
        "network_layers": [{
            "type": "linear",
            "num_outputs": 16
        }]
    }

    def __init__(self, config, scope):
        """
        Initialize model, build network and tensorflow ops

        :param config: Config object or dict
        :param scope: tensorflow scope name
        """
        super(SimpleQModel, self).__init__(config, scope)
        self.action_count = self.config.actions

        self.random = np.random.RandomState()

        with tf.device(self.config.tf_device):
            # Create state placeholder
            # self.batch_shape is [None] (set in Model.__init__)
            self.state = tf.placeholder(tf.float32,
                                        self.batch_shape +
                                        list(self.config.state_shape),
                                        name="state")

            # Create neural network
            output_layer = [{
                "type": "linear",
                "num_outputs": self.action_count
            }]
            self.network = NeuralNetwork(self.config.network_layers +
                                         output_layer,
                                         self.state,
                                         scope=self.scope + "network")
            self.network_out = self.network.get_output()

            # Create optimizer first, since create_ops() builds the
            # minimize op from it
            self.optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=self.config.alpha)

            # Create operations
            self.create_ops()
            self.init_op = tf.global_variables_initializer()

    def get_action(self, state, episode=1):
        """
        Get action for a given state

        :param state: ndarray containing the state
        :param episode: episode number (for epsilon decay and the like)
        :return: action
        """

        # self.exploration is initialized in Model.__init__ and provides an API for
        # different exploration methods, such as epsilon greedy.
        epsilon = self.exploration(episode,
                                   self.total_states)  # returns a float

        if self.random.random_sample() < epsilon:
            action = self.random.randint(0, self.action_count)
        else:
            action = self.session.run(self.q_action, {self.state: [state]})[0]

        self.total_states += 1
        return action

    def update(self, batch):
        """
        Update model parameters

        :param batch: memory batch
        :return:
        """
        # Get Q values for next states
        next_q = self.session.run(self.network_out,
                                  {self.state: batch['next_states']})

        # Bellman backup: Q = r + gamma * max_a' Q'(s', a'), with no bootstrap on terminal states
        q_targets = batch['rewards'] + (1. - batch['terminals'].astype(float)) \
                                       * self.config.gamma * np.max(next_q, axis=1)

        self.session.run(
            self.optimize_op, {
                self.state: batch['states'],
                self.actions: batch['actions'],
                self.q_targets: q_targets
            })

    def initialize(self):
        """
        Initialize model variables
        :return:
        """
        self.session.run(self.init_op)

    def create_ops(self):
        """
        Create tensorflow ops

        :return:
        """
        with tf.name_scope(self.scope):
            with tf.name_scope("predict"):
                self.q_action = tf.argmax(self.network_out, axis=1)

            with tf.name_scope("update"):
                # These are the target Q values, i.e. the actual rewards plus the expected values of the next states
                # (Bellman equation).
                self.q_targets = tf.placeholder(tf.float32, [None],
                                                name='q_targets')

                # Actions that have been taken.
                self.actions = tf.placeholder(tf.int32, [None], name='actions')

                # We need the Q values of the current states to calculate the difference ("loss") between the
                # expected values and the new values (q targets). Therefore we do a forward-pass
                # and reduce the results to the actions that have been taken.

                # One_hot tensor of the actions that have been taken.
                actions_one_hot = tf.one_hot(self.actions,
                                             self.action_count,
                                             1.0,
                                             0.0,
                                             name='action_one_hot')

                # Training output, reduced to the actions that have been taken.
                q_values_actions_taken = tf.reduce_sum(self.network_out *
                                                       actions_one_hot,
                                                       axis=1,
                                                       name='q_acted')

                # The loss is the sum of squared differences between the q_targets and the predicted Q values.
                self.loss = tf.reduce_sum(
                    tf.square(self.q_targets - q_values_actions_taken))
                self.optimize_op = self.optimizer.minimize(self.loss)
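

# --- Standalone sketch (not part of SimpleQModel): the one-step Bellman backup
# --- used in SimpleQModel.update(), shown with hypothetical NumPy values.
import numpy as np

gamma = 0.99
rewards = np.array([1.0, 0.0])
terminals = np.array([False, True])
next_q = np.array([[0.2, 0.5],   # Q(s', a) for two actions per next state
                   [0.1, 0.3]])

# q_target = r + (1 - terminal) * gamma * max_a' Q(s', a')
q_targets = rewards + (1. - terminals.astype(float)) * gamma * np.max(next_q, axis=1)
# -> array([1.495, 0.]); terminal transitions keep only the immediate reward

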
class NAFModel(Model):
    default_config = NAFModelConfig

    def __init__(self, config, scope, define_network=None):
        """
        Training logic for NAFs.

        :param config: Configuration parameters
        """
        super(NAFModel, self).__init__(config, scope)
        self.action_count = self.config.actions
        self.tau = self.config.tau
        self.epsilon = self.config.epsilon
        self.gamma = self.config.gamma
        self.batch_size = self.config.batch_size

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.state = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape), name="state")
        self.next_states = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape),
                                          name="next_states")

        self.actions = tf.placeholder(tf.float32, [None, self.action_count], name='actions')
        self.terminals = tf.placeholder(tf.float32, [None], name='terminals')
        self.rewards = tf.placeholder(tf.float32, [None], name='rewards')
        self.q_targets = tf.placeholder(tf.float32, [None], name='q_targets')
        self.target_network_update = []
        self.episode = 0

        # Get hidden layers from network generator, then add NAF outputs, same for target network
        scope = '' if self.config.tf_scope is None else self.config.tf_scope + '-'

        if define_network is None:
            define_network = NeuralNetwork.layered_network(self.config.network_layers)

        self.training_model = NeuralNetwork(define_network, [self.state], scope=scope + 'training')
        self.target_model = NeuralNetwork(define_network, [self.next_states], scope=scope + 'target')

        # Create output fields
        self.training_v, self.mu, self.advantage, self.q, self.training_output_vars = self.create_outputs(
            self.training_model.get_output(), 'outputs_training')
        self.target_v, _, _, _, self.target_output_vars = self.create_outputs(self.target_model.get_output(),
                                                                              'outputs_target')
        self.create_training_operations()
        self.saver = tf.train.Saver()
        self.session.run(tf.global_variables_initializer())

    def get_action(self, state, episode=1):
        """
        Returns the NAF action given by the mean (mu) output of the network, plus exploration noise.

        :param state: Current state
        :param episode: Current episode
        :return: action
        """
        action = self.session.run(self.mu, {self.state: [state]})[0] + self.exploration(episode, self.total_states)
        self.total_states += 1

        return action

    def update(self, batch):
        """
        Executes a NAF update on a training batch.

        :param batch: Memory batch to train on
        :return:
        """
        float_terminals = batch['terminals'].astype(float)

        q_targets = batch['rewards'] + (1. - float_terminals) * self.gamma * np.squeeze(
            self.get_target_value_estimate(batch['next_states']))

        self.session.run([self.optimize_op, self.loss, self.training_v, self.advantage, self.q], {
            self.q_targets: q_targets,
            self.actions: batch['actions'],
            self.state: batch['states']})

    def create_outputs(self, last_hidden_layer, scope):
        """
        Creates NAF specific outputs.

        :param last_hidden_layer: Points to last hidden layer
        :param scope: TF name scope

        :return: Output variables and all TF variables created in this scope
        """

        with tf.name_scope(scope):
            # State-value function
            v = linear(last_hidden_layer, {'num_outputs': 1, 'weights_regularizer': self.config.weights_regularizer,
                                           'weights_regularizer_args': [self.config.weights_regularizer_args]}, scope + 'v')

            # Action outputs
            mu = linear(last_hidden_layer, {'num_outputs': self.action_count, 'weights_regularizer': self.config.weights_regularizer,
                                            'weights_regularizer_args': [self.config.weights_regularizer_args]}, scope + 'mu')

            # Advantage computation
            # Network outputs entries of lower triangular matrix L
            lower_triangular_size = int(self.action_count * (self.action_count + 1) / 2)
            l_entries = linear(last_hidden_layer, {'num_outputs': lower_triangular_size,
                                                   'weights_regularizer': self.config.weights_regularizer,
                                                   'weights_regularizer_args': [self.config.weights_regularizer_args]},
                               scope + 'l')

            # Iteratively construct the lower-triangular matrix L row by row
            l_rows = []
            offset = 0

            for i in xrange(self.action_count):
                # Diagonal elements are exponentiated so they stay positive; otherwise the gradient is often 0
                # Slice out lower-triangular entries from the flat representation via a moving offset

                diagonal = tf.exp(tf.slice(l_entries, (0, offset), (-1, 1)))

                n = self.action_count - i - 1
                # Slice out non-zero non-diagonal entries, - 1 because we already took the diagonal
                non_diagonal = tf.slice(l_entries, (0, offset + 1), (-1, n))

                # Fill up row with zeros
                row = tf.pad(tf.concat(axis=1, values=(diagonal, non_diagonal)), ((0, 0), (i, 0)))
                offset += (self.action_count - i)
                l_rows.append(row)

            # Stack rows to matrix
            l_matrix = tf.transpose(tf.stack(l_rows, axis=1), (0, 2, 1))
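            # Worked example (illustrative, action_count = 3): with flat entries
            # l_entries = [a, b, c, d, e, f] per batch element, the loop produces
            #   row 0 = [exp(a), b, c]
            #   row 1 = [0, exp(d), e]
            #   row 2 = [0, 0, exp(f)]
            # and the stack-and-transpose above turns this into the lower-triangular
            #   L = [[exp(a), 0,      0     ],
            #        [b,      exp(d), 0     ],
            #        [c,      e,      exp(f)]]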

            # P = LL^T
            p_matrix = tf.matmul(l_matrix, tf.transpose(l_matrix, (0, 2, 1)))

            # Need to adjust dimensions to multiply with P.
            action_diff = tf.expand_dims(self.actions - mu, -1)

            # A = -0.5 * (a - mu)^T P (a - mu)
            advantage = -0.5 * tf.matmul(tf.transpose(action_diff, [0, 2, 1]),
                                         tf.matmul(p_matrix, action_diff))
            advantage = tf.reshape(advantage, [-1, 1])

            with tf.name_scope('q_values'):
                # Q = A + V
                q_value = v + advantage

        # Get all variables under this scope for target network update
        return v, mu, advantage, q_value, get_variables(scope)

    def create_training_operations(self):
        """
        NAF update logic.
        """

        with tf.name_scope("update"):
            # MSE
            self.loss = tf.reduce_mean(tf.squared_difference(self.q_targets, tf.squeeze(self.q)),
                                       name='loss')
            self.optimize_op = self.optimizer.minimize(self.loss)

        with tf.name_scope("update_target"):
            # Combine hidden layer variables and output layer variables
            self.training_vars = self.training_model.get_variables() + self.training_output_vars
            self.target_vars = self.target_model.get_variables() + self.target_output_vars

            for v_source, v_target in zip(self.training_vars, self.target_vars):
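                # Soft (Polyak) update: v_target <- v_target - tau * (v_target - v_source)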
                update = v_target.assign_sub(self.tau * (v_target - v_source))

                self.target_network_update.append(update)

    def get_target_value_estimate(self, next_states):
        """
        Estimate of next state V value through target network.

        :param next_states:
        :return:
        """

        return self.session.run(self.target_v, {self.next_states: next_states})

    def update_target_network(self):
        """
        Updates target network.

        :return:
        """
        self.session.run(self.target_network_update)
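

# --- Standalone sketch (not part of NAFModel): the NAF advantage term built in
# --- create_outputs(), A(s, a) = -0.5 * (a - mu)^T P (a - mu) with P = L L^T,
# --- reproduced in NumPy for a single state with hypothetical values.
import numpy as np

# Lower-triangular L with an exponentiated (hence positive) diagonal
L = np.array([[np.exp(0.1), 0.0,         0.0],
              [0.4,         np.exp(0.2), 0.0],
              [-0.3,        0.7,         np.exp(0.0)]])
P = L.dot(L.T)                        # positive definite by construction

mu = np.array([0.5, -0.2, 0.1])       # mean action output of the network
a = np.array([0.6, 0.0, 0.0])         # action actually taken
diff = a - mu

advantage = -0.5 * diff.dot(P).dot(diff)   # always <= 0, equal to 0 at a == mu
v = 1.2                                    # state value from the V head
q = v + advantage                          # Q(s, a) = V(s) + A(s, a)

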
class DQNModel(Model):
    default_config = DQNModelConfig

    def __init__(self, config, scope, define_network=None):
        """
        Training logic for DQN.

        :param config: Configuration dict
        """
        super(DQNModel, self).__init__(config, scope)

        self.action_count = self.config.actions
        self.tau = self.config.tau
        self.gamma = self.config.gamma
        self.batch_size = self.config.batch_size

        self.double_dqn = self.config.double_dqn

        self.clip_value = None
        if self.config.clip_gradients:
            self.clip_value = self.config.clip_value

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.target_network_update = []

        # output layer
        output_layer_config = [{
            "type": "linear",
            "num_outputs": self.config.actions,
            "trainable": True
        }]

        self.device = self.config.tf_device
        if self.device == 'replica':
            self.device = tf.train.replica_device_setter(
                ps_tasks=1, worker_device=self.config.tf_worker_device)

        with tf.device(self.device):
            # Input placeholders
            self.state = tf.placeholder(tf.float32,
                                        self.batch_shape +
                                        list(self.config.state_shape),
                                        name="state")
            self.next_states = tf.placeholder(tf.float32,
                                              self.batch_shape +
                                              list(self.config.state_shape),
                                              name="next_states")
            self.terminals = tf.placeholder(tf.float32,
                                            self.batch_shape,
                                            name='terminals')
            self.rewards = tf.placeholder(tf.float32,
                                          self.batch_shape,
                                          name='rewards')

            if define_network is None:
                define_network = NeuralNetwork.layered_network(
                    self.config.network_layers + output_layer_config)

            self.training_model = NeuralNetwork(define_network, [self.state],
                                                scope=self.scope + 'training')
            self.target_model = NeuralNetwork(define_network,
                                              [self.next_states],
                                              scope=self.scope + 'target')

            self.training_output = self.training_model.get_output()
            self.target_output = self.target_model.get_output()

            # Create optimizer first, since create_training_operations()
            # builds the gradient ops from it
            self.optimizer = tf.train.RMSPropOptimizer(self.alpha,
                                                       momentum=0.95,
                                                       epsilon=0.01)
            self.create_training_operations()

        self.init_op = tf.global_variables_initializer()

        self.saver = tf.train.Saver()
        self.writer = tf.summary.FileWriter('logs',
                                            graph=tf.get_default_graph())

    def initialize(self):
        self.session.run(self.init_op)

    def get_action(self, state, episode=1):
        """
        Returns the predicted action for a given state.

        :param state: State tensor
        :param episode: Current episode
        :return: action number
        """

        epsilon = self.exploration(episode, self.total_states)

        if self.random.random_sample() < epsilon:
            action = self.random.randint(0, self.action_count)
        else:
            action = self.session.run(self.dqn_action,
                                      {self.state: [state]})[0]

        self.total_states += 1
        return action

    def update(self, batch):
        """
        Performs a single training step on the given batch (the target network is updated separately via update_target_network).

        :param batch: Mini batch to use for training
        :return: void
        """

        # Compute estimated future value
        float_terminals = batch['terminals'].astype(float)
        q_targets = batch['rewards'] + (1. - float_terminals) \
                                     * self.gamma * self.get_target_values(batch['next_states'])

        self.session.run(
            [self.optimize_op, self.training_output], {
                self.q_targets: q_targets,
                self.actions: batch['actions'],
                self.state: batch['states']
            })

    def get_variables(self):
        return self.training_model.get_variables()

    def assign_variables(self, values):
        assign_variables_ops = [
            variable.assign(value)
            for variable, value in zip(self.get_variables(), values)
        ]
        self.session.run(tf.group(assign_variables_ops))

    def get_gradients(self):
        return self.grads_and_vars

    def apply_gradients(self, grads_and_vars):
        apply_gradients_op = self.optimizer.apply_gradients(grads_and_vars)
        self.session.run(apply_gradients_op)

    def create_training_operations(self):
        """
        Create graph operations for loss computation and target network
        updates.

        :return:
        """
        with tf.name_scope(self.scope):
            with tf.name_scope("predict"):
                self.dqn_action = tf.argmax(self.training_output,
                                            axis=1,
                                            name='dqn_action')

            with tf.name_scope("targets"):
                if self.double_dqn:
                    selector = tf.one_hot(self.dqn_action,
                                          self.action_count,
                                          name='selector')
                    self.target_values = tf.reduce_sum(tf.multiply(
                        self.target_output, selector),
                                                       axis=1,
                                                       name='target_values')
                else:
                    self.target_values = tf.reduce_max(self.target_output,
                                                       axis=1,
                                                       name='target_values')

            with tf.name_scope("update"):
                # self.q_targets is fed the observed rewards plus the discounted expected future rewards
                self.q_targets = tf.placeholder(tf.float32, [None],
                                                name='q_targets')

                # self.actions is fed the actions that were actually taken
                self.actions = tf.placeholder(tf.int32, [None], name='actions')

                # One_hot tensor of the actions that have been taken
                actions_one_hot = tf.one_hot(self.actions,
                                             self.action_count,
                                             1.0,
                                             0.0,
                                             name='action_one_hot')

                # Training output, so we get the expected rewards given the actual states and actions
                q_values_actions_taken = tf.reduce_sum(self.training_output *
                                                       actions_one_hot,
                                                       axis=1,
                                                       name='q_acted')

                # TD error between the observed targets and the Q values of the taken actions
                delta = self.q_targets - q_values_actions_taken

                # If gradient clipping is used, compute the Huber loss:
                # 0.5 * delta^2 where |delta| < clip_value, |delta| - 0.5 otherwise,
                # which bounds the gradient magnitude for large errors
                if self.config.clip_gradients:
                    huber_loss = tf.where(
                        tf.abs(delta) < self.clip_value,
                        0.5 * tf.square(delta),
                        tf.abs(delta) - 0.5)
                    self.loss = tf.reduce_mean(huber_loss,
                                               name='compute_surrogate_loss')
                else:
                    self.loss = tf.reduce_mean(tf.square(delta),
                                               name='compute_surrogate_loss')

                self.grads_and_vars = self.optimizer.compute_gradients(
                    self.loss)
                self.optimize_op = self.optimizer.apply_gradients(
                    self.grads_and_vars)

            # Update target network with update weight tau
            with tf.name_scope("update_target"):
                for v_source, v_target in zip(
                        self.training_model.get_variables(),
                        self.target_model.get_variables()):
                    update = v_target.assign_sub(self.tau *
                                                 (v_target - v_source))
                    self.target_network_update.append(update)

    def get_target_values(self, next_states):
        """
        Estimate of next state Q values.
        :param next_states:
        :return:
        """
        if self.double_dqn:
            return self.session.run(self.target_values, {
                self.state: next_states,
                self.next_states: next_states
            })
        else:
            return self.session.run(self.target_values,
                                    {self.next_states: next_states})

    def update_target_network(self):
        """
        Updates target network.

        :return:
        """
        self.session.run(self.target_network_update)
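

# --- Standalone sketch (not part of DQNModel): how the "targets" scope above
# --- differs between standard DQN and double DQN, with hypothetical NumPy
# --- values for two next states and three actions.
import numpy as np

training_output = np.array([[1.0, 2.0, 0.5],    # online network Q(s', a)
                            [0.3, 0.1, 0.9]])
target_output = np.array([[0.8, 0.4, 1.7],      # target network Q(s', a)
                          [1.1, 0.2, 0.6]])

# Standard DQN: max over the target network's own estimates
standard_targets = np.max(target_output, axis=1)                 # -> [1.7, 1.1]

# Double DQN: the online network selects the action, the target network
# evaluates it, which reduces overestimation bias
selected_actions = np.argmax(training_output, axis=1)            # -> [1, 2]
double_targets = target_output[np.arange(2), selected_actions]   # -> [0.4, 0.6]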