Example #1
    def create_tf_operations(self, config):
        super(DQNModel, self).create_tf_operations(config)

        num_actions = {name: action.num_actions for name, action in config.actions}

        # Training network
        with tf.variable_scope('training'):
            self.training_network = NeuralNetwork(config.network, inputs=self.state)

            self.internal_inputs.extend(self.training_network.internal_inputs)
            self.internal_outputs.extend(self.training_network.internal_outputs)
            self.internal_inits.extend(self.training_network.internal_inits)
            training_output = dict()

            for action in self.action:
                training_output[action] = layers['linear'](x=self.training_network.output, size=num_actions[action])
                self.action_taken[action] = tf.argmax(training_output[action], axis=1)

        # Target network
        with tf.variable_scope('target'):
            self.target_network = NeuralNetwork(config.network, inputs=self.state)
            self.internal_inputs.extend(self.target_network.internal_inputs)
            self.internal_outputs.extend(self.target_network.internal_outputs)
            self.internal_inits.extend(self.target_network.internal_inits)
            target_value = dict()

            for action in self.action:
                target_output = layers['linear'](x=self.target_network.output, size=num_actions[action])
                if config.double_dqn:
                    selector = tf.one_hot(self.action_taken[action], num_actions[action])
                    target_value[action] = tf.reduce_sum(tf.multiply(target_output, selector), axis=1)
                else:
                    target_value[action] = tf.reduce_max(target_output, axis=1)

        with tf.name_scope('update'):
            for action in self.action:
                # One_hot tensor of the actions that have been taken
                action_one_hot = tf.one_hot(self.action[action][:-1], num_actions[action])
                # Training output, so we get the expected rewards given the actual states and actions
                q_value = tf.reduce_sum(training_output[action][:-1] * action_one_hot, axis=1)

                # Surrogate loss as the mean squared error between actual observed rewards and expected rewards
                q_target = self.reward[:-1] + (1.0 - tf.cast(self.terminal[:-1], tf.float32)) * self.discount * target_value[action][1:]
                delta = q_target - q_value

                # If gradient clipping is used, calculate the huber loss
                if config.clip_gradients > 0.0:
                    huber_loss = tf.where(tf.abs(delta) < config.clip_gradients, 0.5 * tf.square(delta), tf.abs(delta) - 0.5)
                    loss = tf.reduce_mean(huber_loss)
                else:
                    loss = tf.reduce_mean(tf.square(delta))
                tf.losses.add_loss(loss)

        # Update target network
        with tf.name_scope("update_target"):
            self.target_network_update = list()
            for v_source, v_target in zip(self.training_network.variables, self.target_network.variables):
                update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
                self.target_network_update.append(update)
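The final loop is a soft ("Polyak") target-network update: each target variable is moved a fraction update_target_weight toward its training counterpart via assign_sub. A minimal NumPy sketch of the same rule (function name and parameters are illustrative, not part of the library):

    import numpy as np

    def soft_update(target_params, source_params, tau):
        # Move each target parameter a fraction tau toward the corresponding source parameter.
        return [t - tau * (t - s) for t, s in zip(target_params, source_params)]

    # With a small tau (e.g. 0.001) the target network tracks the training network slowly,
    # which stabilises the bootstrapped Q-targets.
    target = [np.zeros((4, 2))]
    source = [np.ones((4, 2))]
    target = soft_update(target, source, tau=0.001)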
Example #2
    def create_tf_operations(self, config):
        super(QModel, self).create_tf_operations(config)

        # Placeholders
        with tf.variable_scope('placeholder'):
            self.next_state = dict()
            for name, state in config.states.items():
                self.next_state[name] = tf.placeholder(dtype=util.tf_dtype(state.type), shape=(None,) + tuple(state.shape), name=('next_' + name))

        network_builder = util.get_function(fct=config.network)

        # Training network
        with tf.variable_scope('training') as training_scope:
            self.training_network = NeuralNetwork(network_builder=network_builder, inputs=self.state, summary_level=config.tf_summary_level)
            self.network_internal_index = len(self.internal_inputs)
            self.internal_inputs.extend(self.training_network.internal_inputs)
            self.internal_outputs.extend(self.training_network.internal_outputs)
            self.internal_inits.extend(self.training_network.internal_inits)
            self.q_values = self.create_training_operations(config)
            self.training_variables = tf.contrib.framework.get_variables(scope=training_scope)

        # Target network
        with tf.variable_scope('target') as target_scope:
            self.target_network = NeuralNetwork(network_builder=network_builder, inputs=self.next_state)
            self.next_internal_inputs = list(self.target_network.internal_inputs)
            self.target_values = self.create_target_operations(config)
            self.target_variables = tf.contrib.framework.get_variables(scope=target_scope)

        with tf.name_scope('update'):
            deltas = self.create_q_deltas(config)

            # Surrogate loss as the mean squared error between actual observed rewards and expected rewards
            delta = tf.reduce_mean(input_tensor=tf.concat(values=deltas, axis=1), axis=1)
            self.loss_per_instance = tf.square(delta)

            # If loss clipping is used, calculate the huber loss
            if config.clip_loss > 0.0:
                huber_loss = tf.where(condition=(tf.abs(delta) < config.clip_loss), x=(0.5 * self.loss_per_instance),
                                      y=config.clip_loss * tf.abs(delta) - 0.5 * config.clip_loss ** 2)
                self.q_loss = tf.reduce_mean(input_tensor=huber_loss, axis=0)
            else:
                self.q_loss = tf.reduce_mean(input_tensor=self.loss_per_instance, axis=0)
            tf.losses.add_loss(self.q_loss)

        # for each loss over an action create a summary
        if len(self.q_loss.shape) > 1:
            for action_ind in range(self.q_loss.shape[1]):
                tf.summary.scalar('q-loss-action-{}'.format(action_ind), self.q_loss[action_ind])
        else:
            tf.summary.scalar('q-loss', self.q_loss)

        # Update target network
        with tf.name_scope('update-target'):
            self.target_network_update = list()
            for v_source, v_target in zip(self.training_variables, self.target_variables):
                update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
                self.target_network_update.append(update)
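The clipping branch in this example is the Huber loss: quadratic for errors smaller than clip_loss, linear beyond it. A standalone sketch of that piecewise definition (NumPy, names illustrative):

    import numpy as np

    def huber_loss(delta, clip):
        # 0.5 * delta^2 where |delta| < clip, otherwise clip * |delta| - 0.5 * clip^2.
        abs_delta = np.abs(delta)
        quadratic = 0.5 * np.square(delta)
        linear = clip * abs_delta - 0.5 * clip ** 2
        return np.where(abs_delta < clip, quadratic, linear)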
Example #3
    def create_tf_operations(self, config):
        if len(config.states) > 1:
            raise Exception()

        with tf.variable_scope('mlp_value_function'):
            self.state = tf.placeholder(
                dtype=tf.float32,
                shape=(None, util.prod(next(iter(config.states))[1].shape)))
            self.returns = tf.placeholder(dtype=tf.float32, shape=(None, ))

            network_builder = layered_network_builder(({
                'type': 'dense',
                'size': self.size
            }, {
                'type': 'dense',
                'size': 1
            }))

            network = NeuralNetwork(network_builder=network_builder,
                                    inputs=dict(state=self.state))

            self.prediction = network.output
            loss = tf.nn.l2_loss(self.prediction - self.returns)

            optimizer = tf.train.AdamOptimizer(
                learning_rate=config.learning_rate)
            self.optimize = optimizer.minimize(loss)
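Note that tf.nn.l2_loss(t) computes sum(t ** 2) / 2, i.e. half the summed (not averaged) squared error between the baseline's predictions and the observed returns. A NumPy equivalent for reference (names illustrative):

    import numpy as np

    def l2_loss(prediction, returns):
        # Same quantity as tf.nn.l2_loss(prediction - returns): half the sum of squared errors.
        return 0.5 * np.sum(np.square(prediction - returns))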
Example #4
    def create_tf_operations(self, config):
        if len(config.states) > 1:
            raise Exception()

        with tf.variable_scope('mlp_value_function'):
            self.state = tf.placeholder(
                dtype=tf.float32,
                shape=(None, util.prod(next(iter(config.states))[1].shape)))
            self.returns = tf.placeholder(dtype=tf.float32, shape=(None, ))
            self.updates = int(
                config.batch_size / self.update_batch_size) * self.epochs
            self.batch_size = config.batch_size

            layers = []
            for _ in xrange(self.hidden_layers):
                layers.append({'type': 'dense', 'size': self.size})
            layers.append({'type': 'linear', 'size': 1})

            network = NeuralNetwork(
                network_builder=layered_network_builder(layers),
                inputs=dict(state=self.state))

            self.prediction = network.output
            loss = tf.nn.l2_loss(self.prediction - self.returns)

            optimizer = tf.train.AdamOptimizer(
                learning_rate=config.learning_rate)

            self.optimize = optimizer.minimize(loss)
Example #5
    def create_tf_operations(self, state, batch_size, scope='cnn_baseline'):

        with tf.variable_scope(scope):
            self.state = tf.placeholder(dtype=tf.float32, shape=(None, util.prod(state.shape)))
            self.returns = tf.placeholder(dtype=tf.float32, shape=(None,))
            self.updates = int(batch_size / self.update_batch_size) * self.epochs
            self.batch_size = batch_size

            layers = []
            for size in self.sizes:
                layers.append({'type': 'conv2d', 'size': size, 'stride': 1, 'window': 3})

            # First layer has larger window
            layers[0]['window'] = 5

            # TODO append maxpooling
            layers.append({'type': 'linear', 'size': 1})

            network = NeuralNetwork(network_builder=layered_network_builder(layers),
                                    inputs=dict(state=self.state))

            self.prediction = network.output
            loss = tf.nn.l2_loss(self.prediction - self.returns)

            optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)

            self.optimize = optimizer.minimize(loss)
Example #6
    def create_tf_operations(self, state, scope='cnn_baseline'):
        with tf.variable_scope(scope) as scope:
            self.state = tf.placeholder(dtype=tf.float32, shape=(None,) + tuple(state.shape))
            self.returns = tf.placeholder(dtype=tf.float32, shape=(None,))

            layers = []
            for size in self.cnn_sizes:
                layers.append({'type': 'conv2d', 'size': size, 'stride': 1, 'window': 3})

            # First layer has larger window
            layers[0]['window'] = 5
            layers.append({'type': 'flatten'})
            for size in self.dense_sizes:
                layers.append({'type': 'dense', 'size': size})
            layers.append({'type': 'linear', 'size': 1})

            network = NeuralNetwork(network_builder=layered_network_builder(layers),
                                    inputs=dict(state=self.state))

            self.prediction = tf.squeeze(input=network.output, axis=1)
            loss = tf.nn.l2_loss(self.prediction - self.returns)

            optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)

            variables = tf.contrib.framework.get_variables(scope=scope)
            self.optimize = optimizer.minimize(loss, var_list=variables)
Example #7
    def create_tf_operations(self, state, batch_size, scope='mlp_baseline'):
        with tf.variable_scope(scope):
            self.state = tf.placeholder(dtype=tf.float32,
                                        shape=(None, util.prod(state.shape)))
            self.returns = tf.placeholder(dtype=tf.float32, shape=(None, ))
            self.updates = int(
                batch_size / self.update_batch_size) * self.epochs
            self.batch_size = batch_size

            layers = []
            for size in self.sizes:
                layers.append({'type': 'dense', 'size': size})

            layers.append({'type': 'linear', 'size': 1})

            network = NeuralNetwork(
                network_builder=layered_network_builder(layers),
                inputs=dict(state=self.state))

            self.prediction = network.output
            loss = tf.nn.l2_loss(self.prediction - self.returns)

            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate)

            self.optimize = optimizer.minimize(loss)
Example #8
    def create_tf_operations(self, config):
        super(PolicyGradientModel, self).create_tf_operations(config)

        with tf.variable_scope('value_function'):
            network_builder = util.get_function(fct=config.network)
            self.network = NeuralNetwork(network_builder=network_builder,
                                         inputs=self.state)
            self.internal_inputs.extend(self.network.internal_inputs)
            self.internal_outputs.extend(self.network.internal_outputs)
            self.internal_inits.extend(self.network.internal_inits)

        with tf.variable_scope('distribution'):
            for action, distribution in self.distribution.items():
                with tf.variable_scope(action):
                    distribution.create_tf_operations(
                        x=self.network.output,
                        deterministic=self.deterministic)
                self.action_taken[action] = distribution.sample()

        if self.baseline:
            with tf.variable_scope('baseline'):
                # Generate one baseline per state input, later average their predictions
                for name, state in config.states:
                    self.baseline[name].create_tf_operations(
                        state, scope='baseline_' + name)
Example #9
    def create_tf_operations(self, config):
        """
        Create tensorflow ops

        :return:
        """
        super(SimpleQModel, self).create_tf_operations(config)

        with tf.name_scope("simpleq"):

            self.network = NeuralNetwork(config.network, inputs=self.state)
            self.network_output = layers['linear'](x=self.network.output,
                                                   size=self.action_count)

            with tf.name_scope("predict"):
                self.q_action = tf.argmax(self.network_output, axis=1)

            with tf.name_scope("update"):
                # We need the Q values of the current states to calculate the difference ("loss") between the
                # expected values and the new values (q targets). Therefore we do a forward-pass
                # and reduce the results to the actions that have been taken.

                # One_hot tensor of the actions that have been taken.
                actions_one_hot = tf.one_hot(self.action['action'][:-1],
                                             self.action_count,
                                             1.0,
                                             0.0,
                                             name='action_one_hot')

                # Training output, reduced to the actions that have been taken.
                q_values_actions_taken = tf.reduce_sum(
                    self.network_output[:-1] * actions_one_hot,
                    axis=1,
                    name='q_acted')

                # Expected values for the next states
                q_output = tf.reduce_max(self.network_output[1:],
                                         axis=1,
                                         name='q_expected')

                # Bellman equation Q = r + y * Q'
                q_targets = self.reward[:-1] + (1. - tf.cast(self.terminal[:-1], tf.float32)) \
                                               * self.gamma * q_output

                # The loss is the difference between the q_targets and the expected q values.
                self.loss = tf.reduce_sum(
                    tf.square(q_targets - q_values_actions_taken))
                # self.optimize_op = self.optimizer.minimize(self.loss)

                tf.losses.add_loss(self.loss)
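The update block above combines a one-hot selection of Q(s,a) for the taken action with the Bellman target r + gamma * max_a Q(s',a), using the shifted slices [:-1] / [1:] so that each step's next-state values come from the following row of the same batch. A NumPy sketch of that computation (assuming a batch of T consecutive transitions; names illustrative):

    import numpy as np

    def q_taken_and_targets(q_all, actions, rewards, terminals, gamma):
        # q_all: (T, num_actions) Q-values for T consecutive states of one rollout.
        # actions: integer actions taken; terminals: 0/1 floats.
        num_actions = q_all.shape[1]
        one_hot = np.eye(num_actions)[actions[:-1]]        # actions taken at steps 0..T-2
        q_taken = np.sum(q_all[:-1] * one_hot, axis=1)     # Q(s_t, a_t)
        q_next = np.max(q_all[1:], axis=1)                 # max_a Q(s_{t+1}, a)
        targets = rewards[:-1] + (1.0 - terminals[:-1]) * gamma * q_next
        return q_taken, targets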
Example #10
    def create_tf_operations(self, config):
        super(PolicyGradientModel, self).create_tf_operations(config)

        with tf.variable_scope('value_function'):
            self.network = NeuralNetwork(config.network, inputs=self.state)
            self.internal_inputs.extend(self.network.internal_inputs)
            self.internal_outputs.extend(self.network.internal_outputs)
            self.internal_inits.extend(self.network.internal_inits)

        with tf.variable_scope('distribution'):
            for action, distribution in self.distribution.items():
                distribution.create_tf_operations(x=self.network.output,
                                                  sample=config.sample_actions)
                self.action_taken[action] = distribution.value

        if self.baseline:
            with tf.variable_scope('baseline'):
                self.baseline.create_tf_operations(config)
Example #11
    def create_tf_operations(self, state, scope='mlp_baseline'):
        with tf.variable_scope(scope) as scope:
            self.state = tf.placeholder(dtype=tf.float32,
                                        shape=(None, util.prod(state.shape)))
            self.returns = tf.placeholder(dtype=tf.float32, shape=(None, ))

            layers = []
            for size in self.sizes:
                layers.append({'type': 'dense', 'size': size})

            layers.append({'type': 'linear', 'size': 1})

            network = NeuralNetwork(
                network_builder=layered_network_builder(layers),
                inputs=dict(state=self.state))

            self.prediction = tf.squeeze(input=network.output, axis=1)
            loss = tf.nn.l2_loss(self.prediction - self.returns)

            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate)

            variables = tf.contrib.framework.get_variables(scope=scope)
            self.optimize = optimizer.minimize(loss, var_list=variables)
Example #12
    def create_tf_operations(self, config):
        super(CategoricalDQNModel, self).create_tf_operations(config)

        # Placeholders
        with tf.variable_scope('placeholder'):
            self.next_state = dict()
            for name, state in config.states.items():
                self.next_state[name] = tf.placeholder(
                    dtype=util.tf_dtype(state.type),
                    shape=(None, ) + tuple(state.shape),
                    name=name)

        # setup constants delta_z and z. z represents the discretized scaling over vmin -> vmax
        scaling_increment = (self.distribution_max - self.distribution_min) / (
            self.num_atoms - 1)  # delta_z in the paper
        quantized_steps = self.distribution_min + np.arange(
            self.num_atoms) * scaling_increment  # z in the paper

        num_actions = {
            name: action.num_actions
            for name, action in config.actions
        }

        # creating networks
        network_builder = util.get_function(fct=config.network)

        # Training network
        with tf.variable_scope('training') as training_scope:
            self.training_network = NeuralNetwork(
                network_builder=network_builder,
                inputs=self.state,
                summary_level=config.tf_summary_level)
            self.network_internal_index = len(self.internal_inputs)
            self.internal_inputs.extend(self.training_network.internal_inputs)
            self.internal_outputs.extend(
                self.training_network.internal_outputs)
            self.internal_inits.extend(self.training_network.internal_inits)
            training_output_logits, training_output_probabilities, training_qval, action_taken = self._create_action_outputs(
                self.training_network.output, quantized_steps, self.num_atoms,
                config, self.action, num_actions)
            # stack to preserve action_taken shape like (batch_size, num_actions)
            for action in self.action:
                if len(action_taken[action]) > 1:
                    self.action_taken[action] = tf.stack(action_taken[action],
                                                         axis=1)
                else:
                    self.action_taken[action] = action_taken[action][0]

                # summarize expected reward histogram
                if config.tf_summary_level >= 1:
                    for action_shaped in range(len(action_taken[action])):
                        for action_ind in range(num_actions[action]):
                            tf.summary.histogram(
                                '{}-{}-{}-output-distribution'.format(
                                    action, action_shaped, action_ind),
                                training_output_probabilities[action]
                                [action_shaped][:, action_ind] *
                                quantized_steps)

            self.training_variables = tf.contrib.framework.get_variables(
                scope=training_scope)

        # Target network
        with tf.variable_scope('target') as target_scope:
            self.target_network = NeuralNetwork(
                network_builder=network_builder, inputs=self.next_state)
            self.next_internal_inputs = list(
                self.target_network.internal_inputs)
            _, target_output_probabilities, target_qval, target_action = self._create_action_outputs(
                self.target_network.output, quantized_steps, self.num_atoms,
                config, self.action, num_actions)

            self.target_variables = tf.contrib.framework.get_variables(
                scope=target_scope)

        with tf.name_scope('update'):
            # broadcast rewards and discounted quantization. Shape (batchsize, num_atoms). T_z_j in the paper
            reward = tf.expand_dims(self.reward, axis=1)
            terminal = tf.expand_dims(tf.cast(x=self.terminal,
                                              dtype=tf.float32),
                                      axis=1)
            broadcasted_rewards = reward + (1.0 - terminal) * (
                quantized_steps * self.discount)
            # clip into distribution_min, distribution_max
            quantized_discounted_reward = tf.clip_by_value(
                broadcasted_rewards, self.distribution_min,
                self.distribution_max)
            # compute quantization indices. b, l, u in the paper
            closest_quantization = (quantized_discounted_reward -
                                    self.distribution_min) / scaling_increment
            lower_ind = tf.floor(closest_quantization)
            upper_ind = tf.ceil(closest_quantization)

            # create shared selections for later use
            dynamic_batch_size = tf.shape(self.reward)[0]
            batch_selection = tf.range(0, dynamic_batch_size)
            # tile expects a tensor of same shape, we are just repeating the selection num_atoms times across the last dimension
            batch_tiled_selection = tf.reshape(
                tf.tile(tf.reshape(batch_selection, (-1, 1)),
                        [1, self.num_atoms]), [-1])
            # combine with lower and upper ind, same as zip(flatten(batch_tiled_selection), flatten(lower_ind))
            # also cast to int32 to use as index
            batch_lower_inds = tf.stack(
                (batch_tiled_selection,
                 tf.reshape(tf.cast(lower_ind, tf.int32), [-1])),
                axis=1)
            batch_upper_inds = tf.stack(
                (batch_tiled_selection,
                 tf.reshape(tf.cast(upper_ind, tf.int32), [-1])),
                axis=1)

            # create loss for each action
            for action in self.action:
                # if shape of action != () we need to process each action head separately
                for action_ind in range(
                        max([util.prod(config.actions[action].shape), 1])):
                    # project onto the supports
                    # tensorflow indexing is still not great, we stack these two and use gather_nd later
                    target_batch_action_selection = tf.stack(
                        (batch_selection, target_action[action][action_ind]),
                        axis=1)

                    # distribute probability scaled by distance
                    # in numpy the equivalent is target_output_probabilities[action][batch_selection, target_action]
                    target_probabilities_of_action = tf.gather_nd(
                        target_output_probabilities[action][action_ind],
                        target_batch_action_selection)
                    distance_lower = target_probabilities_of_action * (
                        closest_quantization - lower_ind)
                    distance_upper = target_probabilities_of_action * (
                        upper_ind - closest_quantization)

                    # sum distances aligned into quantized bins. m in the paper
                    # scatter_nd actually sums the values into a zeros tensor instead of overwriting
                    # this is pretty much a huge hack refer to https://github.com/tensorflow/tensorflow/issues/8102
                    target_quantized_probabilities_lower = tf.scatter_nd(
                        batch_lower_inds, tf.reshape(distance_lower, [-1]),
                        (dynamic_batch_size, self.num_atoms))
                    target_quantized_probabilities_upper = tf.scatter_nd(
                        batch_upper_inds, tf.reshape(distance_upper, [-1]),
                        (dynamic_batch_size, self.num_atoms))
                    # no gradient should flow back to the target network
                    target_quantized_probabilities = tf.stop_gradient(
                        target_quantized_probabilities_lower +
                        target_quantized_probabilities_upper)

                    # we must check if input action has shape
                    if len(self.action[action].shape) > 1:
                        input_action = self.action[action][:, action_ind]
                    else:
                        input_action = self.action[action]
                    # Now that we have the target probabilities, the loss is the categorical
                    # cross-entropy against the probabilities of the actions actually taken
                    training_action_selection = tf.stack(
                        (batch_selection, input_action), axis=1)
                    probabilities_for_action = tf.gather_nd(
                        training_output_probabilities[action][action_ind],
                        training_action_selection)
                    self.loss_per_instance = -tf.reduce_sum(
                        target_quantized_probabilities *
                        tf.log(probabilities_for_action + util.epsilon),
                        axis=-1)
                    loss = tf.reduce_mean(self.loss_per_instance)
                    tf.losses.add_loss(loss)

                    tf.summary.scalar(
                        'cce-loss-{}-{}'.format(action, action_ind), loss)

        # Update target network
        with tf.name_scope("update_target"):
            self.target_network_update = list()
            for v_source, v_target in zip(self.training_variables,
                                          self.target_variables):
                update = v_target.assign_sub(config.update_target_weight *
                                             (v_target - v_source))
                self.target_network_update.append(update)
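The update block in this example is the distribution projection step of the C51 paper: the shifted support r + gamma * z is clipped to [v_min, v_max] and its probability mass is split between the two neighbouring atoms in proportion to distance. A minimal NumPy sketch of that projection (following Algorithm 1 of the paper rather than the exact scatter_nd layout above; names illustrative):

    import numpy as np

    def project_distribution(next_probs, rewards, terminals, v_min, v_max, discount):
        # next_probs: (batch, num_atoms) probabilities of the greedy next action.
        # rewards: (batch,); terminals: (batch,) 0/1 floats.
        batch, num_atoms = next_probs.shape
        delta_z = (v_max - v_min) / (num_atoms - 1)
        z = v_min + np.arange(num_atoms) * delta_z
        # T_z_j in the paper, clipped into the support
        tz = np.clip(rewards[:, None] + (1.0 - terminals[:, None]) * discount * z, v_min, v_max)
        b = (tz - v_min) / delta_z                       # fractional atom index
        lower = np.floor(b).astype(int)
        upper = np.ceil(b).astype(int)
        m = np.zeros((batch, num_atoms))
        for i in range(batch):
            for j in range(num_atoms):
                if lower[i, j] == upper[i, j]:           # b landed exactly on an atom
                    m[i, lower[i, j]] += next_probs[i, j]
                else:
                    m[i, lower[i, j]] += next_probs[i, j] * (upper[i, j] - b[i, j])
                    m[i, upper[i, j]] += next_probs[i, j] * (b[i, j] - lower[i, j])
        return m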
Example #13
    def create_tf_operations(self, config):
        super(NAFModel, self).create_tf_operations(config)
        num_actions = sum(
            util.prod(config.actions[name].shape)
            for name in sorted(self.action))

        # Get hidden layers from network generator, then add NAF outputs, same for target network
        with tf.variable_scope('training'):
            network_builder = util.get_function(fct=config.network)
            self.training_network = NeuralNetwork(
                network_builder=network_builder, inputs=self.state)
            self.internal_inputs.extend(self.training_network.internal_inputs)
            self.internal_outputs.extend(
                self.training_network.internal_outputs)
            self.internal_inits.extend(self.training_network.internal_inits)

        with tf.variable_scope('training_outputs') as scope:
            # Action outputs
            flat_mean = layers['linear'](x=self.training_network.output,
                                         size=num_actions)
            n = 0
            for name in sorted(self.action):
                shape = config.actions[name].shape
                self.action_taken[name] = tf.reshape(
                    tensor=flat_mean[:, n:n + util.prod(shape)],
                    shape=((-1, ) + shape))
                n += util.prod(shape)

            # Advantage computation
            # Network outputs entries of lower triangular matrix L
            lower_triangular_size = num_actions * (num_actions + 1) // 2
            l_entries = layers['linear'](x=self.training_network.output,
                                         size=lower_triangular_size)

            l_matrix = tf.exp(
                x=tf.map_fn(fn=tf.diag, elems=l_entries[:, :num_actions]))

            if num_actions > 1:
                offset = num_actions
                l_columns = list()
                for zeros, size in enumerate(xrange(num_actions - 1, -1, -1),
                                             1):
                    column = tf.pad(tensor=l_entries[:, offset:offset + size],
                                    paddings=((0, 0), (zeros, 0)))
                    l_columns.append(column)
                    offset += size
                l_matrix += tf.stack(values=l_columns, axis=1)

            # P = LL^T
            p_matrix = tf.matmul(a=l_matrix,
                                 b=tf.transpose(a=l_matrix, perm=(0, 2, 1)))

            flat_action = list()
            for name in sorted(self.action):
                shape = config.actions[name].shape
                flat_action.append(
                    tf.reshape(tensor=self.action[name],
                               shape=(-1, util.prod(shape))))
            flat_action = tf.concat(values=flat_action, axis=1)
            difference = flat_action - flat_mean

            # A = -0.5 (a - mean)^T P (a - mean)
            advantage = tf.matmul(a=p_matrix,
                                  b=tf.expand_dims(input=difference, axis=2))
            advantage = tf.matmul(a=tf.expand_dims(input=difference, axis=1),
                                  b=advantage)
            advantage = tf.squeeze(input=(-advantage / 2.0), axis=2)

            # Q = A + V
            # State-value function
            value = layers['linear'](x=self.training_network.output,
                                     size=num_actions)
            q_value = value + advantage
            training_output_vars = tf.contrib.framework.get_variables(
                scope=scope)

        with tf.variable_scope('target'):
            network_builder = util.get_function(fct=config.network)
            self.target_network = NeuralNetwork(
                network_builder=network_builder, inputs=self.state)
            self.internal_inputs.extend(self.target_network.internal_inputs)
            self.internal_outputs.extend(self.target_network.internal_outputs)
            self.internal_inits.extend(self.target_network.internal_inits)

        with tf.variable_scope('target_outputs') as scope:
            # State-value function
            target_value = layers['linear'](x=self.target_network.output,
                                            size=num_actions)
            target_output_vars = tf.contrib.framework.get_variables(
                scope=scope)

        with tf.name_scope('update'):
            reward = tf.expand_dims(input=self.reward[:-1], axis=1)
            terminal = tf.expand_dims(input=tf.cast(x=self.terminal[:-1],
                                                    dtype=tf.float32),
                                      axis=1)
            q_target = reward + (1.0 -
                                 terminal) * config.discount * target_value[1:]
            delta = q_target - q_value[:-1]
            delta = tf.reduce_mean(input_tensor=delta, axis=1)
            self.loss_per_instance = tf.square(x=delta)

            # We observe issues with numerical stability in some tests, gradient clipping can help
            if config.clip_gradients > 0.0:
                huber_loss = tf.where(
                    condition=(tf.abs(delta) < config.clip_gradients),
                    x=(0.5 * self.loss_per_instance),
                    y=(tf.abs(delta) - 0.5))
                loss = tf.reduce_mean(input_tensor=huber_loss, axis=0)
            else:
                loss = tf.reduce_mean(input_tensor=self.loss_per_instance,
                                      axis=0)
            tf.losses.add_loss(loss)

        with tf.name_scope('update_target'):
            # Combine hidden layer variables and output layer variables
            training_vars = self.training_network.variables + training_output_vars
            target_vars = self.target_network.variables + target_output_vars

            self.target_network_update = list()
            for v_source, v_target in zip(training_vars, target_vars):
                update = v_target.assign_sub(config.update_target_weight *
                                             (v_target - v_source))
                self.target_network_update.append(update)
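The training_outputs scope above reconstructs a lower-triangular matrix L from a flat network output (diagonal entries exponentiated so that P = L L^T stays positive definite) and uses it for the quadratic advantage A = -0.5 (a - mean)^T P (a - mean). A per-sample NumPy sketch of the same construction (the ordering of the flat off-diagonal entries is illustrative and may differ from the column layout used above):

    import numpy as np

    def naf_advantage(flat_l_entries, mean, action):
        # flat_l_entries: num_actions diagonal entries followed by the strictly lower-triangular ones.
        num_actions = mean.shape[0]
        L = np.zeros((num_actions, num_actions))
        L[np.diag_indices(num_actions)] = np.exp(flat_l_entries[:num_actions])
        rows, cols = np.tril_indices(num_actions, k=-1)
        L[rows, cols] = flat_l_entries[num_actions:]
        P = L @ L.T                                     # positive definite by construction
        diff = action - mean
        return -0.5 * diff @ P @ diff                   # quadratic advantage, maximal at action = mean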
Example #14
    def create_tf_operations(self, config):
        super(DQNModel, self).create_tf_operations(config)

        flat_action_sizes = {
            name: util.prod(action.shape) * action.num_actions
            for name, action in config.actions
        }
        action_shapes = {
            name: (-1, ) + action.shape + (action.num_actions, )
            for name, action in config.actions
        }

        # Training network
        with tf.variable_scope('training'):
            network_builder = util.get_function(fct=config.network)
            self.training_network = NeuralNetwork(
                network_builder=network_builder, inputs=self.state)
            self.internal_inputs.extend(self.training_network.internal_inputs)
            self.internal_outputs.extend(
                self.training_network.internal_outputs)
            self.internal_inits.extend(self.training_network.internal_inits)

            self.training_output = dict()
            for action in self.action:
                output = layers['linear'](x=self.training_network.output,
                                          size=flat_action_sizes[action])
                self.training_output[action] = tf.reshape(
                    tensor=output, shape=action_shapes[action])
                self.action_taken[action] = tf.argmax(
                    self.training_output[action], axis=-1)

        # Target network
        with tf.variable_scope('target'):
            network_builder = util.get_function(fct=config.network)
            self.target_network = NeuralNetwork(
                network_builder=network_builder, inputs=self.state)
            self.internal_inputs.extend(self.target_network.internal_inputs)
            self.internal_outputs.extend(self.target_network.internal_outputs)
            self.internal_inits.extend(self.target_network.internal_inits)

            target_value = dict()
            for action in self.action:
                output = layers['linear'](x=self.target_network.output,
                                          size=flat_action_sizes[action])
                output = tf.reshape(tensor=output, shape=action_shapes[action])
                if config.double_dqn:
                    selector = tf.one_hot(indices=self.action_taken[action],
                                          depth=action_shapes[action][1])
                    target_value[action] = tf.reduce_sum(
                        input_tensor=(output * selector), axis=-1)
                else:
                    target_value[action] = tf.reduce_max(input_tensor=output,
                                                         axis=-1)

        with tf.name_scope('update'):
            self.actions_one_hot = dict()
            self.q_values = dict()
            deltas = list()
            for action in self.action:
                # One_hot tensor of the actions that have been taken
                self.actions_one_hot[action] = tf.one_hot(
                    indices=self.action[action][:-1],
                    depth=config.actions[action].num_actions)

                # Training output, so we get the expected rewards given the actual states and actions
                self.q_values[action] = tf.reduce_sum(
                    input_tensor=(self.training_output[action][:-1] *
                                  self.actions_one_hot[action]),
                    axis=-1)

                reward = self.reward[:-1]
                terminal = tf.cast(x=self.terminal[:-1], dtype=tf.float32)
                for _ in range(len(config.actions[action].shape)):
                    reward = tf.expand_dims(input=reward, axis=1)
                    terminal = tf.expand_dims(input=terminal, axis=1)

                # Surrogate loss as the mean squared error between actual observed rewards and expected rewards
                q_target = reward + (
                    1.0 -
                    terminal) * config.discount * target_value[action][1:]
                delta = q_target - self.q_values[action]

                ds_list = [delta]
                for _ in range(len(config.actions[action].shape)):
                    ds_list = [
                        d for ds in ds_list
                        for d in tf.unstack(value=ds, axis=1)
                    ]
                deltas.extend(ds_list)

            delta = tf.add_n(inputs=deltas) / len(deltas)
            self.loss_per_instance = tf.square(delta)

            # If loss clipping is used, calculate the huber loss
            if config.clip_loss > 0.0:
                huber_loss = tf.where(
                    condition=(tf.abs(delta) < config.clip_loss),
                    x=(0.5 * self.loss_per_instance),
                    y=(tf.abs(delta) - 0.5))
                loss = tf.reduce_mean(input_tensor=huber_loss, axis=0)
            else:
                loss = tf.reduce_mean(input_tensor=self.loss_per_instance,
                                      axis=0)
            self.dqn_loss = loss
            tf.losses.add_loss(loss)

        # Update target network
        with tf.name_scope('update_target'):
            self.target_network_update = list()
            for v_source, v_target in zip(self.training_network.variables,
                                          self.target_network.variables):
                update = v_target.assign_sub(config.update_target_weight *
                                             (v_target - v_source))
                self.target_network_update.append(update)
Example #15
    def create_tf_operations(self, config):
        """Create training graph. For DQFD, we build the double-dqn training graph and
        modify the double_q_loss function according to eq. 5
        
        Args:
            config: Config dict.

        Returns:

        """
        super(DQFDModel, self).create_tf_operations(config)

        num_actions = {
            name: action.num_actions
            for name, action in config.actions
        }

        # placeholders
        with tf.variable_scope('placeholders'):
            self.q_targets = tf.placeholder(tf.float32, (None, ),
                                            name='q_targets')

        # Training network
        with tf.variable_scope('training'):
            self.training_network = NeuralNetwork(
                config.network,
                inputs={name: state
                        for name, state in self.state.items()})
            self.internal_inputs.extend(self.training_network.internal_inputs)
            self.internal_outputs.extend(
                self.training_network.internal_outputs)
            self.internal_inits.extend(self.training_network.internal_inits)

            training_output = dict()

            for action in self.action:
                training_output[action] = layers['linear'](
                    x=self.training_network.output, size=num_actions[action])
                self.action_taken[action] = tf.argmax(training_output[action],
                                                      axis=1)

        # Target network
        with tf.variable_scope('target'):
            self.target_network = NeuralNetwork(
                config.network,
                inputs={name: state
                        for name, state in self.state.items()})
            self.internal_inputs.extend(self.target_network.internal_inputs)
            self.internal_outputs.extend(self.target_network.internal_outputs)
            self.internal_inits.extend(self.target_network.internal_inits)

            target_value = dict()

            for action in self.action:
                target_output = layers['linear'](x=self.target_network.output,
                                                 size=num_actions[action])
                selector = tf.one_hot(self.action_taken[action],
                                      num_actions[action])
                target_value[action] = tf.reduce_sum(tf.multiply(
                    target_output, selector),
                                                     axis=1)

        with tf.name_scope("update"):
            self.dqfd_opt = []

            for action in self.action:
                # Self.q_targets gets fed the actual observed rewards and expected future rewards
                # One_hot tensor of the actions that have been taken
                action_one_hot = tf.one_hot(self.action[action][:-1],
                                            num_actions[action])

                # Training output, so we get the expected rewards given the actual states and actions
                q_value = tf.reduce_sum(training_output[action][:-1] *
                                        action_one_hot,
                                        axis=1)

                # Surrogate loss as the mean squared error between actual observed rewards and expected rewards
                q_target = self.reward[:-1] + (
                    1.0 - tf.cast(self.terminal[:-1], tf.float32)
                ) * self.discount * target_value[action][1:]
                delta = q_target - q_value
                self.loss_per_instance = tf.square(delta)

                # If gradient clipping is used, calculate the huber loss
                if config.clip_gradients > 0.0:
                    huber_loss = tf.where(
                        tf.abs(delta) < config.clip_gradients,
                        0.5 * self.loss_per_instance,
                        tf.abs(delta) - 0.5)
                    double_q_loss = tf.reduce_mean(huber_loss)
                else:
                    double_q_loss = tf.reduce_mean(self.loss_per_instance)

                # Use the existing loss structure from the model here, then compute dqfd loss separately
                tf.losses.add_loss(double_q_loss)

                # Create the supervised margin loss
                mask = tf.ones_like(action_one_hot, dtype=tf.float32)

                # Zero for the action taken, one for all other actions, now multiply by expert margin
                inverted_one_hot = mask - action_one_hot

                # max_a[Q(s,a) + l(a_E, a)], where l(a_E, a) is 0 for the expert action and the margin value otherwise
                expert_margin = training_output[action][:-1] + tf.multiply(
                    inverted_one_hot, config.expert_margin)

                supervised_selector = tf.reduce_max(
                    expert_margin, axis=1, name='expert_margin_selector')

                # J_E(Q) = max_a[Q(s,a) + l(a_E, a)] - Q(s, a_E)
                supervised_loss = supervised_selector - q_value

                # Combining double q loss with supervised loss
                dqfd_loss = double_q_loss + tf.multiply(
                    tf.reduce_mean(supervised_loss), config.supervised_weight)

                # This decomposition is not necessary, we just want to be able to export gradients
                dqfd_grads_and_vars = self.optimizer.compute_gradients(
                    dqfd_loss)

                self.dqfd_opt.append(
                    self.optimizer.apply_gradients(dqfd_grads_and_vars))

        # Update target network according to update weight
        self.target_network_update = []

        with tf.name_scope("update_target"):
            for v_source, v_target in zip(self.training_network.variables,
                                          self.target_network.variables):
                update = v_target.assign_sub(config.update_target_weight *
                                             (v_target - v_source))
                self.target_network_update.append(update)
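The supervised part of the DQFD update is the large-margin classification loss J_E(Q) = max_a[Q(s,a) + l(a_E, a)] - Q(s, a_E), which pushes the demonstrated action's Q-value above all others by at least the expert margin. A NumPy sketch of that term on a batch of demonstration transitions (names illustrative):

    import numpy as np

    def dqfd_margin_loss(q_values, expert_actions, margin):
        # q_values: (batch, num_actions); expert_actions: (batch,) integer demonstration actions.
        num_actions = q_values.shape[1]
        one_hot = np.eye(num_actions)[expert_actions]
        # margin added to every non-expert action, zero for the expert action
        augmented = q_values + (1.0 - one_hot) * margin
        q_expert = np.sum(q_values * one_hot, axis=1)
        # Non-negative; zero only when the expert action beats all others by at least the margin.
        return np.max(augmented, axis=1) - q_expert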
Example #16
    def create_tf_operations(self, config):
        super(NAFModel, self).create_tf_operations(config)

        # Get hidden layers from network generator, then add NAF outputs, same for target network
        with tf.variable_scope('training'):
            self.training_network = NeuralNetwork(config.network,
                                                  inputs=self.state)
            self.internal_inputs.extend(self.training_network.internal_inputs)
            self.internal_outputs.extend(
                self.training_network.internal_outputs)
            self.internal_inits.extend(self.training_network.internal_inits)

        with tf.variable_scope('training_outputs'):
            num_actions = len(self.action)
            # Action outputs
            mean = layers['linear'](x=self.training_network.output,
                                    size=num_actions)
            for n, action in enumerate(sorted(self.action)):
                # mean = tf.Print(mean,[mean])
                self.action_taken[action] = mean[n]

            # Advantage computation
            # Network outputs entries of lower triangular matrix L
            lower_triangular_size = num_actions * (num_actions + 1) // 2
            l_entries = layers['linear'](x=self.training_network.output,
                                         size=lower_triangular_size)

            l_matrix = tf.exp(tf.map_fn(tf.diag, l_entries[:, :num_actions]))

            if num_actions > 1:
                offset = num_actions
                l_columns = list()
                for zeros, size in enumerate(xrange(num_actions - 1, 0, -1),
                                             1):
                    column = tf.pad(l_entries[:, offset:offset + size],
                                    ((0, 0), (zeros, 0)))
                    l_columns.append(column)
                    offset += size
                l_matrix += tf.stack(l_columns, 1)

            # P = LL^T
            p_matrix = tf.matmul(l_matrix, tf.transpose(l_matrix, (0, 2, 1)))
            # p_matrix = tf.Print(p_matrix, [p_matrix])

            # l_rows = []
            # offset = 0
            # for i in xrange(num_actions):
            #     # Diagonal elements are exponentiated, otherwise gradient often 0
            #     # Slice out lower triangular entries from flat representation through moving offset
            #     diagonal = tf.exp(l_entries[:, offset])  # tf.slice(l_entries, (0, offset), (-1, 1))
            #     n = config.actions - i - 1
            #     # Slice out non-zero non-diagonal entries, - 1 because we already took the diagonal
            #     non_diagonal = l_entries[:, offset + 1: offset + n + 1]  # tf.slice(l_entries, (0, offset + 1), (-1, n))
            #     # Fill up row with zeros
            #     row = tf.pad(tf.concat(axis=1, values=(diagonal, non_diagonal)), ((0, 0), (i, 0)))
            #     offset += (num_actions - i)
            #     l_rows.append(row)
            #
            # # Stack rows to matrix
            # l_matrix = tf.transpose(tf.stack(l_rows, axis=1), (0, 2, 1))

            actions = tf.stack(
                values=[self.action[name] for name in sorted(self.action)],
                axis=1)
            action_diff = actions - mean

            # A = -0.5 (a - mean)^T P (a - mean)
            advantage = -tf.matmul(
                tf.expand_dims(action_diff, 1),
                tf.matmul(p_matrix, tf.expand_dims(action_diff, 2))) / 2
            advantage = tf.squeeze(advantage, 2)

            # Q = A + V
            # State-value function
            value = layers['linear'](x=self.training_network.output, size=1)
            q_value = tf.squeeze(value + advantage, 1)
            training_output_vars = get_variables('training_outputs')

        with tf.variable_scope('target'):
            self.target_network = NeuralNetwork(config.network,
                                                inputs=self.state)
            self.internal_inputs.extend(self.target_network.internal_inputs)
            self.internal_outputs.extend(self.target_network.internal_outputs)
            self.internal_inits.extend(self.target_network.internal_inits)
            target_value = dict()

        with tf.variable_scope('target_outputs'):
            # State-value function
            target_value_output = layers['linear'](
                x=self.target_network.output, size=1)
            for action in self.action:
                # NAF directly outputs V(s)
                target_value[action] = target_value_output

            target_output_vars = get_variables('target_outputs')

        with tf.name_scope("update"):
            for action in self.action:
                q_target = self.reward[:-1] + (1.0 - tf.cast(self.terminal[:-1], tf.float32)) * config.discount\
                                              * target_value[action][1:]
                delta = q_target - q_value[:-1]

                # We observe issues with numerical stability in some tests, gradient clipping can help
                if config.clip_gradients > 0.0:
                    huber_loss = tf.where(
                        tf.abs(delta) < config.clip_gradients,
                        tf.multiply(tf.square(delta), 0.5),
                        tf.abs(delta) - 0.5)
                    loss = tf.reduce_mean(huber_loss)
                else:
                    loss = tf.reduce_mean(tf.square(delta))
                # loss = tf.Print(loss, [loss])
                tf.losses.add_loss(loss)

        with tf.name_scope("update_target"):
            # Combine hidden layer variables and output layer variables
            training_vars = self.training_network.variables + training_output_vars
            target_vars = self.target_network.variables + target_output_vars

            self.target_network_update = list()
            for v_source, v_target in zip(training_vars, target_vars):
                update = v_target.assign_sub(config.update_target_weight *
                                             (v_target - v_source))
                self.target_network_update.append(update)
Example #17
    def create_tf_operations(self, config):
        super(QModel, self).create_tf_operations(config)

        # Placeholders
        with tf.variable_scope('placeholder'):
            self.next_state = dict()
            for name, state in config.states.items():
                self.next_state[name] = tf.placeholder(
                    dtype=util.tf_dtype(state.type),
                    shape=(None, ) + tuple(state.shape),
                    name=name)

        network_builder = util.get_function(fct=config.network)

        # Training network
        with tf.variable_scope('training') as training_scope:
            self.training_network = NeuralNetwork(
                network_builder=network_builder, inputs=self.state)
            self.internal_inputs.extend(self.training_network.internal_inputs)
            self.internal_outputs.extend(
                self.training_network.internal_outputs)
            self.internal_inits.extend(self.training_network.internal_inits)
            self.q_values = self.create_training_operations(config)
            self.training_variables = tf.contrib.framework.get_variables(
                scope=training_scope)

        # Target network
        with tf.variable_scope('target') as target_scope:
            self.target_network = NeuralNetwork(
                network_builder=network_builder, inputs=self.next_state)
            self.internal_inputs.extend(self.target_network.internal_inputs)
            self.internal_outputs.extend(self.target_network.internal_outputs)
            self.internal_inits.extend(self.target_network.internal_inits)
            self.target_values = self.create_target_operations(config)
            self.target_variables = tf.contrib.framework.get_variables(
                scope=target_scope)

        with tf.name_scope('update'):
            deltas = list()
            terminal_float = tf.cast(x=self.terminal, dtype=tf.float32)
            for name, action in self.action.items():
                reward = self.reward
                terminal = terminal_float
                for _ in range(len(config.actions[name].shape)):
                    reward = tf.expand_dims(input=reward, axis=1)
                    terminal = tf.expand_dims(input=terminal, axis=1)
                q_target = reward + (
                    1.0 -
                    terminal) * config.discount * self.target_values[name]
                delta = tf.stop_gradient(q_target) - self.q_values[name]
                delta = tf.reshape(
                    tensor=delta,
                    shape=(-1, util.prod(config.actions[name].shape)))
                deltas.append(delta)

            # Surrogate loss as the mean squared error between actual observed rewards and expected rewards
            delta = tf.reduce_mean(input_tensor=tf.concat(values=deltas,
                                                          axis=1),
                                   axis=1)
            self.loss_per_instance = tf.square(delta)

            # If loss clipping is used, calculate the huber loss
            if config.clip_loss > 0.0:
                huber_loss = tf.where(
                    condition=(tf.abs(delta) < config.clip_loss),
                    x=(0.5 * self.loss_per_instance),
                    y=(tf.abs(delta) - 0.5))
                self.q_loss = tf.reduce_mean(input_tensor=huber_loss, axis=0)
            else:
                self.q_loss = tf.reduce_mean(
                    input_tensor=self.loss_per_instance, axis=0)
            tf.losses.add_loss(self.q_loss)

        # Update target network
        with tf.name_scope('update-target'):
            self.target_network_update = list()
            for v_source, v_target in zip(self.training_variables,
                                          self.target_variables):
                update = v_target.assign_sub(config.update_target_weight *
                                             (v_target - v_source))
                self.target_network_update.append(update)