def create_tf_operations(self, config):
        super(PolicyGradientModel, self).create_tf_operations(config)

        with tf.variable_scope('value_function'):
            network_builder = util.get_function(fct=config.network)
            self.network = NeuralNetwork(network_builder=network_builder,
                                         inputs=self.state)
            self.internal_inputs.extend(self.network.internal_inputs)
            self.internal_outputs.extend(self.network.internal_outputs)
            self.internal_inits.extend(self.network.internal_inits)

        with tf.variable_scope('distribution'):
            for action, distribution in self.distribution.items():
                with tf.variable_scope(action):
                    distribution.create_tf_operations(
                        x=self.network.output,
                        deterministic=self.deterministic)
                self.action_taken[action] = distribution.sample()

        if self.baseline:
            with tf.variable_scope('baseline'):
                # Generate one baseline per state input, later average their predictions
                for name, state in config.states.items():
                    self.baseline[name].create_tf_operations(
                        state, scope='baseline_' + name)
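
The excerpt above builds one distribution per action on top of the shared network output and either samples from it or acts greedily. Below is a minimal NumPy sketch of that sampling step for categorical actions; the names logits and deterministic are placeholders mirroring the network output and the self.deterministic flag, not part of the original code.

import numpy as np

def sample_actions(logits, deterministic=False):
    """Pick one discrete action per batch row from raw logits."""
    # Softmax over the action dimension
    shifted = logits - logits.max(axis=1, keepdims=True)
    probs = np.exp(shifted) / np.exp(shifted).sum(axis=1, keepdims=True)
    if deterministic:
        # Deterministic mode: greedy action
        return probs.argmax(axis=1)
    # Stochastic mode: sample one action index per row according to its probabilities
    return np.array([np.random.choice(len(p), p=p) for p in probs])

# Batch of two states, three discrete actions
print(sample_actions(np.array([[2.0, 0.5, 0.1], [0.0, 0.0, 5.0]]), deterministic=True))  # [0 2]
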
Example #2
    def create_tf_operations(self, config):
        super(QModel, self).create_tf_operations(config)

        # Placeholders
        with tf.variable_scope('placeholder'):
            self.next_state = dict()
            for name, state in config.states.items():
                self.next_state[name] = tf.placeholder(dtype=util.tf_dtype(state.type), shape=(None,) + tuple(state.shape), name=('next_' + name))

        network_builder = util.get_function(fct=config.network)

        # Training network
        with tf.variable_scope('training') as training_scope:
            self.training_network = NeuralNetwork(network_builder=network_builder, inputs=self.state, summary_level=config.tf_summary_level)
            self.network_internal_index = len(self.internal_inputs)
            self.internal_inputs.extend(self.training_network.internal_inputs)
            self.internal_outputs.extend(self.training_network.internal_outputs)
            self.internal_inits.extend(self.training_network.internal_inits)
            self.q_values = self.create_training_operations(config)
            self.training_variables = tf.contrib.framework.get_variables(scope=training_scope)

        # Target network
        with tf.variable_scope('target') as target_scope:
            self.target_network = NeuralNetwork(network_builder=network_builder, inputs=self.next_state)
            self.next_internal_inputs = list(self.target_network.internal_inputs)
            self.target_values = self.create_target_operations(config)
            self.target_variables = tf.contrib.framework.get_variables(scope=target_scope)

        with tf.name_scope('update'):
            deltas = self.create_q_deltas(config)

            # Surrogate loss as the mean squared error between actual observed rewards and expected rewards
            delta = tf.reduce_mean(input_tensor=tf.concat(values=deltas, axis=1), axis=1)
            self.loss_per_instance = tf.square(delta)

            # If loss clipping is used, calculate the huber loss
            if config.clip_loss > 0.0:
                huber_loss = tf.where(condition=(tf.abs(delta) < config.clip_loss), x=(0.5 * self.loss_per_instance),
                                      y=config.clip_loss * tf.abs(delta) - 0.5 * config.clip_loss ** 2)
                self.q_loss = tf.reduce_mean(input_tensor=huber_loss, axis=0)
            else:
                self.q_loss = tf.reduce_mean(input_tensor=self.loss_per_instance, axis=0)
            tf.losses.add_loss(self.q_loss)

        # Create a scalar summary for each per-action loss
        if len(self.q_loss.shape) > 1:
            for action_ind in range(self.q_loss.shape[1]):
                tf.summary.scalar('q-loss-action-{}'.format(action_ind), self.q_loss[action_ind])
        else:
            tf.summary.scalar('q-loss', self.q_loss)

        # Update target network
        with tf.name_scope('update-target'):
            self.target_network_update = list()
            for v_source, v_target in zip(self.training_variables, self.target_variables):
                update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
                self.target_network_update.append(update)
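
The update-target block above is a soft (Polyak) update: v_target <- v_target - tau * (v_target - v_source), i.e. a convex mix of target and training weights. A minimal TF 1.x graph-mode sketch of the same pattern on two toy variables; tau stands in for config.update_target_weight and is an assumed name.

import tensorflow as tf  # TF 1.x graph-mode API, as in the models above

tau = 0.01  # stands in for config.update_target_weight
source = tf.Variable([1.0, 2.0], name='training_var')
target = tf.Variable([0.0, 0.0], name='target_var')

# v_target <- v_target - tau * (v_target - v_source) == (1 - tau) * v_target + tau * v_source
update_op = target.assign_sub(tau * (target - source))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(update_op)
    print(sess.run(target))  # ~[0.01 0.02]
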
Example #3
    def create_tf_operations(self, config):
        super(CategoricalDQNModel, self).create_tf_operations(config)

        # Placeholders
        with tf.variable_scope('placeholder'):
            self.next_state = dict()
            for name, state in config.states.items():
                self.next_state[name] = tf.placeholder(
                    dtype=util.tf_dtype(state.type),
                    shape=(None, ) + tuple(state.shape),
                    name=name)

        # Set up the constants delta_z and z; z is the discretized support over [v_min, v_max]
        scaling_increment = (self.distribution_max - self.distribution_min) / (
            self.num_atoms - 1)  # delta_z in the paper
        quantized_steps = self.distribution_min + np.arange(
            self.num_atoms) * scaling_increment  # z in the paper

        num_actions = {
            name: action.num_actions
            for name, action in config.actions.items()
        }

        # creating networks
        network_builder = util.get_function(fct=config.network)

        # Training network
        with tf.variable_scope('training') as training_scope:
            self.training_network = NeuralNetwork(
                network_builder=network_builder,
                inputs=self.state,
                summary_level=config.tf_summary_level)
            self.network_internal_index = len(self.internal_inputs)
            self.internal_inputs.extend(self.training_network.internal_inputs)
            self.internal_outputs.extend(
                self.training_network.internal_outputs)
            self.internal_inits.extend(self.training_network.internal_inits)
            training_output_logits, training_output_probabilities, training_qval, action_taken = self._create_action_outputs(
                self.training_network.output, quantized_steps, self.num_atoms,
                config, self.action, num_actions)
            # stack to preserve action_taken shape like (batch_size, num_actions)
            for action in self.action:
                if len(action_taken[action]) > 1:
                    self.action_taken[action] = tf.stack(action_taken[action],
                                                         axis=1)
                else:
                    self.action_taken[action] = action_taken[action][0]

                # summarize expected reward histogram
                if config.tf_summary_level >= 1:
                    for action_shaped in range(len(action_taken[action])):
                        for action_ind in range(num_actions[action]):
                            tf.summary.histogram(
                                '{}-{}-{}-output-distribution'.format(
                                    action, action_shaped, action_ind),
                                training_output_probabilities[action]
                                [action_shaped][:, action_ind] *
                                quantized_steps)

            self.training_variables = tf.contrib.framework.get_variables(
                scope=training_scope)

        # Target network
        with tf.variable_scope('target') as target_scope:
            self.target_network = NeuralNetwork(
                network_builder=network_builder, inputs=self.next_state)
            self.next_internal_inputs = list(
                self.target_network.internal_inputs)
            _, target_output_probabilities, target_qval, target_action = self._create_action_outputs(
                self.target_network.output, quantized_steps, self.num_atoms,
                config, self.action, num_actions)

            self.target_variables = tf.contrib.framework.get_variables(
                scope=target_scope)

        with tf.name_scope('update'):
            # broadcast rewards and discounted quantization. Shape (batchsize, num_atoms). T_z_j in the paper
            reward = tf.expand_dims(self.reward, axis=1)
            terminal = tf.expand_dims(tf.cast(x=self.terminal,
                                              dtype=tf.float32),
                                      axis=1)
            broadcasted_rewards = reward + (1.0 - terminal) * (
                quantized_steps * self.discount)
            # clip into distribution_min, distribution_max
            quantized_discounted_reward = tf.clip_by_value(
                broadcasted_rewards, self.distribution_min,
                self.distribution_max)
            # compute quantization indices: b, l, u in the paper
            closest_quantization = (quantized_discounted_reward -
                                    self.distribution_min) / scaling_increment
            lower_ind = tf.floor(closest_quantization)
            upper_ind = tf.ceil(closest_quantization)

            # create shared selections for later use
            dynamic_batch_size = tf.shape(self.reward)[0]
            batch_selection = tf.range(0, dynamic_batch_size)
            # tile repeats each batch index num_atoms times; the reshape then flattens the result
            batch_tiled_selection = tf.reshape(
                tf.tile(tf.reshape(batch_selection, (-1, 1)),
                        [1, self.num_atoms]), [-1])
            # combine with lower and upper ind, same as zip(flatten(batch_tiled_selection), flatten(lower_ind))
            # also cast to int32 to use as index
            batch_lower_inds = tf.stack(
                (batch_tiled_selection,
                 tf.reshape(tf.cast(lower_ind, tf.int32), [-1])),
                axis=1)
            batch_upper_inds = tf.stack(
                (batch_tiled_selection,
                 tf.reshape(tf.cast(upper_ind, tf.int32), [-1])),
                axis=1)

            # create loss for each action
            for action in self.action:
                # if shape of action != () we need to process each action head separately
                for action_ind in range(
                        max([util.prod(config.actions[action].shape), 1])):
                    # project onto the supports
                    # tensorflow indexing is still not great, we stack these two and use gather_nd later
                    target_batch_action_selection = tf.stack(
                        (batch_selection, target_action[action][action_ind]),
                        axis=1)

                    # distribute probability scaled by distance
                    # in numpy the equivalent is target_output_probabilities[action][batch_selection, target_action]
                    target_probabilities_of_action = tf.gather_nd(
                        target_output_probabilities[action][action_ind],
                        target_batch_action_selection)
                    # mass assigned to the lower atom is weighted by (u - b),
                    # mass assigned to the upper atom by (b - l)
                    distance_lower = target_probabilities_of_action * (
                        upper_ind - closest_quantization)
                    distance_upper = target_probabilities_of_action * (
                        closest_quantization - lower_ind)

                    # sum distances aligned into quantized bins. m in the paper
                    # scatter_nd sums values at duplicate indices into a zeros tensor instead of
                    # overwriting, which we rely on here; see https://github.com/tensorflow/tensorflow/issues/8102
                    target_quantized_probabilities_lower = tf.scatter_nd(
                        batch_lower_inds, tf.reshape(distance_lower, [-1]),
                        (dynamic_batch_size, self.num_atoms))
                    target_quantized_probabilities_upper = tf.scatter_nd(
                        batch_upper_inds, tf.reshape(distance_upper, [-1]),
                        (dynamic_batch_size, self.num_atoms))
                    # no gradient should flow back to the target network
                    target_quantized_probabilities = tf.stop_gradient(
                        target_quantized_probabilities_lower +
                        target_quantized_probabilities_upper)

                    # If the action has a non-scalar shape, select the current component
                    if len(self.action[action].shape) > 1:
                        input_action = self.action[action][:, action_ind]
                    else:
                        input_action = self.action[action]
                    # With the target probabilities fixed, the loss is the categorical cross-entropy
                    # against the predicted probabilities of the actions actually taken
                    training_action_selection = tf.stack(
                        (batch_selection, input_action), axis=1)
                    probabilities_for_action = tf.gather_nd(
                        training_output_probabilities[action][action_ind],
                        training_action_selection)
                    self.loss_per_instance = -tf.reduce_sum(
                        target_quantized_probabilities *
                        tf.log(probabilities_for_action + util.epsilon),
                        axis=-1)
                    loss = tf.reduce_mean(self.loss_per_instance)
                    tf.losses.add_loss(loss)

                    tf.summary.scalar(
                        'cce-loss-{}-{}'.format(action, action_ind), loss)

        # Update target network
        with tf.name_scope("update_target"):
            self.target_network_update = list()
            for v_source, v_target in zip(self.training_variables,
                                          self.target_variables):
                update = v_target.assign_sub(config.update_target_weight *
                                             (v_target - v_source))
                self.target_network_update.append(update)
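
The update block above is the categorical (C51) projection: each discounted atom T_z_j is clipped to [v_min, v_max], its fractional position b is split between the neighbouring atoms l and u, and the target probability mass is distributed between them by distance before the cross-entropy loss. A NumPy sketch of that projection for one action head; the inputs rewards, terminals and target_probs are hypothetical stand-ins for the tensors used above.

import numpy as np

def project_distribution(rewards, terminals, target_probs,
                         v_min, v_max, num_atoms, discount):
    """Project the discounted target distribution onto the fixed support (C51).

    rewards, terminals: shape (batch,); target_probs: shape (batch, num_atoms),
    the target network's probabilities for the greedy action.
    """
    delta_z = (v_max - v_min) / (num_atoms - 1)      # scaling_increment above
    z = v_min + np.arange(num_atoms) * delta_z       # quantized_steps above

    # T_z_j: broadcast rewards over the discounted support, then clip to the support range
    t_z = rewards[:, None] + (1.0 - terminals[:, None].astype(float)) * discount * z
    t_z = np.clip(t_z, v_min, v_max)

    b = (t_z - v_min) / delta_z                      # closest_quantization above
    lower = np.floor(b).astype(int)
    upper = np.ceil(b).astype(int)

    projected = np.zeros_like(target_probs)
    for i in range(rewards.shape[0]):
        for j in range(num_atoms):
            # Mass to the lower atom is weighted by (u - b), to the upper atom by (b - l).
            # Note: if b lands exactly on an atom (lower == upper), both weights are zero;
            # the TF code above shares this edge case.
            projected[i, lower[i, j]] += target_probs[i, j] * (upper[i, j] - b[i, j])
            projected[i, upper[i, j]] += target_probs[i, j] * (b[i, j] - lower[i, j])
    return projected
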
Example #4
    def create_tf_operations(self, config):
        super(NAFModel, self).create_tf_operations(config)
        num_actions = sum(
            util.prod(config.actions[name].shape)
            for name in sorted(self.action))

        # Get hidden layers from network generator, then add NAF outputs, same for target network
        with tf.variable_scope('training'):
            network_builder = util.get_function(fct=config.network)
            self.training_network = NeuralNetwork(
                network_builder=network_builder, inputs=self.state)
            self.internal_inputs.extend(self.training_network.internal_inputs)
            self.internal_outputs.extend(
                self.training_network.internal_outputs)
            self.internal_inits.extend(self.training_network.internal_inits)

        with tf.variable_scope('training_outputs') as scope:
            # Action outputs
            flat_mean = layers['linear'](x=self.training_network.output,
                                         size=num_actions)
            n = 0
            for name in sorted(self.action):
                shape = config.actions[name].shape
                self.action_taken[name] = tf.reshape(
                    tensor=flat_mean[:, n:n + util.prod(shape)],
                    shape=((-1, ) + shape))
                n += util.prod(shape)

            # Advantage computation
            # Network outputs entries of lower triangular matrix L
            lower_triangular_size = num_actions * (num_actions + 1) // 2
            l_entries = layers['linear'](x=self.training_network.output,
                                         size=lower_triangular_size)

            l_matrix = tf.exp(
                x=tf.map_fn(fn=tf.diag, elems=l_entries[:, :num_actions]))

            if num_actions > 1:
                offset = num_actions
                l_columns = list()
                for zeros, size in enumerate(xrange(num_actions - 1, -1, -1),
                                             1):
                    column = tf.pad(tensor=l_entries[:, offset:offset + size],
                                    paddings=((0, 0), (zeros, 0)))
                    l_columns.append(column)
                    offset += size
                l_matrix += tf.stack(values=l_columns, axis=1)

            # P = LL^T
            p_matrix = tf.matmul(a=l_matrix,
                                 b=tf.transpose(a=l_matrix, perm=(0, 2, 1)))

            flat_action = list()
            for name in sorted(self.action):
                shape = config.actions[name].shape
                flat_action.append(
                    tf.reshape(tensor=self.action[name],
                               shape=(-1, util.prod(shape))))
            flat_action = tf.concat(values=flat_action, axis=1)
            difference = flat_action - flat_mean

            # A = -0.5 (a - mean)^T P (a - mean)
            advantage = tf.matmul(a=p_matrix,
                                  b=tf.expand_dims(input=difference, axis=2))
            advantage = tf.matmul(a=tf.expand_dims(input=difference, axis=1),
                                  b=advantage)
            advantage = tf.squeeze(input=(-advantage / 2.0), axis=2)

            # Q = A + V
            # State-value function
            value = layers['linear'](x=self.training_network.output,
                                     size=num_actions)
            q_value = value + advantage
            training_output_vars = tf.contrib.framework.get_variables(
                scope=scope)

        with tf.variable_scope('target'):
            network_builder = util.get_function(fct=config.network)
            self.target_network = NeuralNetwork(
                network_builder=network_builder, inputs=self.state)
            self.internal_inputs.extend(self.target_network.internal_inputs)
            self.internal_outputs.extend(self.target_network.internal_outputs)
            self.internal_inits.extend(self.target_network.internal_inits)

        with tf.variable_scope('target_outputs') as scope:
            # State-value function
            target_value = layers['linear'](x=self.target_network.output,
                                            size=num_actions)
            target_output_vars = tf.contrib.framework.get_variables(
                scope=scope)

        with tf.name_scope('update'):
            reward = tf.expand_dims(input=self.reward[:-1], axis=1)
            terminal = tf.expand_dims(input=tf.cast(x=self.terminal[:-1],
                                                    dtype=tf.float32),
                                      axis=1)
            q_target = reward + (1.0 -
                                 terminal) * config.discount * target_value[1:]
            delta = q_target - q_value[:-1]
            delta = tf.reduce_mean(input_tensor=delta, axis=1)
            self.loss_per_instance = tf.square(x=delta)

            # We observe issues with numerical stability in some tests, gradient clipping can help
            if config.clip_gradients > 0.0:
                huber_loss = tf.where(
                    condition=(tf.abs(delta) < config.clip_gradients),
                    x=(0.5 * self.loss_per_instance),
                    y=(tf.abs(delta) - 0.5))
                loss = tf.reduce_mean(input_tensor=huber_loss, axis=0)
            else:
                loss = tf.reduce_mean(input_tensor=self.loss_per_instance,
                                      axis=0)
            tf.losses.add_loss(loss)

        with tf.name_scope('update_target'):
            # Combine hidden layer variables and output layer variables
            training_vars = self.training_network.variables + training_output_vars
            target_vars = self.target_network.variables + target_output_vars

            self.target_network_update = list()
            for v_source, v_target in zip(training_vars, target_vars):
                update = v_target.assign_sub(config.update_target_weight *
                                             (v_target - v_source))
                self.target_network_update.append(update)
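
The training_outputs scope above rebuilds a lower-triangular matrix L from the flat linear-layer output (exponentiated diagonal plus strictly lower entries) and forms P = L L^T, so the advantage -0.5 (a - mean)^T P (a - mean) is never positive. A cleaned-up NumPy sketch of that construction for a single sample; the ordering of the strictly lower entries is an assumption, not taken from the code above.

import numpy as np

def build_p_matrix(l_entries, num_actions):
    """Build P = L L^T from a flat parameter vector for one sample.

    The first num_actions entries parameterize the diagonal of L (exponentiated so it
    stays positive); the remaining num_actions * (num_actions - 1) // 2 entries fill
    the strictly lower triangle.
    """
    L = np.zeros((num_actions, num_actions))
    np.fill_diagonal(L, np.exp(l_entries[:num_actions]))
    rows, cols = np.tril_indices(num_actions, k=-1)
    L[rows, cols] = l_entries[num_actions:]
    return L @ L.T  # symmetric positive definite by construction

# Three actions: 3 diagonal entries + 3 strictly lower entries
P = build_p_matrix(np.array([0.1, -0.2, 0.3, 0.5, -0.4, 0.2]), num_actions=3)
print(np.all(np.linalg.eigvalsh(P) > 0))  # True
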
Example #5
    def create_tf_operations(self, config):
        super(QModel, self).create_tf_operations(config)

        # Placeholders
        with tf.variable_scope('placeholder'):
            self.next_state = dict()
            for name, state in config.states.items():
                self.next_state[name] = tf.placeholder(
                    dtype=util.tf_dtype(state.type),
                    shape=(None, ) + tuple(state.shape),
                    name=name)

        network_builder = util.get_function(fct=config.network)

        # Training network
        with tf.variable_scope('training') as training_scope:
            self.training_network = NeuralNetwork(
                network_builder=network_builder, inputs=self.state)
            self.internal_inputs.extend(self.training_network.internal_inputs)
            self.internal_outputs.extend(
                self.training_network.internal_outputs)
            self.internal_inits.extend(self.training_network.internal_inits)
            self.q_values = self.create_training_operations(config)
            self.training_variables = tf.contrib.framework.get_variables(
                scope=training_scope)

        # Target network
        with tf.variable_scope('target') as target_scope:
            self.target_network = NeuralNetwork(
                network_builder=network_builder, inputs=self.next_state)
            self.internal_inputs.extend(self.target_network.internal_inputs)
            self.internal_outputs.extend(self.target_network.internal_outputs)
            self.internal_inits.extend(self.target_network.internal_inits)
            self.target_values = self.create_target_operations(config)
            self.target_variables = tf.contrib.framework.get_variables(
                scope=target_scope)

        with tf.name_scope('update'):
            deltas = list()
            terminal_float = tf.cast(x=self.terminal, dtype=tf.float32)
            for name, action in self.action.items():
                reward = self.reward
                terminal = terminal_float
                for _ in range(len(config.actions[name].shape)):
                    reward = tf.expand_dims(input=reward, axis=1)
                    terminal = tf.expand_dims(input=terminal, axis=1)
                q_target = reward + (
                    1.0 -
                    terminal) * config.discount * self.target_values[name]
                delta = tf.stop_gradient(q_target) - self.q_values[name]
                delta = tf.reshape(
                    tensor=delta,
                    shape=(-1, util.prod(config.actions[name].shape)))
                deltas.append(delta)

            # Surrogate loss as the mean squared error between actual observed rewards and expected rewards
            delta = tf.reduce_mean(input_tensor=tf.concat(values=deltas,
                                                          axis=1),
                                   axis=1)
            self.loss_per_instance = tf.square(delta)

            # If loss clipping is used, calculate the Huber loss
            if config.clip_loss > 0.0:
                huber_loss = tf.where(
                    condition=(tf.abs(delta) < config.clip_loss),
                    x=(0.5 * self.loss_per_instance),
                    y=(tf.abs(delta) - 0.5))
                self.q_loss = tf.reduce_mean(input_tensor=huber_loss, axis=0)
            else:
                self.q_loss = tf.reduce_mean(
                    input_tensor=self.loss_per_instance, axis=0)
            tf.losses.add_loss(self.q_loss)

        # Update target network
        with tf.name_scope('update-target'):
            self.target_network_update = list()
            for v_source, v_target in zip(self.training_variables,
                                          self.target_variables):
                update = v_target.assign_sub(config.update_target_weight *
                                             (v_target - v_source))
                self.target_network_update.append(update)
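
The update block above computes the one-step TD target r + (1 - terminal) * gamma * Q_target, blocks gradients through it with tf.stop_gradient, and optionally swaps the squared error for a clipped, Huber-style loss. A NumPy sketch of the per-instance loss; the argument clip stands in for config.clip_loss and all inputs are hypothetical.

import numpy as np

def q_loss(q_values, target_values, rewards, terminals, discount, clip=0.0):
    """One-step TD error and (optionally clipped) loss."""
    # Terminal transitions do not bootstrap from the target network
    q_target = rewards + (1.0 - terminals.astype(float)) * discount * target_values
    delta = q_target - q_values          # gradients are stopped through q_target in the TF code
    loss_per_instance = np.square(delta)
    if clip > 0.0:
        # Quadratic inside the clipping region, linear outside
        loss_per_instance = np.where(np.abs(delta) < clip,
                                     0.5 * loss_per_instance,
                                     np.abs(delta) - 0.5)
    return delta, loss_per_instance.mean()

delta, loss = q_loss(q_values=np.array([1.0, 3.0]),
                     target_values=np.array([2.0, 0.0]),
                     rewards=np.array([1.0, -1.0]),
                     terminals=np.array([False, True]),
                     discount=0.99, clip=1.0)
print(delta, loss)
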
Example #6
    def create_tf_operations(self, config):
        super(DQNModel, self).create_tf_operations(config)

        num_actions = {name: action.num_actions for name, action in config.actions.items()}

        # Training network
        with tf.variable_scope('training'):
            network_builder = util.get_function(fct=config.network)
            self.training_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
            self.internal_inputs.extend(self.training_network.internal_inputs)
            self.internal_outputs.extend(self.training_network.internal_outputs)
            self.internal_inits.extend(self.training_network.internal_inits)
            training_output = dict()

            for action in self.action:
                training_output[action] = layers['linear'](x=self.training_network.output, size=num_actions[action])
                self.action_taken[action] = tf.argmax(training_output[action], axis=1)

        # Target network
        with tf.variable_scope('target'):
            network_builder = util.get_function(fct=config.network)
            self.target_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
            self.internal_inputs.extend(self.target_network.internal_inputs)
            self.internal_outputs.extend(self.target_network.internal_outputs)
            self.internal_inits.extend(self.target_network.internal_inits)
            target_value = dict()

            for action in self.action:
                target_output = layers['linear'](x=self.target_network.output, size=num_actions[action])
                if config.double_dqn:
                    selector = tf.one_hot(self.action_taken[action], num_actions[action])
                    target_value[action] = tf.reduce_sum(tf.multiply(target_output, selector), axis=1)
                else:
                    target_value[action] = tf.reduce_max(target_output, axis=1)

        with tf.name_scope('update'):
            for action in self.action:
                # One_hot tensor of the actions that have been taken
                action_one_hot = tf.one_hot(self.action[action][:-1], num_actions[action])
                # Training output, so we get the expected rewards given the actual states and actions
                q_value = tf.reduce_sum(training_output[action][:-1] * action_one_hot, axis=1)

                # Surrogate loss as the mean squared error between actual observed rewards and expected rewards
                q_target = self.reward[:-1] + (1.0 - tf.cast(self.terminal[:-1], tf.float32)) * self.discount * target_value[action][1:]
                delta = q_target - q_value
                self.loss_per_instance = tf.square(delta)

                # If gradient clipping is used, calculate the huber loss
                if config.clip_gradients > 0.0:
                    huber_loss = tf.where(tf.abs(delta) < config.clip_gradients, 0.5 * self.loss_per_instance, tf.abs(delta) - 0.5)
                    loss = tf.reduce_mean(huber_loss)
                else:
                    loss = tf.reduce_mean(self.loss_per_instance)
                tf.losses.add_loss(loss)

        # Update target network
        with tf.name_scope("update_target"):
            self.target_network_update = list()
            for v_source, v_target in zip(self.training_network.variables, self.target_network.variables):
                update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
                self.target_network_update.append(update)
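
When config.double_dqn is set, the target block above decouples action selection from action evaluation: the greedy action comes from the training network's Q-values, but its value is read from the target network via a one-hot selector. A NumPy sketch of that selection (advanced indexing here is equivalent to the one-hot-and-reduce_sum used above); the Q-value arrays are hypothetical.

import numpy as np

def double_dqn_target_value(training_q, target_q):
    """Select actions with the online network, evaluate them with the target network.

    training_q, target_q: shape (batch_size, num_actions).
    """
    greedy_actions = training_q.argmax(axis=1)      # selection: training (online) network
    batch_idx = np.arange(training_q.shape[0])
    return target_q[batch_idx, greedy_actions]      # evaluation: target network

training_q = np.array([[1.0, 5.0], [2.0, 0.5]])
target_q = np.array([[0.2, 0.9], [1.5, 3.0]])
print(double_dqn_target_value(training_q, target_q))  # [0.9 1.5]
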
Example #7
    def create_tf_operations(self, config):
        super(DQNModel, self).create_tf_operations(config)

        flat_action_sizes = {
            name: util.prod(action.shape) * action.num_actions
            for name, action in config.actions.items()
        }
        action_shapes = {
            name: (-1, ) + action.shape + (action.num_actions, )
            for name, action in config.actions.items()
        }

        # Training network
        with tf.variable_scope('training'):
            network_builder = util.get_function(fct=config.network)
            self.training_network = NeuralNetwork(
                network_builder=network_builder, inputs=self.state)
            self.internal_inputs.extend(self.training_network.internal_inputs)
            self.internal_outputs.extend(
                self.training_network.internal_outputs)
            self.internal_inits.extend(self.training_network.internal_inits)

            self.training_output = dict()
            for action in self.action:
                output = layers['linear'](x=self.training_network.output,
                                          size=flat_action_sizes[action])
                self.training_output[action] = tf.reshape(
                    tensor=output, shape=action_shapes[action])
                self.action_taken[action] = tf.argmax(
                    self.training_output[action], axis=-1)

        # Target network
        with tf.variable_scope('target'):
            network_builder = util.get_function(fct=config.network)
            self.target_network = NeuralNetwork(
                network_builder=network_builder, inputs=self.state)
            self.internal_inputs.extend(self.target_network.internal_inputs)
            self.internal_outputs.extend(self.target_network.internal_outputs)
            self.internal_inits.extend(self.target_network.internal_inits)

            target_value = dict()
            for action in self.action:
                output = layers['linear'](x=self.target_network.output,
                                          size=flat_action_sizes[action])
                output = tf.reshape(tensor=output, shape=action_shapes[action])
                if config.double_dqn:
                    selector = tf.one_hot(indices=self.action_taken[action],
                                          depth=action_shapes[action][1])
                    target_value[action] = tf.reduce_sum(
                        input_tensor=(output * selector), axis=-1)
                else:
                    target_value[action] = tf.reduce_max(input_tensor=output,
                                                         axis=-1)

        with tf.name_scope('update'):
            self.actions_one_hot = dict()
            self.q_values = dict()
            deltas = list()
            for action in self.action:
                # One_hot tensor of the actions that have been taken
                self.actions_one_hot[action] = tf.one_hot(
                    indices=self.action[action][:-1],
                    depth=config.actions[action].num_actions)

                # Training output, so we get the expected rewards given the actual states and actions
                self.q_values[action] = tf.reduce_sum(
                    input_tensor=(self.training_output[action][:-1] *
                                  self.actions_one_hot[action]),
                    axis=-1)

                reward = self.reward[:-1]
                terminal = tf.cast(x=self.terminal[:-1], dtype=tf.float32)
                for _ in range(len(config.actions[action].shape)):
                    reward = tf.expand_dims(input=reward, axis=1)
                    terminal = tf.expand_dims(input=terminal, axis=1)

                # Surrogate loss as the mean squared error between actual observed rewards and expected rewards
                q_target = reward + (
                    1.0 -
                    terminal) * config.discount * target_value[action][1:]
                delta = q_target - self.q_values[action]

                ds_list = [delta]
                for _ in range(len(config.actions[action].shape)):
                    ds_list = [
                        d for ds in ds_list
                        for d in tf.unstack(value=ds, axis=1)
                    ]
                deltas.extend(ds_list)

            delta = tf.add_n(inputs=deltas) / len(deltas)
            self.loss_per_instance = tf.square(delta)

            # If loss clipping is used, calculate the Huber loss
            if config.clip_loss > 0.0:
                huber_loss = tf.where(
                    condition=(tf.abs(delta) < config.clip_loss),
                    x=(0.5 * self.loss_per_instance),
                    y=(tf.abs(delta) - 0.5))
                loss = tf.reduce_mean(input_tensor=huber_loss, axis=0)
            else:
                loss = tf.reduce_mean(input_tensor=self.loss_per_instance,
                                      axis=0)
            self.dqn_loss = loss
            tf.losses.add_loss(loss)

        # Update target network
        with tf.name_scope('update_target'):
            self.target_network_update = list()
            for v_source, v_target in zip(self.training_network.variables,
                                          self.target_network.variables):
                update = v_target.assign_sub(config.update_target_weight *
                                             (v_target - v_source))
                self.target_network_update.append(update)
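
This DQN variant supports actions with non-scalar shapes: the linear layer emits util.prod(shape) * num_actions values per sample, which are reshaped to (-1,) + shape + (num_actions,) so that the argmax and the one-hot Q-value selection run over the last axis for every action component. A NumPy sketch of that reshape-and-select step with hypothetical sizes.

import numpy as np

batch_size, action_shape, num_actions = 4, (2, 3), 5     # hypothetical sizes
flat_size = int(np.prod(action_shape)) * num_actions     # flat_action_sizes above

# Flat linear-layer output, reshaped as in action_shapes above
flat_output = np.random.randn(batch_size, flat_size)
q_all = flat_output.reshape((-1,) + action_shape + (num_actions,))

# Greedy action per component: shape (batch,) + action_shape
action_taken = q_all.argmax(axis=-1)

# Q-value of the taken action per component, via one-hot over the last axis
one_hot = np.eye(num_actions)[action_taken]              # (batch,) + shape + (num_actions,)
q_taken = (q_all * one_hot).sum(axis=-1)                 # (batch,) + shape
print(action_taken.shape, q_taken.shape)                 # (4, 2, 3) (4, 2, 3)
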
Example #8
    def create_tf_operations(self, config):
        super(NAFModel, self).create_tf_operations(config)

        # Get hidden layers from network generator, then add NAF outputs, same for target network
        with tf.variable_scope('training'):
            network_builder = util.get_function(fct=config.network)
            self.training_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
            self.internal_inputs.extend(self.training_network.internal_inputs)
            self.internal_outputs.extend(self.training_network.internal_outputs)
            self.internal_inits.extend(self.training_network.internal_inits)

        with tf.variable_scope('training_outputs'):
            num_actions = len(self.action)
            # Action outputs
            mean = layers['linear'](x=self.training_network.output, size=num_actions)
            for n, action in enumerate(sorted(self.action)):
                # mean = tf.Print(mean,[mean])
                self.action_taken[action] = mean[:, n]

            # Advantage computation
            # Network outputs entries of lower triangular matrix L
            lower_triangular_size = num_actions * (num_actions + 1) // 2
            l_entries = layers['linear'](x=self.training_network.output, size=lower_triangular_size)

            l_matrix = tf.exp(tf.map_fn(tf.diag, l_entries[:, :num_actions]))

            if num_actions > 1:
                offset = num_actions
                l_columns = list()
                for zeros, size in enumerate(xrange(num_actions - 1, 0, -1), 1):
                    column = tf.pad(l_entries[:, offset: offset + size], ((0, 0), (zeros, 0)))
                    l_columns.append(column)
                    offset += size
                l_matrix += tf.stack(l_columns, 1)

            # P = LL^T
            p_matrix = tf.matmul(l_matrix, tf.transpose(l_matrix, (0, 2, 1)))
            # p_matrix = tf.Print(p_matrix, [p_matrix])

            # l_rows = []
            # offset = 0
            # for i in xrange(num_actions):
            #     # Diagonal elements are exponentiated, otherwise gradient often 0
            #     # Slice out lower triangular entries from flat representation through moving offset
            #     diagonal = tf.exp(l_entries[:, offset])  # tf.slice(l_entries, (0, offset), (-1, 1))
            #     n = config.actions - i - 1
            #     # Slice out non-zero non-diagonal entries, - 1 because we already took the diagonal
            #     non_diagonal = l_entries[:, offset + 1: offset + n + 1]  # tf.slice(l_entries, (0, offset + 1), (-1, n))
            #     # Fill up row with zeros
            #     row = tf.pad(tf.concat(axis=1, values=(diagonal, non_diagonal)), ((0, 0), (i, 0)))
            #     offset += (num_actions - i)
            #     l_rows.append(row)
            #
            # # Stack rows to matrix
            # l_matrix = tf.transpose(tf.stack(l_rows, axis=1), (0, 2, 1))

            actions = tf.stack(values=[self.action[name] for name in sorted(self.action)], axis=1)
            action_diff = actions - mean

            # A = -0.5 (a - mean)^T P (a - mean)
            advantage = -tf.matmul(tf.expand_dims(action_diff, 1), tf.matmul(p_matrix, tf.expand_dims(action_diff, 2))) / 2
            advantage = tf.squeeze(advantage, 2)

            # Q = A + V
            # State-value function
            value = layers['linear'](x=self.training_network.output, size=1)
            q_value = tf.squeeze(value + advantage, 1)
            training_output_vars = tf.contrib.framework.get_variables('training_outputs')

        with tf.variable_scope('target'):
            network_builder = util.get_function(fct=config.network)
            self.target_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
            self.internal_inputs.extend(self.target_network.internal_inputs)
            self.internal_outputs.extend(self.target_network.internal_outputs)
            self.internal_inits.extend(self.target_network.internal_inits)
            target_value = dict()

        with tf.variable_scope('target_outputs'):
            # State-value function
            target_value_output = layers['linear'](x=self.target_network.output, size=1)
            for action in self.action:
                # Naf directly outputs V(s)
                target_value[action] = target_value_output

            target_output_vars = tf.contrib.framework.get_variables('target_outputs')

        with tf.name_scope("update"):
            for action in self.action:
                q_target = self.reward[:-1] + (1.0 - tf.cast(self.terminal[:-1], tf.float32)) * config.discount * target_value[action][1:]
                delta = q_target - q_value[:-1]
                self.loss_per_instance = tf.square(delta)

                # We observe issues with numerical stability in some tests, gradient clipping can help
                if config.clip_gradients > 0.0:
                    huber_loss = tf.where(tf.abs(delta) < config.clip_gradients, 0.5 * self.loss_per_instance, tf.abs(delta) - 0.5)
                    loss = tf.reduce_mean(huber_loss)
                else:
                    loss = tf.reduce_mean(self.loss_per_instance)
                tf.losses.add_loss(loss)

        with tf.name_scope("update_target"):
            # Combine hidden layer variables and output layer variables
            training_vars = self.training_network.variables + training_output_vars
            target_vars = self.target_network.variables + target_output_vars

            self.target_network_update = list()
            for v_source, v_target in zip(training_vars, target_vars):
                update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
                self.target_network_update.append(update)
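
Both NAF examples compute the advantage as a quadratic form around the predicted mean, A(s, a) = -0.5 * (a - mean)^T P (a - mean), and add it to the state value V(s) to obtain Q(s, a). A NumPy sketch of that computation for a batch, assuming P is already positive semi-definite (for example built as L L^T as sketched earlier); all inputs are hypothetical.

import numpy as np

def naf_q_values(actions, mean, p_matrix, value):
    """Q(s, a) = V(s) - 0.5 * (a - mean)^T P (a - mean), batched.

    actions, mean: (batch, num_actions); p_matrix: (batch, num_actions, num_actions);
    value: (batch,). With P positive semi-definite the advantage is <= 0 and
    maximal (zero) at a = mean, so the greedy action is simply the mean.
    """
    diff = actions - mean
    # Batched quadratic form over the action dimensions
    quad = np.einsum('bi,bij,bj->b', diff, p_matrix, diff)
    return value - 0.5 * quad

mean = np.array([[0.0, 1.0]])
P = np.array([[[2.0, 0.0], [0.0, 1.0]]])
print(naf_q_values(np.array([[0.0, 1.0]]), mean, P, value=np.array([3.0])))  # [3.]
print(naf_q_values(np.array([[1.0, 1.0]]), mean, P, value=np.array([3.0])))  # [2.]
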