Example #1
    def build_critic_models(self):
        state_branch = self.Sequential()
        state_branch.add(
            self.Dense(self.hidden_layers[0],
                       input_shape=(self.env_spec['state_dim'], ),
                       activation=self.hidden_layers_activation,
                       init='lecun_uniform'))

        action_branch = self.Sequential()
        action_branch.add(
            self.Dense(self.hidden_layers[0],
                       input_shape=(self.env_spec['action_dim'], ),
                       activation=self.hidden_layers_activation,
                       init='lecun_uniform'))

        input_layer = self.Merge([state_branch, action_branch], mode='concat')

        model = self.Sequential()
        model.add(input_layer)

        # range(1, len(...)) is already empty for a single hidden layer,
        # so no extra guard is needed
        for i in range(1, len(self.hidden_layers)):
            model.add(
                self.Dense(self.hidden_layers[i],
                           init='lecun_uniform',
                           activation=self.hidden_layers_activation))

        model.add(
            self.Dense(1,
                       init='lecun_uniform',
                       activation=self.output_layer_activation))
        logger.info('Critic model summary')
        model.summary()
        self.critic = model
        self.target_critic = clone_model(self.critic)
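Note: the `Sequential` branches joined with `Merge(mode='concat')` are Keras 1 APIs; `Merge` was removed in Keras 2, where `init=` also became `kernel_initializer=`. A minimal sketch of the same two-branch critic in the Keras 2 functional API (layer sizes and activations here are illustrative, not taken from the example above):

# Sketch only: two-branch Q(s, a) critic with the Keras 2 functional API.
# state_dim, action_dim and hidden are illustrative parameters.
from keras.layers import Concatenate, Dense, Input
from keras.models import Model

def build_critic(state_dim, action_dim, hidden=64):
    state_in = Input(shape=(state_dim,))
    action_in = Input(shape=(action_dim,))
    s = Dense(hidden, activation='relu',
              kernel_initializer='lecun_uniform')(state_in)
    a = Dense(hidden, activation='relu',
              kernel_initializer='lecun_uniform')(action_in)
    x = Concatenate()([s, a])             # replaces Merge(mode='concat')
    q = Dense(1, activation='linear')(x)  # scalar Q-value
    return Model(inputs=[state_in, action_in], outputs=q)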
Example #2
    def build_model(self):
        self.model = self.build_critic_models()
        self.target_model = clone_model(self.model)

        self.critic_states = self.model.inputs[0]
        self.critic_actions = self.model.inputs[1]
        self.out = self.model.output
        self.network_params = self.model.trainable_weights

        self.target_critic_states = self.target_model.inputs[0]
        self.target_critic_actions = self.target_model.inputs[1]
        self.target_out = self.target_model.output
        self.target_network_params = self.target_model.trainable_weights

        # Op for updating target network
        self.update_target_network_op = []
        for i, t_w in enumerate(self.target_network_params):
            op = t_w.assign(
                self.tf.multiply(
                    self.tau, self.network_params[i]
                ) + self.tf.multiply(1. - self.tau, t_w))
            self.update_target_network_op.append(op)

        # custom loss and optimization Op
        self.y = self.tf.placeholder(self.tf.float32, [None, 1])
        self.loss = self.tf.losses.mean_squared_error(self.y, self.out)
        self.optimize = self.tf.train.AdamOptimizer(
            self.critic_lr).minimize(self.loss)

        self.action_gradient = self.tf.gradients(self.out, self.critic_actions)
        return self.model
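The loop above builds TF ops for the soft ("Polyak") target update theta_target <- tau * theta + (1 - tau) * theta_target. For reference, the same rule as a minimal NumPy sketch over Keras weight lists (function name hypothetical):

# Illustrative sketch of the soft target update constructed as ops above.
import numpy as np

def soft_update(source_weights, target_weights, tau=0.005):
    # A small tau keeps the target network slow-moving and stable.
    return [tau * w + (1.0 - tau) * t
            for w, t in zip(source_weights, target_weights)]

# Usage sketch:
# target_model.set_weights(
#     soft_update(model.get_weights(), target_model.get_weights(), tau))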
Example #3
    def build_model(self):
        self.model = self.build_critic_models()
        self.target_model = clone_model(self.model)

        self.critic_states = self.model.inputs[0]
        self.critic_actions = self.model.inputs[1]
        self.out = self.model.output
        self.network_params = self.model.trainable_weights

        self.target_critic_states = self.target_model.inputs[0]
        self.target_critic_actions = self.target_model.inputs[1]
        self.target_out = self.target_model.output
        self.target_network_params = self.target_model.trainable_weights

        # Op for updating target network
        self.update_target_network_op = []
        for i, t_w in enumerate(self.target_network_params):
            op = t_w.assign(
                self.tf.multiply(self.tau, self.network_params[i]) +
                self.tf.multiply(1. - self.tau, t_w))
            self.update_target_network_op.append(op)

        # custom loss and optimization Op
        self.y = self.tf.placeholder(self.tf.float32, [None, 1])
        self.loss = self.tf.losses.mean_squared_error(self.y, self.out)
        self.optimize = self.tf.train.AdamOptimizer(self.critic_lr).minimize(
            self.loss)

        self.action_gradient = self.tf.gradients(self.out, self.critic_actions)
        return self.model
Example #4
    def build_model(self):
        super(DoubleDQN, self).build_model()

        model_2 = clone_model(self.model)
        logger.info("Model 2 summary")
        model_2.summary()
        self.model_2 = model_2

        logger.info("Models 1 and 2 built")
        return self.model, self.model_2
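Unlike the soft updates of Examples #2 and #3, Double DQN setups typically refresh the second network with a periodic hard copy: `clone_model` creates the initial copy, after which only the weights need syncing. A hypothetical helper:

# Hypothetical helper: hard-sync the second network every N steps.
def sync_target(model, model_2):
    model_2.set_weights(model.get_weights())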
Example #5
 def build_actor_models(self):
     model = self.Sequential()
     self.build_hidden_layers(model)
     model.add(
         self.Dense(self.env_spec['action_dim'],
                    init='lecun_uniform',
                    activation=self.output_layer_activation))
     logger.info('Actor model summary')
     model.summary()
     self.actor = model
     self.target_actor = clone_model(self.actor)
Example #6
def test_clone_sequential_model():
    seq = Sequential()
    seq.add(Dense(8, input_shape=(3, )))
    seq.compile(optimizer='sgd', loss='mse')

    clone = clone_model(seq)
    clone.compile(optimizer='sgd', loss='mse')

    ins = np.random.random((4, 3))
    y_pred_seq = seq.predict_on_batch(ins)
    y_pred_clone = clone.predict_on_batch(ins)
    assert y_pred_seq.shape == y_pred_clone.shape
    assert_allclose(y_pred_seq, y_pred_clone)
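The `assert_allclose` check only holds if `clone_model` copies the weights as well as the architecture: the stock `keras.models.clone_model` re-initializes weights, so a weight-preserving clone (the behavior keras-rl's utility provides) has to copy them explicitly. A minimal sketch:

# Sketch of a weight-preserving clone; without set_weights the clone
# would predict differently, since new layers are re-initialized.
from keras.models import clone_model as keras_clone_model

def clone_with_weights(model):
    clone = keras_clone_model(model)
    clone.set_weights(model.get_weights())
    return clone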
Example #7
def test_clone_sequential_model():
    seq = Sequential()
    seq.add(Dense(8, input_shape=(3,)))
    seq.compile(optimizer='sgd', loss='mse')

    clone = clone_model(seq)
    clone.compile(optimizer='sgd', loss='mse')

    ins = np.random.random((4, 3))
    y_pred_seq = seq.predict_on_batch(ins)
    y_pred_clone = clone.predict_on_batch(ins)
    assert y_pred_seq.shape == y_pred_clone.shape
    assert_allclose(y_pred_seq, y_pred_clone)
Example #8
 def __init__(self,
              model,
              env,
              policy,
              target_model_update=1,
              gamma=.99,
              processor=None):
     self.model = model
     self.target_model = clone_model(self.model)
     self.target_model_update = target_model_update
     self.env = env
     self.processor = processor
     self.gamma = gamma
     self.policy = policy
Example #9
def test_clone_graph_model():
    in1 = Input(shape=(2, ))
    in2 = Input(shape=(3, ))
    x = Dense(8)(merge([in1, in2], mode='concat'))
    graph = Model([in1, in2], x)
    graph.compile(optimizer='sgd', loss='mse')

    clone = clone_model(graph)
    clone.compile(optimizer='sgd', loss='mse')

    ins = [np.random.random((4, 2)), np.random.random((4, 3))]
    y_pred_graph = graph.predict_on_batch(ins)
    y_pred_clone = clone.predict_on_batch(ins)
    assert y_pred_graph.shape == y_pred_clone.shape
    assert_allclose(y_pred_graph, y_pred_clone)
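Note: `merge([...], mode='concat')` is the Keras 1 functional helper; Example #10 below is the same test ported to the Keras 2 `Concatenate` layer.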
Example #10
def test_clone_graph_model():
    in1 = Input(shape=(2,))
    in2 = Input(shape=(3,))
    x = Dense(8)(Concatenate()([in1, in2]))
    graph = Model([in1, in2], x)
    graph.compile(optimizer='sgd', loss='mse')

    clone = clone_model(graph)
    clone.compile(optimizer='sgd', loss='mse')

    ins = [np.random.random((4, 2)), np.random.random((4, 3))]
    y_pred_graph = graph.predict_on_batch(ins)
    y_pred_clone = clone.predict_on_batch(ins)
    assert y_pred_graph.shape == y_pred_clone.shape
    assert_allclose(y_pred_graph, y_pred_clone)
Example #11
    def compile(self, optimizer, metrics=None):
        """ Calculate the quantile huber loss, see the paper for details. """
        # Avoid the mutable-default pitfall: `metrics=[]` combined with
        # `metrics += [...]` grows the shared default list across calls.
        metrics = (metrics or []) + [self.mean_q, self.max_q]  # register default metrics

        # We never train the target model, hence we can set the optimizer and loss arbitrarily.
        self.target_model = clone_model(self.model, self.custom_model_objects)
        self.target_model.compile(optimizer='sgd', loss='mse')
        self.model.compile(optimizer='sgd', loss='mse')

        # Compile model.
        # Create trainable model. The problem is that we need to mask the output since we only
        # ever want to update the Q values for a certain action. The way we achieve this is by
        # using a custom Lambda layer that computes the loss. This gives us the necessary flexibility
        # to mask out certain parameters by passing in multiple inputs to the Lambda layer.
        y_pred = self.model.output
        tau = self.model.input[1]
        y_true = Input(name='y_true',
                       shape=(
                           self.nb_sampled_quantiles,
                           self.nb_actions,
                       ))
        mask = Input(name='mask', shape=(self.nb_actions, ))
        loss_out = Lambda(self.clipped_masked_quantile_error,
                          output_shape=(1, ),
                          name='loss')([y_true, y_pred, tau, mask])
        ins = [self.model.input] if type(
            self.model.input) is not list else self.model.input
        trainable_model = Model(inputs=ins + [y_true, mask],
                                outputs=[loss_out, y_pred])
        assert len(trainable_model.output_names) == 2
        combined_metrics = {trainable_model.output_names[1]: metrics}
        losses = [
            lambda y_true, y_pred: y_pred,  # loss is computed in Lambda layer
            lambda y_true, y_pred: K.zeros_like(
                y_pred),  # we only include this for the metrics
        ]
        trainable_model.compile(optimizer=optimizer,
                                loss=losses,
                                metrics=combined_metrics)
        self.trainable_model = trainable_model

        self.compiled = True
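Training the combined model then feeds dummy targets: the first output's value is the loss itself, and the second output exists only so the metrics can observe the predictions. A hedged usage sketch with hypothetical batch names:

# Hypothetical usage sketch: zeros for the 'loss' head (its prediction IS
# the loss), and quantile targets for the metrics-only prediction head.
dummy_targets = np.zeros((batch_size, 1))
trainable_model.train_on_batch(ins_batch + [y_true_batch, mask_batch],
                               [dummy_targets, y_true_batch])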
Example #12
    def build_model(self):
        self.model = super(Actor, self).build_model()
        self.target_model = clone_model(self.model)

        self.actor_states = self.model.inputs[0]
        self.out = self.model.output
        self.scaled_out = self.tf.multiply(
            self.out, self.env_spec['action_bound_high'])
        self.network_params = self.model.trainable_weights

        self.target_actor_states = self.target_model.inputs[0]
        self.target_out = self.target_model.output
        self.target_scaled_out = self.tf.multiply(
            self.target_out, self.env_spec['action_bound_high'])
        self.target_network_params = self.target_model.trainable_weights

        # Op for updating target network
        self.update_target_network_op = []
        for i, t_w in enumerate(self.target_network_params):
            op = t_w.assign(
                self.tf.multiply(
                    self.tau, self.network_params[i]
                ) + self.tf.multiply(1. - self.tau, t_w))
            self.update_target_network_op.append(op)

        # will be fed as self.action_gradient: critic_grads
        self.action_gradient = self.tf.placeholder(
            self.tf.float32, [None, self.env_spec['action_dim']])

        # actor model gradient op, to be fed from critic
        self.actor_gradients = self.tf.gradients(
            self.scaled_out, self.network_params, -self.action_gradient)

        # Optimization op
        self.optimize = self.tf.train.AdamOptimizer(self.lr).apply_gradients(
            zip(self.actor_gradients, self.network_params))
        return self.model
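Here the third argument to `tf.gradients` (`grad_ys`) seeds the backward pass with the critic's action gradient, so the call computes the deterministic policy gradient, the product of d(scaled_out)/d(theta) with dQ/da; it is negated because `apply_gradients` minimizes while the policy objective is maximized.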
Example #13
    def build_model(self):
        self.model = super(Actor, self).build_model()
        self.target_model = clone_model(self.model)

        self.actor_states = self.model.inputs[0]
        self.out = self.model.output
        self.scaled_out = self.tf.multiply(self.out,
                                           self.env_spec['action_bound_high'])
        self.network_params = self.model.trainable_weights

        self.target_actor_states = self.target_model.inputs[0]
        self.target_out = self.target_model.output
        self.target_scaled_out = self.tf.multiply(
            self.target_out, self.env_spec['action_bound_high'])
        self.target_network_params = self.target_model.trainable_weights

        # Op for updating target network
        self.update_target_network_op = []
        for i, t_w in enumerate(self.target_network_params):
            op = t_w.assign(
                self.tf.multiply(self.tau, self.network_params[i]) +
                self.tf.multiply(1. - self.tau, t_w))
            self.update_target_network_op.append(op)

        # will be fed as self.action_gradient: critic_grads
        self.action_gradient = self.tf.placeholder(
            self.tf.float32, [None, self.env_spec['action_dim']])

        # actor model gradient op, to be fed from critic
        self.actor_gradients = self.tf.gradients(self.scaled_out,
                                                 self.network_params,
                                                 -self.action_gradient)

        # Optimization op
        self.optimize = self.tf.train.AdamOptimizer(self.lr).apply_gradients(
            zip(self.actor_gradients, self.network_params))
        return self.model
Example #14
    def __init__(self,
                 policy,
                 model,
                 action_provider,
                 memory,
                 goal: Goal,
                 temporal_offsets,
                 batch_size=64,
                 target_model_update=100,
                 default_measurements=None):
        """

        :param policy:
        :param model:
        :param action_provider:
        :param memory:
        :param goal:
        :param temporal_offsets: list of ascending ints indicating the temporal offsets.
        :param default_measurements: Used to fill the future_measurements, iff the episode ended before. If 'None', the
                                    latest observed measurements before episode end will be used.
        """
        self.policy = policy
        self.model = model  # Inputs of models need to be of form [observation, action, goal_params]
        self.action_provider = action_provider
        self.memory = memory
        self.batch_size = batch_size
        self.temporal_offsets = temporal_offsets
        self.goal = goal
        self.target_model = clone_model(self.model, {})
        self.step = 0
        self.target_model_update = target_model_update
        self.default_measurements = default_measurements

        self.samples = []

        self.current_metrics = []
Example #15
    def test_mlp_distributional_network_with_prior(self):
        net = NetworkMLPDistributional(3, 4, nb_hidden_layers=2, nb_hidden_neurons=100, duel=False, prior=True,
                                       prior_scale_factor=1, nb_quantiles=8, nb_cos_embeddings=64)
        self.assertTrue(net.model.trainable)
        for layer in net.model.layers:
            if 'prior' in layer.name and layer.weights:
                self.assertFalse(layer.trainable)

        state = np.random.rand(32, 1, 3)
        tau = np.random.rand(32, 1, 8)
        net_input = [state, tau]
        out = net.model.predict(net_input)
        self.assertEqual(np.shape(out), (32, 8, 4))

        state = np.random.rand(1, 1, 3)
        tau = np.random.rand(1, 1, 8)
        for i in range(0, 8):
            tau[:, :, i] = tau[:, :, 0]
        net_input = [state, tau]
        out = net.model.predict(net_input)
        self.assertTrue((np.abs(out[0, 0, :] - out[0, :, :]) < 1e-6).all())   # Equal values of tau -> equal Z_tau

        # Cos embedding
        state = np.random.rand(1, 1, 3)
        tau = np.zeros((1, 1, 8))
        tau[0, 0, 0] = 0
        tau[0, 0, 1] = 1/64
        tau[0, 0, 2] = 0.5
        net_input = [state, tau]
        cos_embedding_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('cos_tau').output)
        cos_embedding_output = cos_embedding_layer.predict(net_input)
        self.assertTrue((cos_embedding_output[0, 0, :] == 1).all())
        self.assertTrue(all([np.isclose(cos_embedding_output[0, 1, i], np.cos(np.pi*i*1/64), atol=1e-7)
                             for i in range(cos_embedding_output.shape[2])]))

        # Merge
        state = np.random.rand(1, 1, 3)
        tau = np.random.rand(1, 1, 8)
        net_input = [state, tau]
        tau_net_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('fc_tau_trainable').output)
        state_net_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('fc_state_extra_dim_trainable').output)
        merge_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('merge_trainable').output)
        tau_net_output = tau_net_layer.predict(net_input)
        state_net_output = state_net_layer.predict(net_input)
        merge_output = merge_layer.predict(net_input)
        self.assertTrue(np.isclose(tau_net_output[None, :, 0, :] * state_net_output, merge_output[:, 0, :]).all())

        plot_model(net.model, to_file='mlp_distributional_with_prior.png', show_shapes=True)

        # Test clone model, mainly to see that no custom objects are missing
        state = np.random.rand(1, 1, 3)
        tau = np.random.rand(1, 1, 8)
        net_input = [state, tau]
        target_model = clone_model(net.model)
        target_model.compile(optimizer='sgd', loss='mse')
        out = net.model.predict(net_input)
        out_clone = target_model.predict(net_input)
        self.assertTrue((out == out_clone).all())

        # Window length > 1
        net = NetworkMLPDistributional(3, 4, nb_hidden_layers=2, nb_hidden_neurons=100, duel=False, prior=True,
                                       prior_scale_factor=1, nb_quantiles=8, nb_cos_embeddings=64, window_length=5)
        self.assertTrue(net.model.trainable)
        for layer in net.model.layers:
            if 'prior' in layer.name and layer.weights:
                self.assertFalse(layer.trainable)

        state = np.random.rand(32, 5, 3)
        tau = np.random.rand(32, 1, 8)
        net_input = [state, tau]
        out = net.model.predict(net_input)
        self.assertEqual(np.shape(out), (32, 8, 4))

        state = np.random.rand(1, 5, 3)
        tau = np.random.rand(1, 1, 8)
        for i in range(0, 8):
            tau[:, :, i] = tau[:, 0, 0]
        net_input = [state, tau]
        out = net.model.predict(net_input)
        self.assertTrue(
            (np.abs(out[0, 0, :] - out[0, :, :]) < 1e-6).all())  # Equal values of tau should give equal values of Z_tau

        plot_model(net.model, to_file='mlp_window_5_distributional_with_prior.png', show_shapes=True)
Example #16
    def __init__(self, env: gym.Env, **kwargs):

        super(MACE, self).__init__(**kwargs)
        self.nb_actions = env.action_space.shape[0]

        obs_input_actor = Input(shape=(1, ) + env.observation_space.shape,
                                name='observation_input')
        x_ac = Flatten()(obs_input_actor)
        x_ac = Dense(units=256, activation='relu')(x_ac)

        obs_input_critic = Input(shape=(1, ) + env.observation_space.shape,
                                 name='observation_input')
        x_cr = Flatten()(obs_input_critic)
        x_cr = Dense(units=256, activation='relu')(x_cr)

        x_critic = Dense(units=128, activation='relu')(x_cr)
        value = Dense(units=1)(x_critic)

        x_actor = Dense(units=128, activation='relu')(x_ac)
        action = Dense(units=self.nb_actions, activation='tanh')(x_actor)

        actor = Model(inputs=obs_input_actor, outputs=action)
        critic = Model(inputs=obs_input_critic, outputs=value)

        metrics = []
        metrics += [mean_q]
        critic_metrics = metrics

        critic_optimizer = Adam(lr=1e-3)
        actor_optimizer = Adam(lr=1e-3)

        #        critic_optimizer = SGD(lr=1e-4, momentum=0.9)
        #        actor_optimizer = SGD(lr=1e-3, momentum=0.9)

        self.actor = actor
        self.critic = critic

        self.target_actor = clone_model(self.actor)
        self.target_actor.compile(optimizer='sgd', loss='mse')
        self.target_critic = clone_model(self.critic)
        self.target_critic.compile(optimizer='sgd', loss='mse')

        self.target_model_update = 1e-3
        #self.target_model_update=500

        if self.target_model_update < 1.:
            # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model.
            critic_updates = get_soft_target_model_updates(
                self.target_critic, self.critic, self.target_model_update)
            critic_optimizer = AdditionalUpdatesOptimizer(
                critic_optimizer, critic_updates)
            actor_updates = get_soft_target_model_updates(
                self.target_actor, self.actor, self.target_model_update)
            actor_optimizer = AdditionalUpdatesOptimizer(
                actor_optimizer, actor_updates)

        self.delta_clip = np.inf

        def clipped_error(y_true, y_pred):
            return K.mean(huber_loss(y_true, y_pred, self.delta_clip), axis=-1)

        actor.compile(actor_optimizer, loss='mse')
        critic.compile(critic_optimizer, loss='mse', metrics=critic_metrics)

        self.compiled = True

        self.memory = SequentialMemory(limit=100000, window_length=1)
        self.memory_interval = 1
        self.memory_actor = SequentialMemory(limit=100000, window_length=1)
        self.memory_critic = SequentialMemory(limit=100000, window_length=1)

        self.nb_steps_warmup = 50000

        self.train_interval = 4
        self.batch_size = 64
        self.gamma = 0.99

        self.processor = None
        self.random_process = OrnsteinUhlenbeckProcess(theta=.15,
                                                       mu=0.,
                                                       sigma=.3,
                                                       size=self.nb_actions)
        self.eps = 0.9
Example #17
    def compile(self, optimizer, metrics=None):
        # Avoid the mutable-default pitfall of `metrics=[]` with `metrics += [...]`.
        metrics = (metrics or []) + [mean_q]

        if type(optimizer) in (list, tuple):
            if len(optimizer) != 2:
                raise ValueError(
                    'Please provide exactly two optimizers: the first one for the actor and the second one for the critic.'
                )
            actor_optimizer, critic_optimizer = optimizer
        else:
            actor_optimizer = optimizer
            critic_optimizer = clone_optimizer(optimizer)
        if type(actor_optimizer) is str:
            actor_optimizer = optimizers.get(actor_optimizer)
        if type(critic_optimizer) is str:
            critic_optimizer = optimizers.get(critic_optimizer)
        assert actor_optimizer != critic_optimizer

        if len(metrics) == 2 and hasattr(metrics[0], '__len__') and hasattr(
                metrics[1], '__len__'):
            actor_metrics, critic_metrics = metrics
        else:
            actor_metrics = critic_metrics = metrics

        def clipped_error(y_true, y_pred):
            return K.mean(huber_loss(y_true, y_pred, self.delta_clip), axis=-1)

        # Compile target networks. We only use them in feed-forward mode, hence we can pass any
        # optimizer and loss since we never use it anyway.
        self.target_actor = clone_model(self.actor, self.custom_model_objects)
        self.target_actor.compile(optimizer='sgd', loss='mse')
        self.target_critic = clone_model(self.critic,
                                         self.custom_model_objects)
        self.target_critic.compile(optimizer='sgd', loss='mse')

        # We also compile the actor. We never optimize the actor using Keras but instead compute
        # the policy gradient ourselves. However, we need the actor in feed-forward mode, hence
        # we also compile it with an arbitrary optimizer and loss, since they are never used.
        self.actor.compile(optimizer='sgd', loss='mse')

        # Compile the critic.
        if self.target_model_update < 1.:
            # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model.
            critic_updates = get_soft_target_model_updates(
                self.target_critic, self.critic, self.target_model_update)
            critic_optimizer = AdditionalUpdatesOptimizer(
                critic_optimizer, critic_updates)
        self.critic.compile(optimizer=critic_optimizer,
                            loss=clipped_error,
                            metrics=critic_metrics)

        # Combine actor and critic so that we can get the policy gradient.
        # Assuming critic's state inputs are the same as actor's.
        combined_inputs = []
        state_inputs = []
        for i in self.critic.input:
            if i == self.critic_action_input:
                combined_inputs.append([])
            else:
                combined_inputs.append(i)
                state_inputs.append(i)
        combined_inputs[self.critic_action_input_idx] = self.actor(
            state_inputs)

        combined_output = self.critic(combined_inputs)

        updates = actor_optimizer.get_updates(
            params=self.actor.trainable_weights, loss=-K.mean(combined_output))
        if self.target_model_update < 1.:
            # Include soft target model updates.
            updates += get_soft_target_model_updates(self.target_actor,
                                                     self.actor,
                                                     self.target_model_update)
        updates += self.actor.updates  # include other updates of the actor, e.g. for BN

        # Finally, combine it all into a callable function.
        if K.backend() == 'tensorflow':
            self.actor_train_fn = K.function(state_inputs +
                                             [K.learning_phase()],
                                             [self.actor(state_inputs)],
                                             updates=updates)
        else:
            if self.uses_learning_phase:
                state_inputs += [K.learning_phase()]
            self.actor_train_fn = K.function(state_inputs,
                                             [self.actor(state_inputs)],
                                             updates=updates)
        self.actor_optimizer = actor_optimizer

        self.compiled = True
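For orientation: `get_soft_target_model_updates` returns `(target_w, tau * source_w + (1 - tau) * target_w)` pairs, and `AdditionalUpdatesOptimizer` applies them on every optimizer step, so the targets track the online networks without a separate training op. A simplified sketch, not the exact keras-rl implementation:

# Simplified sketch of building the soft-update pairs consumed by
# AdditionalUpdatesOptimizer (assumes matching weight ordering).
def soft_target_updates(target, source, tau):
    return [(t_w, tau * s_w + (1.0 - tau) * t_w)
            for t_w, s_w in zip(target.weights, source.weights)]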
Example #18
 def update_target_model(self):
     # Also, loading logic seems off
     self.model_2 = clone_model(self.model)
     logger.debug("Updated target model weights")
Example #19
    def test_cnn_distributional_network(self):
        net = NetworkCNNDistributional(nb_ego_states=7, nb_states_per_vehicle=4, nb_vehicles=10, nb_actions=9,
                                       nb_conv_layers=2, nb_conv_filters=32, nb_hidden_fc_layers=2,
                                       nb_hidden_neurons=100, duel=False, prior=False,
                                       nb_quantiles=8, nb_cos_embeddings=64)
        self.assertTrue(net.model.trainable)
        state = np.random.rand(32, 1, 47)
        tau = np.random.rand(32, 1, 8)
        net_input = [state, tau]
        out = net.model.predict(net_input)
        self.assertEqual(np.shape(out), (32, 8, 9))

        state1 = np.random.rand(1, 1, 47)
        tau = np.random.rand(32, 1, 8)
        input1 = [state1, tau]
        out1 = net.model.predict(input1)
        input2 = [np.copy(state1), np.copy(tau)]
        input2[0][0, 0, 7:15] = input1[0][0, 0, 15:23]
        input2[0][0, 0, 15:23] = input1[0][0, 0, 7:15]
        self.assertFalse((input1[0] == input2[0]).all())
        out2 = net.model.predict(input2)
        self.assertTrue((out1 == out2).all())

        state = np.random.rand(1, 1, 47)
        tau = np.random.rand(1, 1, 8)
        for i in range(0, 8):
            tau[:, :, i] = tau[:, :, 0]
        net_input = [state, tau]
        out = net.model.predict(net_input)
        self.assertTrue((np.abs(out[0, 0, :] - out[0, :, :]) < 1e-6).all())   # Equal values of tau -> equal Z_tau

        # Cos embedding
        state = np.random.rand(1, 1, 47)
        tau = np.zeros((1, 1, 8))
        tau[0, 0, 0] = 0
        tau[0, 0, 1] = 1/64
        tau[0, 0, 2] = 0.5
        net_input = [state, tau]
        cos_embedding_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('cos_tau').output)
        cos_embedding_output = cos_embedding_layer.predict(net_input)
        self.assertTrue((cos_embedding_output[0, 0, :] == 1).all())
        self.assertTrue(all([np.isclose(cos_embedding_output[0, 1, i], np.cos(np.pi*i*1/64), atol=1e-7)
                             for i in range(cos_embedding_output.shape[2])]))

        # Merge
        state = np.random.rand(1, 1, 47)
        tau = np.random.rand(1, 1, 8)
        net_input = [state, tau]
        tau_net_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('fc_tau').output)
        state_net_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('merged_extra_dim').output)
        merge_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('merge').output)
        tau_net_output = tau_net_layer.predict(net_input)
        state_net_output = state_net_layer.predict(net_input)
        merge_output = merge_layer.predict(net_input)
        self.assertTrue(np.isclose(tau_net_output[None, :, 0, :] * state_net_output, merge_output[:, 0, :]).all())

        plot_model(net.model, to_file='cnn_distributional.png', show_shapes=True)

        # Test clone model, mainly to see that no custom objects are missing
        state = np.random.rand(1, 1, 47)
        tau = np.random.rand(1, 1, 8)
        net_input = [state, tau]
        target_model = clone_model(net.model)
        target_model.compile(optimizer='sgd', loss='mse')
        out = net.model.predict(net_input)
        out_clone = target_model.predict(net_input)
        self.assertTrue((out == out_clone).all())

        # Window length > 1
        net = NetworkCNNDistributional(nb_ego_states=7, nb_states_per_vehicle=4, nb_vehicles=10, nb_actions=9,
                                       nb_conv_layers=2, nb_conv_filters=32, nb_hidden_fc_layers=2,
                                       nb_hidden_neurons=100, duel=False, prior=False,
                                       nb_quantiles=8, nb_cos_embeddings=64, window_length=5)
        self.assertTrue(net.model.trainable)
        state = np.random.rand(32, 5, 47)
        tau = np.random.rand(32, 1, 8)
        net_input = [state, tau]
        out = net.model.predict(net_input)
        self.assertEqual(np.shape(out), (32, 8, 9))

        state1 = np.random.rand(1, 5, 47)
        tau = np.random.rand(32, 1, 8)
        input1 = [state1, tau]
        out1 = net.model.predict(input1)
        input2 = [np.copy(state1), np.copy(tau)]
        input2[0][0, :, 7:15] = input1[0][0, :, 15:23]
        input2[0][0, :, 15:23] = input1[0][0, :, 7:15]
        self.assertFalse((input1[0] == input2[0]).all())
        out2 = net.model.predict(input2)
        self.assertTrue((out1 == out2).all())

        state = np.random.rand(1, 5, 47)
        tau = np.random.rand(1, 1, 8)
        for i in range(0, 8):
            tau[:, :, i] = tau[:, 0, 0]
        net_input = [state, tau]
        out = net.model.predict(net_input)
        self.assertTrue((np.abs(out[0, 0, :] - out[0, :, :]) < 1e-6).all())  # Equal values of tau should give equal values of Z_tau

        plot_model(net.model, to_file='cnn_window_5_distributional.png', show_shapes=True)
Example #20
 def update_target_model(self):
     # Also, loading logic seems off
     self.model_2 = clone_model(self.model)
     logger.debug("Updated target model weights")
Example #21
    def test_cnn_dueling_distributional_with_prior(self):
        net = NetworkCNNDistributional(nb_ego_states=7, nb_states_per_vehicle=4, nb_vehicles=10, nb_actions=9,
                                       nb_conv_layers=2, nb_conv_filters=32, nb_hidden_fc_layers=2,
                                       nb_hidden_neurons=100, duel=True, prior=True, prior_scale_factor=1,
                                       nb_quantiles=8, nb_cos_embeddings=64)
        self.assertTrue(net.model.trainable)
        state = np.random.rand(32, 1, 47)
        tau = np.random.rand(32, 1, 8)
        net_input = [state, tau]
        out = net.model.predict(net_input)
        self.assertEqual(np.shape(out), (32, 8, 9))

        state1 = np.random.rand(1, 1, 47)
        tau = np.random.rand(32, 1, 8)
        input1 = [state1, tau]
        out1 = net.model.predict(input1)
        input2 = [np.copy(state1), np.copy(tau)]
        input2[0][0, 0, 7:15] = input1[0][0, 0, 15:23]
        input2[0][0, 0, 15:23] = input1[0][0, 0, 7:15]
        self.assertFalse((input1[0] == input2[0]).all())
        out2 = net.model.predict(input2)
        self.assertTrue((out1 == out2).all())

        state = np.random.rand(1, 1, 47)
        tau = np.random.rand(1, 1, 8)
        for i in range(0, 8):
            tau[:, :, i] = tau[:, :, 0]
        net_input = [state, tau]
        out = net.model.predict(net_input)
        self.assertTrue((np.abs(out[0, 0, :] - out[0, :, :]) < 1e-6).all())   # Equal values of tau -> equal Z_tau

        # Cos embedding
        state = np.random.rand(1, 1, 47)
        tau = np.zeros((1, 1, 8))
        tau[0, 0, 0] = 0
        tau[0, 0, 1] = 1/64
        tau[0, 0, 2] = 0.5
        net_input = [state, tau]
        cos_embedding_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('cos_tau').output)
        cos_embedding_output = cos_embedding_layer.predict(net_input)
        self.assertTrue((cos_embedding_output[0, 0, :] == 1).all())
        self.assertTrue(all([np.isclose(cos_embedding_output[0, 1, i], np.cos(np.pi*i*1/64), atol=1e-7)
                             for i in range(cos_embedding_output.shape[2])]))

        # Merge
        state = np.random.rand(1, 1, 47)
        tau = np.random.rand(1, 1, 8)
        net_input = [state, tau]
        tau_net_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('fc_tau_trainable').output)
        state_net_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('merged_extra_dim_trainable').output)
        merge_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('merge_trainable').output)
        tau_net_output = tau_net_layer.predict(net_input)
        state_net_output = state_net_layer.predict(net_input)
        merge_output = merge_layer.predict(net_input)
        self.assertTrue(np.isclose(tau_net_output[None, :, 0, :] * state_net_output, merge_output[:, 0, :]).all())

        plot_model(net.model, to_file='cnn_duel_distributional_with_prior.png', show_shapes=True)

        # Test clone model, mainly to see that no custom objects are missing
        state = np.random.rand(1, 1, 47)
        tau = np.random.rand(1, 1, 8)
        net_input = [state, tau]
        target_model = clone_model(net.model)
        target_model.compile(optimizer='sgd', loss='mse')
        out = net.model.predict(net_input)
        out_clone = target_model.predict(net_input)
        self.assertTrue((out == out_clone).all())

        # Prior nets not trainable
        self.assertTrue(net.model.trainable)
        for layer in net.model.layers:
            if 'prior' in layer.name and layer.weights:
                self.assertFalse(layer.trainable)

        net.model.get_config()  # This crashes for custom lambda layers

        before_dueling_layer = K.Model(inputs=net.model.inputs,
                                       outputs=net.model.get_layer('output_trainable_wo_dueling').output)
        before_dueling_output = before_dueling_layer.predict(net_input)
        after_dueling_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('output_trainable').output)
        after_dueling_output = after_dueling_layer.predict(net_input)
        true_dueling_output = before_dueling_output[:, :, 0, None] + before_dueling_output[:, :, 1:] - \
                              np.mean(before_dueling_output[:, :, 1:, None], axis=2)
        self.assertTrue(np.isclose(after_dueling_output, true_dueling_output).all())

        single_input = [net_input[0][None, 0], net_input[1][None, 0]]
        self.assertTrue(np.isclose(out[0], net.model.predict(single_input)).all())

        # Window length > 1
        net = NetworkCNNDistributional(nb_ego_states=7, nb_states_per_vehicle=4, nb_vehicles=10, nb_actions=9,
                                       nb_conv_layers=2, nb_conv_filters=32, nb_hidden_fc_layers=2,
                                       nb_hidden_neurons=100, duel=True, prior=True, prior_scale_factor=1,
                                       nb_quantiles=8, nb_cos_embeddings=64, window_length=5)
        self.assertTrue(net.model.trainable)
        state = np.random.rand(32, 5, 47)
        tau = np.random.rand(32, 1, 8)
        net_input = [state, tau]
        out = net.model.predict(net_input)
        self.assertEqual(np.shape(out), (32, 8, 9))

        state1 = np.random.rand(1, 5, 47)
        tau = np.random.rand(32, 1, 8)
        input1 = [state1, tau]
        out1 = net.model.predict(input1)
        input2 = [np.copy(state1), np.copy(tau)]
        input2[0][0, :, 7:15] = input1[0][0, :, 15:23]
        input2[0][0, :, 15:23] = input1[0][0, :, 7:15]
        self.assertFalse((input1[0] == input2[0]).all())
        out2 = net.model.predict(input2)
        self.assertTrue((out1 == out2).all())

        state = np.random.rand(1, 5, 47)
        tau = np.random.rand(1, 1, 8)
        for i in range(0, 8):
            tau[:, :, i] = tau[:, 0, 0]
        net_input = [state, tau]
        out = net.model.predict(net_input)
        self.assertTrue((np.abs(out[0, 0, :] - out[0, :, :]) < 1e-6).all())  # Equal values of tau should give equal values of Z_tau

        plot_model(net.model, to_file='cnn_window_5_duel_distributional_with_prior.png', show_shapes=True)
Example #22
    def compile(self):
        # def clipped_error(y_true, y_pred):
        # return K.mean(huber_loss(y_true, y_pred, self.delta_clip), axis=-1)

        # Compile target networks. We only use them in feed-forward mode, hence we can pass any
        # optimizer and loss since we never use it anyway.
        self.target_actor = clone_model(self.actor, self.custom_model_objects)
        self.target_actor.compile(optimizer='sgd', loss='mse')
        self.target_critic = clone_model(self.critic,
                                         self.custom_model_objects)
        self.target_critic.compile(optimizer='sgd', loss='mse')

        # We also compile the actor. We never optimize the actor using Keras but instead compute
        # the policy gradient ourselves. However, we need the actor in feed-forward mode, hence
        # we also compile it with any optimizer
        self.actor.compile(optimizer='sgd', loss='mse')

        # Compile the critic for the same reason
        self.critic.compile(optimizer='sgd', loss='mse')

        # Compile the critic optimizer
        critic_optimizer = tf.train.AdamOptimizer()
        self.critic_target = tf.placeholder(dtype=tf.float32, shape=(None, 1))
        # Clip the critic gradient using the huber loss
        critic_loss = K.mean(
            huber_loss(
                self.critic([self.state, self.action]), self.critic_target,
                self.delta_clip))
        critic_gradient_vars = critic_optimizer.compute_gradients(
            critic_loss, var_list=self.critic.trainable_weights)

        # Compute the norm as a metric
        critic_gradients_norms = [
            tf.norm(grad_var[0]) for grad_var in critic_gradient_vars
        ]
        critic_gradient_norm = tf.reduce_sum(critic_gradients_norms)

        self.critic_train_fn = critic_optimizer.apply_gradients(
            critic_gradient_vars)

        # Target critic optimizer
        if self.target_critic_update < 1.:
            # Include soft target model updates.
            self.target_critic_train_fn = get_soft_target_model_ops(
                self.target_critic.weights, self.critic.weights,
                self.target_critic_update)

        # Target actor optimizer
        if self.target_actor_update < 1.:
            # Include soft target model updates.
            self.target_actor_train_fn = get_soft_target_model_ops(
                self.target_actor.weights, self.actor.weights,
                self.target_actor_update)

        # Actor optimizer
        actor_optimizer = tf.train.AdamOptimizer()
        # Be careful to negate the gradient
        # Since the optimizer wants to minimize the value
        actor_loss = -tf.reduce_mean(
            self.critic([self.state, self.actor(self.state)]))

        actor_gradient_vars = actor_optimizer.compute_gradients(
            actor_loss, var_list=self.actor.trainable_weights)
        # Gradient inverting
        # as described in https://arxiv.org/abs/1511.04143
        if self.invert_gradients:
            actor_gradient_vars = [(gradient_inverter(
                x[0], self.gradient_inverter_min, self.gradient_inverter_max),
                                    x[1]) for x in actor_gradient_vars]

        # Compute the norm as a metric
        actor_gradients_norms = [
            tf.norm(grad_var[0]) for grad_var in actor_gradient_vars
        ]
        actor_gradient_norm = tf.reduce_sum(actor_gradients_norms)

        # The actual train function
        self.actor_train_fn = actor_optimizer.apply_gradients(
            actor_gradient_vars)

        # Collect metrics
        self.critic_summaries.append(
            tf.summary.scalar("critic/loss", critic_loss))
        self.critic_summaries.append(
            tf.summary.scalar("critic/gradient", critic_gradient_norm))
        for var, norm in zip(self.critic.trainable_weights,
                             critic_gradients_norms):
            self.critic_summaries.append(
                tf.summary.scalar("critic/{}".format(var.name), norm))

        self.actor_summaries.append(
            tf.summary.scalar("actor/loss", -actor_loss))
        self.actor_summaries.append(
            tf.summary.scalar("actor/gradient", actor_gradient_norm))
        for var, norm in zip(self.actor.trainable_weights,
                             actor_gradients_norms):
            self.actor_summaries.append(
                tf.summary.scalar("actor/{}".format(var.name), norm))

        # FIXME: Use directly Keras backend
        # This is a kind of a hack
        # Taken from the "initialize_variables" of the Keras Tensorflow backend
        # https://github.com/fchollet/keras/blob/master/keras/backend/tensorflow_backend.py#L330
        # It permits to only initialize variables that are not already initialized
        # Without that, the networks and target networks get initialized again, to different values (stochastic initialization)
        # This is a problem when a network and its target network do not begin with the same parameter values...
        variables = tf.global_variables()
        uninitialized_variables = []
        for v in variables:
            if not hasattr(v,
                           '_keras_initialized') or not v._keras_initialized:
                uninitialized_variables.append(v)
                v._keras_initialized = True
        self.session.run(tf.variables_initializer(uninitialized_variables))
        # self.session.run(tf.global_variables_initializer())

        self.merged_summary = tf.summary.merge_all()

        self.compiled = True