def build_critic_models(self):
    """Build the critic network and its target copy.

    A state branch and an action branch, each a single ``Dense`` layer of
    width ``hidden_layers[0]``, are concatenated. The remaining entries of
    ``hidden_layers`` are stacked on top, followed by a one-unit output
    layer. The result is stored on ``self.critic`` and a clone of it on
    ``self.target_critic``; nothing is returned.
    """
    first_width = self.hidden_layers[0]

    # Branch fed with the environment state.
    state_branch = self.Sequential()
    state_branch.add(
        self.Dense(first_width,
                   input_shape=(self.env_spec['state_dim'], ),
                   activation=self.hidden_layers_activation,
                   init='lecun_uniform'))

    # Branch fed with the action.
    action_branch = self.Sequential()
    action_branch.add(
        self.Dense(first_width,
                   input_shape=(self.env_spec['action_dim'], ),
                   activation=self.hidden_layers_activation,
                   init='lecun_uniform'))

    # Join both branches and use the merge as the model's first layer.
    merged_input = self.Merge([state_branch, action_branch], mode='concat')
    critic = self.Sequential()
    critic.add(merged_input)

    # Remaining hidden layers; the slice is empty when only one is configured.
    for width in self.hidden_layers[1:]:
        critic.add(
            self.Dense(width,
                       init='lecun_uniform',
                       activation=self.hidden_layers_activation))

    # One output unit.
    critic.add(
        self.Dense(1,
                   init='lecun_uniform',
                   activation=self.output_layer_activation))

    logger.info('Critic model summary')
    critic.summary()

    self.critic = critic
    self.target_critic = clone_model(self.critic)
def build_model(self):
    """Build the critic, its target clone, and the TensorFlow training ops.

    Sets on ``self``: the online and target models, their input/output
    tensors and trainable weights, the per-weight soft-update ops
    (``update_target_network_op``), the target placeholder ``y``, the MSE
    ``loss``, the Adam ``optimize`` op and ``action_gradient`` (gradient of
    the output with respect to the action input).

    Returns:
        The online critic model (``self.model``).
    """
    tf = self.tf

    self.model = self.build_critic_models()
    self.target_model = clone_model(self.model)

    # Tensors and weights of the online critic.
    self.critic_states, self.critic_actions = (self.model.inputs[0],
                                               self.model.inputs[1])
    self.out = self.model.output
    self.network_params = self.model.trainable_weights

    # Tensors and weights of the target critic.
    self.target_critic_states, self.target_critic_actions = (
        self.target_model.inputs[0], self.target_model.inputs[1])
    self.target_out = self.target_model.output
    self.target_network_params = self.target_model.trainable_weights

    # Soft update, one assign op per target weight:
    #   target <- tau * online + (1 - tau) * target
    self.update_target_network_op = [
        target_w.assign(
            tf.multiply(self.tau, self.network_params[idx]) +
            tf.multiply(1. - self.tau, target_w))
        for idx, target_w in enumerate(self.target_network_params)
    ]

    # Custom loss and optimization op; `y` is the fed regression target.
    self.y = tf.placeholder(tf.float32, [None, 1])
    self.loss = tf.losses.mean_squared_error(self.y, self.out)
    adam = tf.train.AdamOptimizer(self.critic_lr)
    self.optimize = adam.minimize(self.loss)

    # Gradient of the critic output w.r.t. the action input.
    self.action_gradient = tf.gradients(self.out, self.critic_actions)
    return self.model
def build_model(self):
    """Create the critic model, a cloned target model and the training ops.

    Besides the two models this stores the input/output tensors and
    trainable weights of both, a list of soft-update assign ops, the target
    placeholder ``y``, the mean-squared-error ``loss``, the Adam ``optimize``
    op and the gradient of the output with respect to the action input.

    Returns:
        ``self.model``, the online critic.
    """
    self.model = self.build_critic_models()
    self.target_model = clone_model(self.model)

    online, target = self.model, self.target_model

    # Online critic handles.
    self.critic_states = online.inputs[0]
    self.critic_actions = online.inputs[1]
    self.out = online.output
    self.network_params = online.trainable_weights

    # Target critic handles.
    self.target_critic_states = target.inputs[0]
    self.target_critic_actions = target.inputs[1]
    self.target_out = target.output
    self.target_network_params = target.trainable_weights

    # Op for updating target network: blend each target weight towards the
    # matching online weight with mixing factor tau.
    soft_updates = []
    self.update_target_network_op = soft_updates
    for position, target_weight in enumerate(self.target_network_params):
        online_part = self.tf.multiply(self.tau,
                                       self.network_params[position])
        target_part = self.tf.multiply(1. - self.tau, target_weight)
        soft_updates.append(target_weight.assign(online_part + target_part))

    # Custom loss and optimization op.
    self.y = self.tf.placeholder(self.tf.float32, [None, 1])
    self.loss = self.tf.losses.mean_squared_error(self.y, self.out)
    optimizer = self.tf.train.AdamOptimizer(self.critic_lr)
    self.optimize = optimizer.minimize(self.loss)

    self.action_gradient = self.tf.gradients(self.out, self.critic_actions)
    return self.model
def build_model(self):
    """Build the base model, then clone it into a second model.

    The parent ``build_model`` is expected to populate ``self.model``; the
    clone is logged via ``summary()`` and stored as ``self.model_2``.

    Returns:
        Tuple ``(self.model, self.model_2)``.
    """
    super(DoubleDQN, self).build_model()

    second_model = clone_model(self.model)
    logger.info("Model 2 summary")
    second_model.summary()
    self.model_2 = second_model

    logger.info("Models 1 and 2 built")
    models = (self.model, self.model_2)
    return models
def build_actor_models(self):
    """Build the actor network and its target copy.

    Hidden layers are added by ``self.build_hidden_layers``; the output
    layer has ``env_spec['action_dim']`` units. The model is stored on
    ``self.actor`` and a clone of it on ``self.target_actor``; nothing is
    returned.
    """
    actor = self.Sequential()
    self.build_hidden_layers(actor)

    # Output layer: one unit per action dimension.
    output_layer = self.Dense(self.env_spec['action_dim'],
                              init='lecun_uniform',
                              activation=self.output_layer_activation)
    actor.add(output_layer)

    logger.info('Actor model summary')
    actor.summary()

    self.actor = actor
    self.target_actor = clone_model(self.actor)
def test_clone_sequential_model():
    """A cloned Sequential model must predict the same values as its source."""
    compile_kwargs = dict(optimizer='sgd', loss='mse')

    original = Sequential()
    original.add(Dense(8, input_shape=(3, )))
    original.compile(**compile_kwargs)

    duplicate = clone_model(original)
    duplicate.compile(**compile_kwargs)

    # One random batch of 4 samples with 3 features, fed to both models.
    batch = np.random.random((4, 3))
    expected = original.predict_on_batch(batch)
    actual = duplicate.predict_on_batch(batch)

    assert expected.shape == actual.shape
    assert_allclose(expected, actual)
def test_clone_sequential_model():
    """Check that `clone_model` reproduces a Sequential model's predictions."""
    source_model = Sequential()
    source_model.add(Dense(8, input_shape=(3,)))
    source_model.compile(optimizer='sgd', loss='mse')

    cloned_model = clone_model(source_model)
    cloned_model.compile(optimizer='sgd', loss='mse')

    # Both models see the same random (4, 3) batch.
    samples = np.random.random((4, 3))
    source_pred, cloned_pred = (source_model.predict_on_batch(samples),
                                cloned_model.predict_on_batch(samples))

    assert source_pred.shape == cloned_pred.shape
    assert_allclose(source_pred, cloned_pred)
def __init__(self,
             model,
             env,
             policy,
             target_model_update=1,
             gamma=.99,
             processor=None):
    """Store the agent's collaborators and create a target model.

    Args:
        model: The model to train; it is cloned into ``self.target_model``.
        env: Stored as ``self.env``.
        policy: Stored as ``self.policy``.
        target_model_update: Stored as ``self.target_model_update``.
        gamma: Stored as ``self.gamma`` (presumably the discount factor;
            confirm against the training code).
        processor: Optional object stored as ``self.processor``.
    """
    # Online model and its clone used as the target.
    self.model = model
    self.target_model = clone_model(self.model)
    self.target_model_update = target_model_update

    # Remaining collaborators and hyper-parameters.
    self.env = env
    self.processor = processor
    self.gamma = gamma
    self.policy = policy
def test_clone_graph_model():
    """A cloned two-input functional model must match its source's output."""
    compile_kwargs = dict(optimizer='sgd', loss='mse')

    # Two inputs, concatenated and passed through one Dense layer.
    left_in = Input(shape=(2, ))
    right_in = Input(shape=(3, ))
    dense = Dense(8)
    merged = merge([left_in, right_in], mode='concat')
    output = dense(merged)

    original = Model([left_in, right_in], output)
    original.compile(**compile_kwargs)

    duplicate = clone_model(original)
    duplicate.compile(**compile_kwargs)

    # One random batch per input, 4 samples each.
    batch = [np.random.random((4, 2)), np.random.random((4, 3))]
    expected = original.predict_on_batch(batch)
    actual = duplicate.predict_on_batch(batch)

    assert expected.shape == actual.shape
    assert_allclose(expected, actual)
def test_clone_graph_model():
    """Check that `clone_model` reproduces a multi-input Model's predictions."""
    # Build: (in_a, in_b) -> Concatenate -> Dense(8).
    in_a = Input(shape=(2,))
    in_b = Input(shape=(3,))
    dense = Dense(8)
    joined = Concatenate()([in_a, in_b])
    head = dense(joined)

    source_model = Model([in_a, in_b], head)
    source_model.compile(optimizer='sgd', loss='mse')

    cloned_model = clone_model(source_model)
    cloned_model.compile(optimizer='sgd', loss='mse')

    # Same random inputs for both models.
    samples = [np.random.random((4, 2)), np.random.random((4, 3))]
    source_pred = source_model.predict_on_batch(samples)
    cloned_pred = cloned_model.predict_on_batch(samples)

    assert source_pred.shape == cloned_pred.shape
    assert_allclose(source_pred, cloned_pred)
def compile(self, optimizer, metrics=None):
    """Compile the online, target and trainable models.

    The loss is computed inside a ``Lambda`` layer wrapping
    ``self.clipped_masked_quantile_error`` so that it can be masked per
    action. The resulting two-output model is stored as
    ``self.trainable_model`` and ``self.compiled`` is set to ``True``.

    Args:
        optimizer: Optimizer used to compile the trainable model.
        metrics: Optional iterable of additional metrics. ``self.mean_q``
            and ``self.max_q`` are always appended. The caller's object is
            copied, never modified.
    """
    # Work on a copy. The previous `metrics=[]` default was extended in
    # place with `+=`, so the default metrics piled up in the shared default
    # list (and in any list passed by the caller) on every call.
    metrics = [] if metrics is None else list(metrics)
    metrics += [self.mean_q, self.max_q]  # register default metrics

    # We never train the target model, hence we can set the optimizer and
    # loss arbitrarily.
    self.target_model = clone_model(self.model, self.custom_model_objects)
    self.target_model.compile(optimizer='sgd', loss='mse')
    self.model.compile(optimizer='sgd', loss='mse')

    # Create trainable model. The problem is that we need to mask the output
    # since we only ever want to update the Q values for a certain action.
    # The way we achieve this is by using a custom Lambda layer that computes
    # the loss. This gives us the necessary flexibility to mask out certain
    # parameters by passing in multiple inputs to the Lambda layer.
    y_pred = self.model.output
    tau = self.model.input[1]
    y_true = Input(name='y_true',
                   shape=(self.nb_sampled_quantiles, self.nb_actions, ))
    mask = Input(name='mask', shape=(self.nb_actions, ))
    loss_out = Lambda(self.clipped_masked_quantile_error,
                      output_shape=(1, ),
                      name='loss')([y_true, y_pred, tau, mask])

    model_input = self.model.input
    ins = model_input if isinstance(model_input, list) else [model_input]
    trainable_model = Model(inputs=ins + [y_true, mask],
                            outputs=[loss_out, y_pred])
    assert len(trainable_model.output_names) == 2

    # Metrics are attached to the second output (the raw prediction) only.
    combined_metrics = {trainable_model.output_names[1]: metrics}
    losses = [
        lambda y_true, y_pred: y_pred,  # loss is computed in Lambda layer
        lambda y_true, y_pred: K.zeros_like(y_pred),  # only for the metrics
    ]
    trainable_model.compile(optimizer=optimizer,
                            loss=losses,
                            metrics=combined_metrics)
    self.trainable_model = trainable_model

    self.compiled = True
def build_model(self):
    """Build the actor, its target clone, and the TensorFlow training ops.

    Sets on ``self``: the online and target models, their input/output
    tensors, outputs scaled by ``env_spec['action_bound_high']``, trainable
    weights, the per-weight soft-update ops, the ``action_gradient``
    placeholder, ``actor_gradients`` and the Adam ``optimize`` op.

    Returns:
        The online actor model (``self.model``).
    """
    tf = self.tf
    action_bound = self.env_spec['action_bound_high']

    self.model = super(Actor, self).build_model()
    self.target_model = clone_model(self.model)

    # Online actor tensors.
    self.actor_states = self.model.inputs[0]
    self.out = self.model.output
    self.scaled_out = tf.multiply(self.out, action_bound)
    self.network_params = self.model.trainable_weights

    # Target actor tensors.
    self.target_actor_states = self.target_model.inputs[0]
    self.target_out = self.target_model.output
    self.target_scaled_out = tf.multiply(self.target_out, action_bound)
    self.target_network_params = self.target_model.trainable_weights

    # Soft update, one assign op per target weight:
    #   target <- tau * online + (1 - tau) * target
    self.update_target_network_op = [
        target_w.assign(
            tf.multiply(self.tau, self.network_params[idx]) +
            tf.multiply(1. - self.tau, target_w))
        for idx, target_w in enumerate(self.target_network_params)
    ]

    # Placeholder to be fed with the critic's gradients w.r.t. the action.
    self.action_gradient = tf.placeholder(
        tf.float32, [None, self.env_spec['action_dim']])

    # Actor gradients, weighted by the negated critic gradients.
    self.actor_gradients = tf.gradients(self.scaled_out,
                                        self.network_params,
                                        -self.action_gradient)

    # Optimization op.
    grads_and_vars = zip(self.actor_gradients, self.network_params)
    self.optimize = tf.train.AdamOptimizer(self.lr).apply_gradients(
        grads_and_vars)
    return self.model
def build_model(self):
    """Create the actor model, a cloned target model and the training ops.

    Stores both models, their state inputs, raw and scaled outputs
    (multiplied by ``env_spec['action_bound_high']``), trainable weights,
    the soft-update assign ops, the ``action_gradient`` placeholder, the
    actor gradients and the Adam ``optimize`` op on ``self``.

    Returns:
        ``self.model``, the online actor.
    """
    self.model = super(Actor, self).build_model()
    self.target_model = clone_model(self.model)

    online, target = self.model, self.target_model

    # Online actor handles.
    self.actor_states = online.inputs[0]
    self.out = online.output
    self.scaled_out = self.tf.multiply(self.out,
                                       self.env_spec['action_bound_high'])
    self.network_params = online.trainable_weights

    # Target actor handles.
    self.target_actor_states = target.inputs[0]
    self.target_out = target.output
    self.target_scaled_out = self.tf.multiply(
        self.target_out, self.env_spec['action_bound_high'])
    self.target_network_params = target.trainable_weights

    # Op for updating target network: blend each target weight towards the
    # matching online weight with mixing factor tau.
    soft_updates = []
    self.update_target_network_op = soft_updates
    for position, target_weight in enumerate(self.target_network_params):
        online_part = self.tf.multiply(self.tau,
                                       self.network_params[position])
        target_part = self.tf.multiply(1. - self.tau, target_weight)
        soft_updates.append(target_weight.assign(online_part + target_part))

    # Will be fed as self.action_gradient: critic_grads.
    action_dim = self.env_spec['action_dim']
    self.action_gradient = self.tf.placeholder(self.tf.float32,
                                               [None, action_dim])

    # Actor model gradient op, to be fed from critic.
    self.actor_gradients = self.tf.gradients(
        self.scaled_out, self.network_params, -self.action_gradient)

    # Optimization op.
    optimizer = self.tf.train.AdamOptimizer(self.lr)
    self.optimize = optimizer.apply_gradients(
        zip(self.actor_gradients, self.network_params))
    return self.model
def __init__(self,
             policy,
             model,
             action_provider,
             memory,
             goal: Goal,
             temporal_offsets,
             batch_size=64,
             target_model_update=100,
             default_measurements=None):
    """
    :param policy: stored as ``self.policy``.
    :param model: model to train; its inputs need to be of the form
        [observation, action, goal_params]. A clone is kept as
        ``self.target_model``.
    :param action_provider: stored as ``self.action_provider``.
    :param memory: stored as ``self.memory``.
    :param goal: stored as ``self.goal``.
    :param temporal_offsets: list of ascending ints indicating the temporal
        offsets.
    :param batch_size: stored as ``self.batch_size``.
    :param target_model_update: stored as ``self.target_model_update``.
    :param default_measurements: Used to fill the future_measurements, iff
        episode ended before. If 'None', the latest observed measurements
        before episode end will be used.
    """
    # Collaborators.
    self.policy = policy
    self.action_provider = action_provider
    self.memory = memory
    self.goal = goal

    # Online model and its target clone (cloned with an empty second
    # argument).
    self.model = model
    self.target_model = clone_model(self.model, {})

    # Hyper-parameters.
    self.batch_size = batch_size
    self.temporal_offsets = temporal_offsets
    self.target_model_update = target_model_update
    self.default_measurements = default_measurements

    # Mutable run-time state.
    self.step = 0
    self.samples = []
    self.current_metrics = []
def test_mlp_distributional_network_with_prior(self):
    """Exercise ``NetworkMLPDistributional`` with a prior network.

    Checks output shapes, that layers whose name contains 'prior' and that
    own weights are frozen, that equal ``tau`` values give equal quantile
    outputs, the cosine embedding values, the tau/state merge, that the
    model can be cloned, and repeats the core checks for
    ``window_length=5``.
    """
    net = NetworkMLPDistributional(3, 4, nb_hidden_layers=2, nb_hidden_neurons=100, duel=False, prior=True,
                                   prior_scale_factor=1, nb_quantiles=8, nb_cos_embeddings=64)
    self.assertTrue(net.model.trainable)
    for layer in net.model.layers:
        if 'prior' in layer.name and layer.weights:
            self.assertFalse(layer.trainable)
    state = np.random.rand(32, 1, 3)
    tau = np.random.rand(32, 1, 8)
    net_input = [state, tau]
    out = net.model.predict(net_input)
    self.assertEqual(np.shape(out), (32, 8, 4))

    # Equal values of tau -> equal Z_tau
    state = np.random.rand(1, 1, 3)
    tau = np.random.rand(1, 1, 8)
    for i in range(0, 8):
        tau[:, :, i] = tau[:, :, 0]
    net_input = [state, tau]
    out = net.model.predict(net_input)
    # Take the absolute value of the difference *before* comparing. The old
    # form applied np.abs to the boolean result, so any negative difference
    # passed.
    self.assertTrue((np.abs(out[0, 0, :] - out[0, :, :]) < 1e-6).all())

    # Cos embedding
    state = np.random.rand(1, 1, 3)
    tau = np.zeros((1, 1, 8))
    tau[0, 0, 0] = 0
    tau[0, 0, 1] = 1/64
    tau[0, 0, 2] = 0.5
    net_input = [state, tau]
    cos_embedding_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('cos_tau').output)
    cos_embedding_output = cos_embedding_layer.predict(net_input)
    self.assertTrue((cos_embedding_output[0, 0, :] == 1).all())
    self.assertTrue(all(np.isclose(cos_embedding_output[0, 1, i], np.cos(np.pi*i*1/64), atol=1e-7)
                        for i in range(cos_embedding_output.shape[2])))

    # Merge
    state = np.random.rand(1, 1, 3)
    tau = np.random.rand(1, 1, 8)
    net_input = [state, tau]
    tau_net_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('fc_tau_trainable').output)
    state_net_layer = K.Model(inputs=net.model.inputs,
                              outputs=net.model.get_layer('fc_state_extra_dim_trainable').output)
    merge_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('merge_trainable').output)
    tau_net_output = tau_net_layer.predict(net_input)
    state_net_output = state_net_layer.predict(net_input)
    merge_output = merge_layer.predict(net_input)
    self.assertTrue(np.isclose(tau_net_output[None, :, 0, :] * state_net_output,
                               merge_output[:, 0, :]).all())

    plot_model(net.model, to_file='mlp_distributional_with_prior.png', show_shapes=True)

    # Test clone model, mainly to see that no custom objects are missing
    state = np.random.rand(1, 1, 3)
    tau = np.random.rand(1, 1, 8)
    net_input = [state, tau]
    target_model = clone_model(net.model)
    target_model.compile(optimizer='sgd', loss='mse')
    out = net.model.predict(net_input)
    out_clone = target_model.predict(net_input)
    self.assertTrue((out == out_clone).all())

    # Window length > 1
    net = NetworkMLPDistributional(3, 4, nb_hidden_layers=2, nb_hidden_neurons=100, duel=False, prior=True,
                                   prior_scale_factor=1, nb_quantiles=8, nb_cos_embeddings=64, window_length=5)
    self.assertTrue(net.model.trainable)
    for layer in net.model.layers:
        if 'prior' in layer.name and layer.weights:
            self.assertFalse(layer.trainable)
    state = np.random.rand(32, 5, 3)
    tau = np.random.rand(32, 1, 8)
    net_input = [state, tau]
    out = net.model.predict(net_input)
    self.assertEqual(np.shape(out), (32, 8, 4))

    # Equal values of tau should give equal values of Z_tau
    state = np.random.rand(1, 5, 3)
    tau = np.random.rand(1, 1, 8)
    for i in range(0, 8):
        tau[:, :, i] = tau[:, 0, 0]
    net_input = [state, tau]
    out = net.model.predict(net_input)
    self.assertTrue((np.abs(out[0, 0, :] - out[0, :, :]) < 1e-6).all())

    plot_model(net.model, to_file='mlp_window_5_distributional_with_prior.png', show_shapes=True)
def __init__(self, env: gym.Env, **kwargs):
    """Build and compile the actor/critic networks and set hyper-parameters.

    The actor and critic are separate models. Each takes an observation
    input of shape ``(1,) + env.observation_space.shape`` and flattens it
    into Dense layers of 256 and 128 units. The actor ends in a ``tanh``
    layer with one unit per action, the critic in a single linear unit.
    Target copies of both are cloned, and because the target update rate is
    below 1 the optimizers are wrapped so they also soft-update the targets.

    Args:
        env: Environment whose action and observation spaces size the
            networks.
        **kwargs: Forwarded to the parent constructor.
    """
    super(MACE, self).__init__(**kwargs)

    self.nb_actions = env.action_space.shape[0]
    observation_shape = (1, ) + env.observation_space.shape

    # NOTE: the layer creation order below is kept as-is; Keras derives
    # default layer names from it.
    obs_input_actor = Input(shape=observation_shape, name='observation_input')
    actor_trunk = Flatten()(obs_input_actor)
    actor_trunk = Dense(units=256, activation='relu')(actor_trunk)

    obs_input_critic = Input(shape=observation_shape, name='observation_input')
    critic_trunk = Flatten()(obs_input_critic)
    critic_trunk = Dense(units=256, activation='relu')(critic_trunk)

    critic_hidden = Dense(units=128, activation='relu')(critic_trunk)
    value = Dense(units=1)(critic_hidden)

    actor_hidden = Dense(units=128, activation='relu')(actor_trunk)
    action = Dense(units=self.nb_actions, activation='tanh')(actor_hidden)

    actor = Model(inputs=obs_input_actor, outputs=action)
    critic = Model(inputs=obs_input_critic, outputs=value)

    critic_metrics = [mean_q]

    critic_optimizer = Adam(lr=1e-3)
    actor_optimizer = Adam(lr=1e-3)
    # Alternative that was tried: SGD(lr=1e-4, momentum=0.9) for the critic
    # and SGD(lr=1e-3, momentum=0.9) for the actor.

    self.actor = actor
    self.critic = critic

    # Target networks are only used feed-forward, so any optimizer/loss works.
    self.target_actor = clone_model(self.actor)
    self.target_actor.compile(optimizer='sgd', loss='mse')
    self.target_critic = clone_model(self.critic)
    self.target_critic.compile(optimizer='sgd', loss='mse')

    # Alternative that was tried: 500.
    self.target_model_update = 1e-3
    if self.target_model_update < 1.:
        # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update
        # the target model.
        critic_updates = get_soft_target_model_updates(
            self.target_critic, self.critic, self.target_model_update)
        critic_optimizer = AdditionalUpdatesOptimizer(critic_optimizer,
                                                      critic_updates)
        actor_updates = get_soft_target_model_updates(
            self.target_actor, self.actor, self.target_model_update)
        actor_optimizer = AdditionalUpdatesOptimizer(actor_optimizer,
                                                     actor_updates)

    self.delta_clip = np.inf

    # NOTE(review): this closure is defined but never passed to `compile`
    # below; both models are compiled with 'mse'.
    def clipped_error(y_true, y_pred):
        return K.mean(huber_loss(y_true, y_pred, self.delta_clip), axis=-1)

    actor.compile(actor_optimizer, loss='mse')
    critic.compile(critic_optimizer, loss='mse', metrics=critic_metrics)
    self.compiled = True

    # Replay memories.
    self.memory = SequentialMemory(limit=100000, window_length=1)
    self.memory_interval = 1
    self.memory_actor = SequentialMemory(limit=100000, window_length=1)
    self.memory_critic = SequentialMemory(limit=100000, window_length=1)

    # Training hyper-parameters.
    self.nb_steps_warmup = 50000
    self.train_interval = 4
    self.batch_size = 64
    self.gamma = 0.99
    self.processor = None
    self.random_process = OrnsteinUhlenbeckProcess(theta=.15,
                                                   mu=0.,
                                                   sigma=.3,
                                                   size=self.nb_actions)
    self.eps = 0.9
def compile(self, optimizer, metrics=None):
    """Compile actor, critic and target networks and build the actor train fn.

    Args:
        optimizer: A single optimizer (used for the actor and cloned for the
            critic) or a list/tuple of exactly two, the first for the actor
            and the second for the critic. String identifiers are resolved
            through ``optimizers.get``.
        metrics: Optional iterable of additional metrics. ``mean_q`` is
            always appended. The caller's object is copied, never modified.

    Raises:
        ValueError: If ``optimizer`` is a list/tuple whose length is not 2.
    """
    # Work on a copy. The previous `metrics=[]` default was extended in
    # place with `+=`, so `mean_q` piled up in the shared default list (and
    # in any list passed by the caller) on every call.
    metrics = [] if metrics is None else list(metrics)
    metrics += [mean_q]

    if isinstance(optimizer, (list, tuple)):
        if len(optimizer) != 2:
            # NOTE(review): the message says "more than two" but this also
            # fires for fewer than two; text kept as-is for compatibility.
            raise ValueError(
                'More than two optimizers provided. Please only provide a maximum of two optimizers, '
                'the first one for the actor and the second one for the critic.')
        actor_optimizer, critic_optimizer = optimizer
    else:
        actor_optimizer = optimizer
        critic_optimizer = clone_optimizer(optimizer)
    if isinstance(actor_optimizer, str):
        actor_optimizer = optimizers.get(actor_optimizer)
    if isinstance(critic_optimizer, str):
        critic_optimizer = optimizers.get(critic_optimizer)
    assert actor_optimizer != critic_optimizer

    # NOTE(review): `mean_q` has already been appended here, so this
    # "pair of metric lists" branch looks hard to reach; confirm the
    # intended order.
    if len(metrics) == 2 and hasattr(metrics[0], '__len__') and hasattr(
            metrics[1], '__len__'):
        actor_metrics, critic_metrics = metrics
    else:
        actor_metrics = critic_metrics = metrics

    def clipped_error(y_true, y_pred):
        return K.mean(huber_loss(y_true, y_pred, self.delta_clip), axis=-1)

    # Compile target networks. We only use them in feed-forward mode, hence
    # we can pass any optimizer and loss since we never use it anyway.
    self.target_actor = clone_model(self.actor, self.custom_model_objects)
    self.target_actor.compile(optimizer='sgd', loss='mse')
    self.target_critic = clone_model(self.critic, self.custom_model_objects)
    self.target_critic.compile(optimizer='sgd', loss='mse')

    # We also compile the actor. We never optimize the actor using Keras but
    # instead compute the policy gradient ourselves. However, we need the
    # actor in feed-forward mode, hence we also compile it with any
    # optimizer and loss.
    self.actor.compile(optimizer='sgd', loss='mse')

    # Compile the critic.
    if self.target_model_update < 1.:
        # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update
        # the target model.
        critic_updates = get_soft_target_model_updates(
            self.target_critic, self.critic, self.target_model_update)
        critic_optimizer = AdditionalUpdatesOptimizer(critic_optimizer,
                                                      critic_updates)
    self.critic.compile(optimizer=critic_optimizer,
                        loss=clipped_error,
                        metrics=critic_metrics)

    # Combine actor and critic so that we can get the policy gradient.
    # Assuming critic's state inputs are the same as actor's.
    combined_inputs = []
    state_inputs = []
    for i in self.critic.input:
        if i == self.critic_action_input:
            # Placeholder slot, replaced by the actor's output below.
            combined_inputs.append([])
        else:
            combined_inputs.append(i)
            state_inputs.append(i)
    combined_inputs[self.critic_action_input_idx] = self.actor(state_inputs)

    combined_output = self.critic(combined_inputs)

    # Maximise the critic's value by minimising its negated mean.
    updates = actor_optimizer.get_updates(
        params=self.actor.trainable_weights, loss=-K.mean(combined_output))
    if self.target_model_update < 1.:
        # Include soft target model updates.
        updates += get_soft_target_model_updates(self.target_actor,
                                                 self.actor,
                                                 self.target_model_update)
    updates += self.actor.updates  # include other updates of the actor, e.g. for BN

    # Finally, combine it all into a callable function.
    if K.backend() == 'tensorflow':
        self.actor_train_fn = K.function(
            state_inputs + [K.learning_phase()],
            [self.actor(state_inputs)],
            updates=updates)
    else:
        if self.uses_learning_phase:
            state_inputs += [K.learning_phase()]
        self.actor_train_fn = K.function(state_inputs,
                                         [self.actor(state_inputs)],
                                         updates=updates)
    self.actor_optimizer = actor_optimizer

    self.compiled = True
def update_target_model(self):
    """Replace ``self.model_2`` with a fresh clone of ``self.model``."""
    # Also, loading logic seems off
    refreshed = clone_model(self.model)
    self.model_2 = refreshed
    logger.debug("Updated target model weights")
def test_cnn_distributional_network(self):
    """Exercise ``NetworkCNNDistributional`` without dueling or prior.

    Checks output shapes, that swapping two 8-wide slices of the state
    (columns 7:15 and 15:23) leaves the output unchanged, that equal
    ``tau`` values give equal quantile outputs, the cosine embedding values,
    the tau/state merge, that the model can be cloned, and repeats the core
    checks for ``window_length=5``.
    """
    net = NetworkCNNDistributional(nb_ego_states=7, nb_states_per_vehicle=4, nb_vehicles=10, nb_actions=9,
                                   nb_conv_layers=2, nb_conv_filters=32, nb_hidden_fc_layers=2,
                                   nb_hidden_neurons=100, duel=False, prior=False, nb_quantiles=8,
                                   nb_cos_embeddings=64)
    self.assertTrue(net.model.trainable)
    state = np.random.rand(32, 1, 47)
    tau = np.random.rand(32, 1, 8)
    net_input = [state, tau]
    out = net.model.predict(net_input)
    self.assertEqual(np.shape(out), (32, 8, 9))

    # Swapping two state slices must not change the output.
    # NOTE(review): state1 has batch size 1 while tau has 32 - confirm intended.
    state1 = np.random.rand(1, 1, 47)
    tau = np.random.rand(32, 1, 8)
    input1 = [state1, tau]
    out1 = net.model.predict(input1)
    input2 = [np.copy(state1), np.copy(tau)]
    input2[0][0, 0, 7:15] = input1[0][0, 0, 15:23]
    input2[0][0, 0, 15:23] = input1[0][0, 0, 7:15]
    self.assertFalse((input1[0] == input2[0]).all())
    out2 = net.model.predict(input2)
    self.assertTrue((out1 == out2).all())

    # Equal values of tau -> equal Z_tau
    state = np.random.rand(1, 1, 47)
    tau = np.random.rand(1, 1, 8)
    for i in range(0, 8):
        tau[:, :, i] = tau[:, :, 0]
    net_input = [state, tau]
    out = net.model.predict(net_input)
    # Take the absolute value of the difference *before* comparing. The old
    # form applied np.abs to the boolean result, so any negative difference
    # passed.
    self.assertTrue((np.abs(out[0, 0, :] - out[0, :, :]) < 1e-6).all())

    # Cos embedding
    state = np.random.rand(1, 1, 47)
    tau = np.zeros((1, 1, 8))
    tau[0, 0, 0] = 0
    tau[0, 0, 1] = 1/64
    tau[0, 0, 2] = 0.5
    net_input = [state, tau]
    cos_embedding_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('cos_tau').output)
    cos_embedding_output = cos_embedding_layer.predict(net_input)
    self.assertTrue((cos_embedding_output[0, 0, :] == 1).all())
    self.assertTrue(all(np.isclose(cos_embedding_output[0, 1, i], np.cos(np.pi*i*1/64), atol=1e-7)
                        for i in range(cos_embedding_output.shape[2])))

    # Merge
    state = np.random.rand(1, 1, 47)
    tau = np.random.rand(1, 1, 8)
    net_input = [state, tau]
    tau_net_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('fc_tau').output)
    state_net_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('merged_extra_dim').output)
    merge_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('merge').output)
    tau_net_output = tau_net_layer.predict(net_input)
    state_net_output = state_net_layer.predict(net_input)
    merge_output = merge_layer.predict(net_input)
    self.assertTrue(np.isclose(tau_net_output[None, :, 0, :] * state_net_output, merge_output[:, 0, :]).all())

    plot_model(net.model, to_file='cnn_distributional.png', show_shapes=True)

    # Test clone model, mainly to see that no custom objects are missing
    state = np.random.rand(1, 1, 47)
    tau = np.random.rand(1, 1, 8)
    net_input = [state, tau]
    target_model = clone_model(net.model)
    target_model.compile(optimizer='sgd', loss='mse')
    out = net.model.predict(net_input)
    out_clone = target_model.predict(net_input)
    self.assertTrue((out == out_clone).all())

    # Window length > 1
    net = NetworkCNNDistributional(nb_ego_states=7, nb_states_per_vehicle=4, nb_vehicles=10, nb_actions=9,
                                   nb_conv_layers=2, nb_conv_filters=32, nb_hidden_fc_layers=2,
                                   nb_hidden_neurons=100, duel=False, prior=False, nb_quantiles=8,
                                   nb_cos_embeddings=64, window_length=5)
    self.assertTrue(net.model.trainable)
    state = np.random.rand(32, 5, 47)
    tau = np.random.rand(32, 1, 8)
    net_input = [state, tau]
    out = net.model.predict(net_input)
    self.assertEqual(np.shape(out), (32, 8, 9))

    state1 = np.random.rand(1, 5, 47)
    tau = np.random.rand(32, 1, 8)
    input1 = [state1, tau]
    out1 = net.model.predict(input1)
    input2 = [np.copy(state1), np.copy(tau)]
    input2[0][0, :, 7:15] = input1[0][0, :, 15:23]
    input2[0][0, :, 15:23] = input1[0][0, :, 7:15]
    self.assertFalse((input1[0] == input2[0]).all())
    out2 = net.model.predict(input2)
    self.assertTrue((out1 == out2).all())

    # Equal values of tau should give equal values of Z_tau
    state = np.random.rand(1, 5, 47)
    tau = np.random.rand(1, 1, 8)
    for i in range(0, 8):
        tau[:, :, i] = tau[:, 0, 0]
    net_input = [state, tau]
    out = net.model.predict(net_input)
    self.assertTrue((np.abs(out[0, 0, :] - out[0, :, :]) < 1e-6).all())

    plot_model(net.model, to_file='cnn_window_5_distributional.png', show_shapes=True)
def test_cnn_dueling_distributional_with_prior(self):
    """Exercise ``NetworkCNNDistributional`` with dueling and a prior network.

    Checks output shapes, that swapping two 8-wide slices of the state
    (columns 7:15 and 15:23) leaves the output unchanged, that equal
    ``tau`` values give equal quantile outputs, the cosine embedding values,
    the tau/state merge, that the model can be cloned, that prior layers
    are frozen, the dueling aggregation, and repeats the core checks for
    ``window_length=5``.
    """
    net = NetworkCNNDistributional(nb_ego_states=7, nb_states_per_vehicle=4, nb_vehicles=10, nb_actions=9,
                                   nb_conv_layers=2, nb_conv_filters=32, nb_hidden_fc_layers=2,
                                   nb_hidden_neurons=100, duel=True, prior=True, prior_scale_factor=1,
                                   nb_quantiles=8, nb_cos_embeddings=64)
    self.assertTrue(net.model.trainable)
    state = np.random.rand(32, 1, 47)
    tau = np.random.rand(32, 1, 8)
    net_input = [state, tau]
    out = net.model.predict(net_input)
    self.assertEqual(np.shape(out), (32, 8, 9))

    # Swapping two state slices must not change the output.
    # NOTE(review): state1 has batch size 1 while tau has 32 - confirm intended.
    state1 = np.random.rand(1, 1, 47)
    tau = np.random.rand(32, 1, 8)
    input1 = [state1, tau]
    out1 = net.model.predict(input1)
    input2 = [np.copy(state1), np.copy(tau)]
    input2[0][0, 0, 7:15] = input1[0][0, 0, 15:23]
    input2[0][0, 0, 15:23] = input1[0][0, 0, 7:15]
    self.assertFalse((input1[0] == input2[0]).all())
    out2 = net.model.predict(input2)
    self.assertTrue((out1 == out2).all())

    # Equal values of tau -> equal Z_tau
    state = np.random.rand(1, 1, 47)
    tau = np.random.rand(1, 1, 8)
    for i in range(0, 8):
        tau[:, :, i] = tau[:, :, 0]
    net_input = [state, tau]
    out = net.model.predict(net_input)
    # Take the absolute value of the difference *before* comparing. The old
    # form applied np.abs to the boolean result, so any negative difference
    # passed.
    self.assertTrue((np.abs(out[0, 0, :] - out[0, :, :]) < 1e-6).all())

    # Cos embedding
    state = np.random.rand(1, 1, 47)
    tau = np.zeros((1, 1, 8))
    tau[0, 0, 0] = 0
    tau[0, 0, 1] = 1/64
    tau[0, 0, 2] = 0.5
    net_input = [state, tau]
    cos_embedding_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('cos_tau').output)
    cos_embedding_output = cos_embedding_layer.predict(net_input)
    self.assertTrue((cos_embedding_output[0, 0, :] == 1).all())
    self.assertTrue(all(np.isclose(cos_embedding_output[0, 1, i], np.cos(np.pi*i*1/64), atol=1e-7)
                        for i in range(cos_embedding_output.shape[2])))

    # Merge
    state = np.random.rand(1, 1, 47)
    tau = np.random.rand(1, 1, 8)
    net_input = [state, tau]
    tau_net_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('fc_tau_trainable').output)
    state_net_layer = K.Model(inputs=net.model.inputs,
                              outputs=net.model.get_layer('merged_extra_dim_trainable').output)
    merge_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('merge_trainable').output)
    tau_net_output = tau_net_layer.predict(net_input)
    state_net_output = state_net_layer.predict(net_input)
    merge_output = merge_layer.predict(net_input)
    self.assertTrue(np.isclose(tau_net_output[None, :, 0, :] * state_net_output, merge_output[:, 0, :]).all())

    plot_model(net.model, to_file='cnn_duel_distributional_with_prior.png', show_shapes=True)

    # Test clone model, mainly to see that no custom objects are missing
    state = np.random.rand(1, 1, 47)
    tau = np.random.rand(1, 1, 8)
    net_input = [state, tau]
    target_model = clone_model(net.model)
    target_model.compile(optimizer='sgd', loss='mse')
    out = net.model.predict(net_input)
    out_clone = target_model.predict(net_input)
    self.assertTrue((out == out_clone).all())

    # Prior nets not trainable
    self.assertTrue(net.model.trainable)
    for layer in net.model.layers:
        if 'prior' in layer.name and layer.weights:
            self.assertFalse(layer.trainable)

    net.model.get_config()  # This crashes for custom lambda layers

    # Dueling: output = first channel + remaining channels - mean of the
    # remaining channels.
    before_dueling_layer = K.Model(inputs=net.model.inputs,
                                   outputs=net.model.get_layer('output_trainable_wo_dueling').output)
    before_dueling_output = before_dueling_layer.predict(net_input)
    after_dueling_layer = K.Model(inputs=net.model.inputs,
                                  outputs=net.model.get_layer('output_trainable').output)
    after_dueling_output = after_dueling_layer.predict(net_input)
    true_dueling_output = (before_dueling_output[:, :, 0, None] + before_dueling_output[:, :, 1:] -
                           np.mean(before_dueling_output[:, :, 1:, None], axis=2))
    self.assertTrue(np.isclose(after_dueling_output, true_dueling_output).all())

    # Predicting the first sample on its own must match the batched result.
    single_input = [net_input[0][None, 0], net_input[1][None, 0]]
    self.assertTrue(np.isclose(out[0], net.model.predict(single_input)).all())

    # Window length > 1
    net = NetworkCNNDistributional(nb_ego_states=7, nb_states_per_vehicle=4, nb_vehicles=10, nb_actions=9,
                                   nb_conv_layers=2, nb_conv_filters=32, nb_hidden_fc_layers=2,
                                   nb_hidden_neurons=100, duel=True, prior=True, prior_scale_factor=1,
                                   nb_quantiles=8, nb_cos_embeddings=64, window_length=5)
    self.assertTrue(net.model.trainable)
    state = np.random.rand(32, 5, 47)
    tau = np.random.rand(32, 1, 8)
    net_input = [state, tau]
    out = net.model.predict(net_input)
    self.assertEqual(np.shape(out), (32, 8, 9))

    state1 = np.random.rand(1, 5, 47)
    tau = np.random.rand(32, 1, 8)
    input1 = [state1, tau]
    out1 = net.model.predict(input1)
    input2 = [np.copy(state1), np.copy(tau)]
    input2[0][0, :, 7:15] = input1[0][0, :, 15:23]
    input2[0][0, :, 15:23] = input1[0][0, :, 7:15]
    self.assertFalse((input1[0] == input2[0]).all())
    out2 = net.model.predict(input2)
    self.assertTrue((out1 == out2).all())

    # Equal values of tau should give equal values of Z_tau
    state = np.random.rand(1, 5, 47)
    tau = np.random.rand(1, 1, 8)
    for i in range(0, 8):
        tau[:, :, i] = tau[:, 0, 0]
    net_input = [state, tau]
    out = net.model.predict(net_input)
    self.assertTrue((np.abs(out[0, 0, :] - out[0, :, :]) < 1e-6).all())

    plot_model(net.model, to_file='cnn_window_5_duel_distributional_with_prior.png', show_shapes=True)
def compile(self):
    """Compile all networks and build the TensorFlow training ops.

    Creates the target actor/critic clones, compiles every Keras model with
    a placeholder optimizer, builds Adam train ops for critic and actor,
    optional soft-update ops for the targets, scalar summaries, and then
    initializes only those variables that have not been initialized yet.
    Sets ``self.compiled`` to ``True`` when done.
    """
    # (A Keras-style `clipped_error` loss based on `huber_loss` used to be
    # defined here; the huber loss is now applied directly to the critic
    # loss below.)

    # Compile target networks. We only use them in feed-forward mode, hence
    # we can pass any optimizer and loss since we never use it anyway.
    self.target_actor = clone_model(self.actor, self.custom_model_objects)
    self.target_actor.compile(optimizer='sgd', loss='mse')
    self.target_critic = clone_model(self.critic, self.custom_model_objects)
    self.target_critic.compile(optimizer='sgd', loss='mse')

    # We also compile the actor. We never optimize the actor using Keras but
    # instead compute the policy gradient ourselves. However, we need the
    # actor in feed-forward mode, hence we also compile it with any optimizer.
    self.actor.compile(optimizer='sgd', loss='mse')
    # Compile the critic for the same reason.
    self.critic.compile(optimizer='sgd', loss='mse')

    # ---- Critic optimizer ----
    critic_optimizer = tf.train.AdamOptimizer()
    self.critic_target = tf.placeholder(dtype=tf.float32, shape=(None, 1))
    # Clip the critic gradient using the huber loss.
    critic_prediction = self.critic([self.state, self.action])
    critic_loss = K.mean(
        huber_loss(critic_prediction, self.critic_target, self.delta_clip))
    critic_gradient_vars = critic_optimizer.compute_gradients(
        critic_loss, var_list=self.critic.trainable_weights)
    # Per-variable gradient norms and their sum, used as metrics.
    critic_gradients_norms = [
        tf.norm(pair[0]) for pair in critic_gradient_vars
    ]
    critic_gradient_norm = tf.reduce_sum(critic_gradients_norms)
    self.critic_train_fn = critic_optimizer.apply_gradients(
        critic_gradient_vars)

    # ---- Target critic optimizer ----
    if self.target_critic_update < 1.:
        # Include soft target model updates.
        self.target_critic_train_fn = get_soft_target_model_ops(
            self.target_critic.weights, self.critic.weights,
            self.target_critic_update)

    # ---- Target actor optimizer ----
    if self.target_actor_update < 1.:
        # Include soft target model updates.
        self.target_actor_train_fn = get_soft_target_model_ops(
            self.target_actor.weights, self.actor.weights,
            self.target_actor_update)

    # ---- Actor optimizer ----
    actor_optimizer = tf.train.AdamOptimizer()
    # Be careful to negate the gradient, since the optimizer wants to
    # minimize the value.
    actor_loss = -tf.reduce_mean(
        self.critic([self.state, self.actor(self.state)]))
    actor_gradient_vars = actor_optimizer.compute_gradients(
        actor_loss, var_list=self.actor.trainable_weights)

    # Gradient inverting as described in https://arxiv.org/abs/1511.04143
    if self.invert_gradients:
        actor_gradient_vars = [
            (gradient_inverter(pair[0], self.gradient_inverter_min,
                               self.gradient_inverter_max), pair[1])
            for pair in actor_gradient_vars
        ]

    # Per-variable gradient norms and their sum, used as metrics.
    actor_gradients_norms = [
        tf.norm(pair[0]) for pair in actor_gradient_vars
    ]
    actor_gradient_norm = tf.reduce_sum(actor_gradients_norms)

    # The actual train function.
    self.actor_train_fn = actor_optimizer.apply_gradients(
        actor_gradient_vars)

    # ---- Collect metrics ----
    self.critic_summaries.append(
        tf.summary.scalar("critic/loss", critic_loss))
    self.critic_summaries.append(
        tf.summary.scalar("critic/gradient", critic_gradient_norm))
    for weight, weight_norm in zip(self.critic.trainable_weights,
                                   critic_gradients_norms):
        self.critic_summaries.append(
            tf.summary.scalar("critic/{}".format(weight.name), weight_norm))

    self.actor_summaries.append(
        tf.summary.scalar("actor/loss", -actor_loss))
    self.actor_summaries.append(
        tf.summary.scalar("actor/gradient", actor_gradient_norm))
    for weight, weight_norm in zip(self.actor.trainable_weights,
                                   actor_gradients_norms):
        self.actor_summaries.append(
            tf.summary.scalar("actor/{}".format(weight.name), weight_norm))

    # FIXME: Use directly Keras backend.
    # This is a kind of a hack, taken from the "initialize_variables" of the
    # Keras Tensorflow backend:
    # https://github.com/fchollet/keras/blob/master/keras/backend/tensorflow_backend.py#L330
    # It only initializes variables that are not already initialized.
    # Without that, the networks and target networks get initialized again,
    # to different values (stochastic initialization). This is a problem when
    # a network and its target network do not begin with the same parameter
    # values.
    uninitialized_variables = []
    for variable in tf.global_variables():
        if getattr(variable, '_keras_initialized', False):
            continue
        uninitialized_variables.append(variable)
        variable._keras_initialized = True
    self.session.run(tf.variables_initializer(uninitialized_variables))
    # (Deliberately not tf.global_variables_initializer(), see above.)

    self.merged_summary = tf.summary.merge_all()

    self.compiled = True