def test_mlp_dueling_network(self):
        """Dueling distributional MLP without prior net: check output shape,
        the dueling aggregation, batch independence, and that equal taus
        produce equal quantile values."""
        net = NetworkMLPDistributional(3, 4, nb_hidden_layers=2, nb_hidden_neurons=100, duel=True, prior=False,
                                       nb_quantiles=8, nb_cos_embeddings=64)
        self.assertTrue(net.model.trainable)
        state = np.random.rand(32, 1, 3)
        tau = np.random.rand(32, 1, 8)
        net_input = [state, tau]
        out = net.model.predict(net_input)
        self.assertEqual(np.shape(out), (32, 8, 4))

        # The layer before the final dueling aggregation holds [value, advantages].
        before_dueling_layer = K.Model(inputs=net.model.inputs, outputs=net.model.layers[-2].output)
        before_dueling_output = before_dueling_layer.predict(net_input)
        # Dueling ('avg' type): Q = V + A - mean(A). The trailing None keeps the
        # reduced axis so the broadcast matches (the mean term is keepdims-style).
        true_output = before_dueling_output[:, :, 0, None] + before_dueling_output[:, :, 1:] - \
                      np.mean(before_dueling_output[:, :, 1:, None], axis=2)
        self.assertTrue(np.isclose(out, true_output).all())

        # A single sample should give the same output as inside a batch.
        single_input = [net_input[0][None, 0], net_input[1][None, 0]]
        self.assertTrue(np.isclose(out[0], net.model.predict(single_input)).all())

        state = np.random.rand(1, 1, 3)
        tau = np.random.rand(1, 1, 8)
        for i in range(0, 7):
            tau[:, :, i+1] = tau[:, :, 0]
        net_input = [state, tau]
        out = net.model.predict(net_input)
        # Bug fix: np.abs must wrap the difference, not the boolean comparison —
        # otherwise a large negative deviation would still pass the test.
        self.assertTrue((np.abs(out[0, 0, :] - out[0, :, :]) < 1e-6).all())   # Equal values of tau -> equal Z_tau

        plot_model(net.model, to_file='mlp_duel_distributional.png', show_shapes=True)
 def test_trainable_weights_mlp_prior(self):
     """Training must only update the trainable head of a prior-equipped
     network: frozen layer weights and the prior-branch output stay fixed,
     while the trainable-branch output changes."""
     net = NetworkMLPDistributional(5, 3, nb_hidden_layers=2, nb_hidden_neurons=64, duel=True, prior=True,
                                    prior_scale_factor=1, nb_quantiles=8, nb_cos_embeddings=64)
     net.model.compile(loss='mse', optimizer='adam')
     x = np.random.rand(100, 1, 5)
     tau = np.random.rand(100, 1, 8)
     y = np.random.rand(100, 8, 3)
     net_input = [x, tau]
     # Keep an untrained copy of the model for before/after comparison.
     initial_model = copy.deepcopy(net.model)
     net.model.fit(net_input, y, epochs=10, batch_size=100, verbose=0)

     # Every non-trainable layer must keep its pre-training weights exactly.
     for layer_before, layer_after in zip(initial_model.layers, net.model.layers):
         if layer_after.trainable:
             continue
         for w_before, w_after in zip(layer_before.get_weights(), layer_after.get_weights()):
             self.assertTrue((w_before == w_after).all())

     def branch_output(model, layer_name):
         # Evaluate the named intermediate layer on the first sample.
         fn = K.backend.function([model.get_layer('state_input').input, model.get_layer('tau_input').input],
                                 model.get_layer(layer_name).output)
         return fn([x[0, :, :], tau[0, :, :]])[0]

     # Prior branch is frozen -> identical output before and after training.
     prior_out_initial = branch_output(initial_model, 'output_prior')
     prior_out = branch_output(net.model, 'output_prior')
     self.assertTrue((prior_out_initial == prior_out).all())
     # Trainable branch must have changed at every output element.
     trainable_out_initial = branch_output(initial_model, 'output_trainable')
     trainable_out = branch_output(net.model, 'output_trainable')
     self.assertTrue((trainable_out_initial != trainable_out).all())
 def __init__(self, *args, **kwargs):
     """Build the dueling IQN agent (MLP body, no prior net) shared by the tests."""
     super(Tester, self).__init__(*args, **kwargs)
     # Distributional MLP: 10 inputs, 4 actions, 32 quantiles, 64 cos embeddings.
     self.model = NetworkMLPDistributional(
         nb_inputs=10, nb_outputs=4, nb_hidden_layers=2, nb_hidden_neurons=100,
         nb_quantiles=32, nb_cos_embeddings=64, duel=True, prior=False,
         activation='relu', duel_type='avg', window_length=1).model
     # Epsilon annealed 1.0 -> 0.1 over 10k steps; fully greedy at test time.
     self.policy = LinearAnnealedPolicy(
         DistributionalEpsGreedyPolicy(eps=None),
         attr='eps', value_max=1., value_min=0.1, value_test=.0, nb_steps=10000)
     self.test_policy = DistributionalEpsGreedyPolicy(eps=0)
     self.memory = SequentialMemory(limit=10000, window_length=1)
     self.agent = IQNAgent(
         model=self.model, policy=self.policy, test_policy=self.test_policy,
         enable_double_dqn=True, nb_samples_policy=32, nb_sampled_quantiles=32,
         cvar_eta=1, nb_actions=4, memory=self.memory, gamma=0.99, batch_size=32,
         nb_steps_warmup=1000, train_interval=1, memory_interval=1,
         target_model_update=1000, delta_clip=10)
    def test_mlp_duel_distributional_with_prior_network(self):
        """Dueling distributional MLP with a prior network: prior layers with
        weights are frozen, output shape and the dueling aggregation are
        correct, and equal taus give equal quantile values."""
        net = NetworkMLPDistributional(3, 4, nb_hidden_layers=2, nb_hidden_neurons=100, duel=True, prior=True,
                                       prior_scale_factor=1, nb_quantiles=8, nb_cos_embeddings=64)
        self.assertTrue(net.model.trainable)
        for layer in net.model.layers:
            # Only prior layers that actually hold weights must be frozen.
            # (Simplified the double negative 'not not layer.weights' to a
            # plain truthiness test — empty weight lists are falsy.)
            if 'prior' in layer.name and layer.weights:
                self.assertFalse(layer.trainable)

        state = np.random.rand(32, 1, 3)
        tau = np.random.rand(32, 1, 8)
        net_input = [state, tau]
        out = net.model.predict(net_input)
        self.assertEqual(np.shape(out), (32, 8, 4))

        net.model.get_config()  # This crashes for custom lambda layers

        # Check the dueling aggregation on the trainable branch: Q = V + A - mean(A).
        before_dueling_layer = K.Model(inputs=net.model.inputs,
                                       outputs=net.model.get_layer('output_trainable_wo_dueling').output)
        before_dueling_output = before_dueling_layer.predict(net_input)
        after_dueling_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('output_trainable').output)
        after_dueling_output = after_dueling_layer.predict(net_input)
        true_dueling_output = before_dueling_output[:, :, 0, None] + before_dueling_output[:, :, 1:] - \
                              np.mean(before_dueling_output[:, :, 1:, None], axis=2)
        self.assertTrue((after_dueling_output == true_dueling_output).all())

        # A single sample should give the same output as inside a batch.
        single_input = [net_input[0][None, 0], net_input[1][None, 0]]
        self.assertTrue(np.isclose(out[0], net.model.predict(single_input)).all())

        state = np.random.rand(1, 1, 3)
        tau = np.random.rand(1, 1, 8)
        for i in range(0, 7):
            tau[:, :, i+1] = tau[:, :, 0]
        net_input = [state, tau]
        out = net.model.predict(net_input)
        # Bug fix: np.abs must wrap the difference, not the boolean comparison —
        # otherwise a large negative deviation would still pass the test.
        self.assertTrue((np.abs(out[0, 0, :] - out[0, :, :]) < 1e-6).all())   # Equal values of tau -> equal Z_tau

        plot_model(net.model, to_file='mlp_duel_distributional_with_prior.png', show_shapes=True)
    def test_mlp_distributional_network(self):
        """Non-dueling distributional MLP: output shape, tau consistency,
        cosine embedding values, the state/tau merge, model cloning, and
        window_length > 1 support."""
        net = NetworkMLPDistributional(3, 4, nb_hidden_layers=2, nb_hidden_neurons=100, duel=False, prior=False,
                                       nb_quantiles=8, nb_cos_embeddings=64)
        self.assertTrue(net.model.trainable)
        state = np.random.rand(32, 1, 3)
        tau = np.random.rand(32, 1, 8)
        net_input = [state, tau]
        out = net.model.predict(net_input)
        self.assertEqual(np.shape(out), (32, 8, 4))

        state = np.random.rand(1, 1, 3)
        tau = np.random.rand(1, 1, 8)
        for i in range(0, 8):
            tau[:, :, i] = tau[:, :, 0]
        net_input = [state, tau]
        out = net.model.predict(net_input)
        # Bug fix: np.abs must wrap the difference, not the boolean comparison —
        # otherwise a large negative deviation would still pass the test.
        self.assertTrue((np.abs(out[0, 0, :] - out[0, :, :]) < 1e-6).all())   # Equal values of tau -> equal Z_tau

        # Cos embedding: cos(pi * i * tau); tau = 0 gives all ones.
        state = np.random.rand(1, 1, 3)
        tau = np.zeros((1, 1, 8))
        tau[0, 0, 0] = 0
        tau[0, 0, 1] = 1/64
        tau[0, 0, 2] = 0.5
        net_input = [state, tau]
        cos_embedding_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('cos_tau').output)
        cos_embedding_output = cos_embedding_layer.predict(net_input)
        self.assertTrue((cos_embedding_output[0, 0, :] == 1).all())
        self.assertTrue(all([np.isclose(cos_embedding_output[0, 1, i], np.cos(np.pi*i*1/64), atol=1e-7)
                             for i in range(cos_embedding_output.shape[2])]))

        # Merge: the merged layer should be the elementwise product of the
        # tau embedding net output and the state net output.
        state = np.random.rand(1, 1, 3)
        tau = np.random.rand(1, 1, 8)
        net_input = [state, tau]
        tau_net_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('fc_tau').output)
        state_net_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('fc_state_extra_dim').output)
        merge_layer = K.Model(inputs=net.model.inputs, outputs=net.model.get_layer('merge').output)
        tau_net_output = tau_net_layer.predict(net_input)
        state_net_output = state_net_layer.predict(net_input)
        merge_output = merge_layer.predict(net_input)
        self.assertTrue(np.isclose(tau_net_output[None, :, 0, :] * state_net_output, merge_output[:, 0, :]).all())

        plot_model(net.model, to_file='mlp_distributional.png', show_shapes=True)

        # Test clone model, mainly to see that no custom objects are missing
        state = np.random.rand(1, 1, 3)
        tau = np.random.rand(1, 1, 8)
        net_input = [state, tau]
        target_model = clone_model(net.model)
        target_model.compile(optimizer='sgd', loss='mse')
        out = net.model.predict(net_input)
        out_clone = target_model.predict(net_input)
        self.assertTrue((out == out_clone).all())

        # Window length > 1
        net = NetworkMLPDistributional(3, 4, nb_hidden_layers=2, nb_hidden_neurons=100, duel=False, prior=False,
                                       nb_quantiles=8, nb_cos_embeddings=64, window_length=5)
        self.assertTrue(net.model.trainable)
        state = np.random.rand(32, 5, 3)
        tau = np.random.rand(32, 1, 8)
        net_input = [state, tau]
        out = net.model.predict(net_input)
        self.assertEqual(np.shape(out), (32, 8, 4))

        state = np.random.rand(1, 5, 3)
        tau = np.random.rand(1, 1, 8)
        for i in range(0, 8):
            tau[:, :, i] = tau[:, 0, 0]
        net_input = [state, tau]
        out = net.model.predict(net_input)
        # Same abs-placement fix as above.
        self.assertTrue(
            (np.abs(out[0, 0, :] - out[0, :, :]) < 1e-6).all())  # Equal values of tau should give equal values of Z_tau

        plot_model(net.model, to_file='mlp_window_5_distributional.png', show_shapes=True)
                 duel=p.agent_par['duel_q'],
                 prior=True,
                 activation='relu',
                 duel_type='avg',
                 window_length=p.agent_par["window_length"],
                 prior_scale_factor=p.agent_par["prior_scale_factor"]).
             model)
     else:
         models.append(
             NetworkMLPDistributional(
                 nb_observations,
                 nb_actions,
                 nb_hidden_layers=p.agent_par['nb_hidden_fc_layers'],
                 nb_hidden_neurons=p.agent_par['nb_hidden_neurons'],
                 nb_quantiles=p.agent_par['nb_quantiles'],
                 nb_cos_embeddings=p.agent_par['nb_cos_embeddings'],
                 duel=p.agent_par['duel_q'],
                 prior=True,
                 activation='relu',
                 duel_type='avg',
                 window_length=p.agent_par["window_length"],
                 prior_scale_factor=p.agent_par["prior_scale_factor"]).
             model)
 print(models[0].summary())
 plot_model(models[0],
            to_file=save_path + "/" + 'model.png',
            show_shapes=True)
 model_as_string = models[0].to_json()
 greedy_policy = DistributionalEpsGreedyPolicy(eps=0)
 test_policy = DistributionalEnsembleTestPolicy()
 memory = BootstrappingMemory(
     nb_nets=p.agent_par['number_of_networks'],
    def test_trainable_model(self):
        """End-to-end check of the compiled trainable model: its per-sample
        loss output matches a NumPy reimplementation of the quantile Huber
        loss, and the reported training metrics agree with direct
        computations on the predictions."""
        nb_inputs = 10
        nb_actions = 5
        nb_quantiles = 32
        batch_size = 64
        delta_clip = 1
        model = NetworkMLPDistributional(nb_inputs=nb_inputs, nb_outputs=nb_actions, nb_hidden_layers=2,
                                         nb_hidden_neurons=100, nb_quantiles=nb_quantiles, nb_cos_embeddings=64,
                                         duel=True, prior=False, activation='relu', duel_type='avg',
                                         window_length=1).model
        policy = LinearAnnealedPolicy(DistributionalEpsGreedyPolicy(eps=None), attr='eps', value_max=1.,
                                      value_min=0.1, value_test=.0, nb_steps=10000)
        test_policy = DistributionalEpsGreedyPolicy(eps=0)
        memory = SequentialMemory(limit=10000, window_length=1)
        agent = IQNAgent(model=model, policy=policy, test_policy=test_policy, enable_double_dqn=True,
                         nb_samples_policy=nb_quantiles, nb_sampled_quantiles=nb_quantiles, cvar_eta=1,
                         nb_actions=nb_actions, memory=memory, gamma=0.99, batch_size=1,
                         nb_steps_warmup=1000, train_interval=1, memory_interval=1,
                         target_model_update=1000, delta_clip=delta_clip)

        agent.compile(Adam(lr=0.01))
        plot_model(agent.trainable_model, to_file='trainable_model_2.png', show_shapes=True)

        # Random test batch.
        states = np.random.rand(batch_size, 1, nb_inputs)
        actions = np.random.randint(nb_actions, size=batch_size)
        quantiles = np.random.rand(batch_size, 1, nb_quantiles)
        targets = np.random.rand(batch_size, nb_quantiles)

        predictions = agent.model.predict_on_batch([states, quantiles])

        def huber(deltas, quantile):
            # Quantile Huber loss for a single TD error (scalar deltas).
            abs_deltas = np.abs(deltas)
            if abs_deltas < delta_clip:
                loss = 0.5 * deltas**2
            else:
                loss = delta_clip * (abs_deltas - 0.5 * delta_clip)
            # Asymmetric quantile weighting: tau above, (1 - tau) below.
            weight = quantile if deltas > 0 else 1 - quantile
            loss *= weight / delta_clip
            if loss < 0:
                raise Exception("Loss should always be positive")
            return loss

        # Reference loss computed directly from the definition: mean over the
        # sampled quantiles of the pairwise quantile Huber terms.
        true_loss = np.zeros(batch_size)
        for idx in range(batch_size):
            true_loss[idx] = sum(
                huber(targets[idx, j] - predictions[idx, i, actions[idx]],
                      quantiles[idx, 0, i])
                for i in range(nb_quantiles)
                for j in range(nb_quantiles)) / nb_quantiles

        # One-hot action masks and targets broadcast to the chosen actions.
        masks = np.zeros((batch_size, nb_actions))
        masks[range(batch_size), actions] = 1
        targets_expanded = np.zeros((batch_size, nb_quantiles, nb_actions))
        targets_expanded[range(batch_size), :, actions] = targets[range(batch_size), :]
        out = agent.trainable_model.predict_on_batch(
            [states, quantiles, targets_expanded, masks])

        # Output 0 is the per-sample loss, output 1 the raw predictions.
        self.assertTrue(np.isclose(true_loss, out[0]).all())
        self.assertTrue((predictions == out[1]).all())

        metrics = agent.trainable_model.train_on_batch(
            [states, quantiles, targets_expanded, masks],
            [targets, targets_expanded])
        self.assertTrue(np.isclose(np.mean(true_loss), metrics[0]))

        # Metrics 3 and 4 should be the mean Q-value and the mean max-Q-value
        # of the pre-update predictions.
        average_q_value = np.mean(predictions)
        average_max_q_value = np.mean(np.max(np.mean(predictions, axis=1), axis=-1))
        self.assertTrue(np.isclose(average_q_value, metrics[3]))
        self.assertTrue(np.isclose(average_max_q_value, metrics[4]))
    def test_quantile_regression(self):
        """Train the agent on targets drawn uniformly from {10, 22, 35} and
        check that the learned quantile function approaches a step function:
        the z-values in each third of the tau range should converge to the
        corresponding target value."""
        nb_inputs = 10
        nb_actions = 3
        nb_quantiles = 32
        batch_size = 64
        delta_clip = 1
        model = NetworkMLPDistributional(nb_inputs=nb_inputs,
                                         nb_outputs=nb_actions,
                                         nb_hidden_layers=2,
                                         nb_hidden_neurons=100,
                                         nb_quantiles=nb_quantiles,
                                         nb_cos_embeddings=64,
                                         duel=True,
                                         prior=False,
                                         activation='relu',
                                         duel_type='avg',
                                         window_length=1).model
        policy = LinearAnnealedPolicy(DistributionalEpsGreedyPolicy(eps=1),
                                      attr='eps',
                                      value_max=1.,
                                      value_min=0.1,
                                      value_test=.0,
                                      nb_steps=10000)
        test_policy = DistributionalEpsGreedyPolicy(eps=0)
        memory = SequentialMemory(limit=10000, window_length=1)
        agent = IQNAgent(model=model,
                         policy=policy,
                         test_policy=test_policy,
                         enable_double_dqn=True,
                         nb_samples_policy=nb_quantiles,
                         nb_sampled_quantiles=nb_quantiles,
                         cvar_eta=1,
                         nb_actions=nb_actions,
                         memory=memory,
                         gamma=0.99,
                         batch_size=batch_size,
                         nb_steps_warmup=1000,
                         train_interval=1,
                         memory_interval=1,
                         target_model_update=1000,
                         delta_clip=delta_clip)

        agent.compile(Adam(lr=0.0001))
        plot_model(agent.trainable_model,
                   to_file='trainable_model_2.png',
                   show_shapes=True)

        # Fixed states and actions for the whole training run.
        states = np.random.rand(batch_size, 1, nb_inputs)
        actions = np.random.randint(nb_actions, size=batch_size)
        test_quantiles = np.linspace(0, 1, nb_quantiles)

        # The one-hot action masks depend only on the fixed actions, so they
        # are built once outside the training loop (previously rebuilt every
        # iteration).
        masks = np.zeros((batch_size, nb_actions))
        masks[range(batch_size), actions] = 1

        for i in range(3000):
            quantiles = np.random.rand(batch_size, 1, nb_quantiles)
            targets = np.random.choice([10, 22, 35], batch_size)
            targets = np.repeat(targets[:, None], nb_quantiles, axis=1)

            targets_expanded = np.zeros((batch_size, nb_quantiles, nb_actions))
            targets_expanded[range(batch_size), :,
                             actions] = targets[range(batch_size), :]

            # Removed unused predict_on_batch calls that previously ran every
            # iteration (3000 wasted forward passes); only the training step
            # has an observable effect.
            agent.trainable_model.train_on_batch(
                [states, quantiles, targets_expanded, masks],
                [targets, targets_expanded])

            # Periodically evaluate the quantile function on an even tau grid.
            if np.mod(i, 100) == 0:
                z_values = agent.model.predict_on_batch(
                    [states, test_quantiles[None, None, :]])

        # The targets are equally likely, so each third of the tau range
        # should settle near one target value (interior indices only, to
        # avoid the transitions between steps).
        self.assertTrue(np.abs(np.mean(z_values[:, 1:10, :]) - 10) < 1.0)
        self.assertTrue(np.abs(np.mean(z_values[:, 12:20, :]) - 22) < 1.0)
        self.assertTrue(np.abs(np.mean(z_values[:, 23:31, :]) - 35) < 1.0)