Example #1
    def test_can_fit_model(self):
        """ This test check ability of fitting model in PER to random vector. """
        state_shape = (4, )
        action_space = 2

        model = PrioritizedExperienceReplayTests._create_model(
            state_shape, action_space)
        PER = PrioritizedExperienceReplay(maxlen=1,
                                          model=model,
                                          key_scaling=10,
                                          gamma=1)
        model_wrapper = ModelWrapper(
            model=model, optimizer=K.optimizers.Adam(learning_rate=0.01))
        model_wrapper.compile()

        sample = Sample(action=np.random.randint(0, action_space),
                        state=np.random.rand(state_shape[0]),
                        reward=10,
                        next_state=None)
        PER.add(samples=[sample])

        # The target vector carries the sample's reward at the index of the
        # taken action; fitting toward it should drive the PER loss down.
        history_of_loss = []
        fit_vector = np.zeros((action_space, ))
        fit_vector[sample.action] = sample.reward
        for _ in range(100):
            model_wrapper.fit(sample.state, fit_vector)
            history_of_loss.append(PER._loss_calculate(sample=sample))

        # The loss must strictly decrease after every fit step.
        for previous_loss, current_loss in zip(history_of_loss,
                                               history_of_loss[1:]):
            self.assertGreater(previous_loss, current_loss)
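
A minimal standalone sketch of the same idea, assuming only TensorFlow/Keras and none of the project classes (PrioritizedExperienceReplay, ModelWrapper, Sample are omitted): repeatedly fit a single sample toward a fixed target vector and check that the loss drops.

import numpy as np
from tensorflow import keras as K

model = K.Sequential([
    K.Input(shape=(4, )),
    K.layers.Dense(16, activation='relu'),
    K.layers.Dense(2),
])
model.compile(optimizer=K.optimizers.Adam(learning_rate=0.01), loss='mse')

state = np.random.rand(1, 4)
target = np.zeros((1, 2))
target[0, 1] = 10.0  # the reward, written at the chosen action's index

losses = [
    model.fit(state, target, verbose=0).history['loss'][0]
    for _ in range(100)
]
assert losses[-1] < losses[0]  # fitting the same sample drives the loss down
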
Example #2
class DQN(AbstractAgent):
    def __init__(self,
                 environment: Env,
                 memory: AbstractMemory,
                 policy: AbstractPolicy,
                 model: K.Model,
                 logger: Logger,
                 gamma: float,
                 optimizer: K.optimizers.Optimizer,
                 n_step: int = 1):

        self.model = ModelWrapper(model, optimizer)
        # Compilation is deferred to learn().
        self.current_model = None

        self.gamma = gamma
        self.n_step = n_step

        super(DQN, self).__init__(environment=environment,
                                  memory=memory,
                                  policy=policy,
                                  model=model,
                                  optimizer=optimizer,
                                  logger=logger)

    def _bellman_equation(self, batch: List[Sample]) -> np.ndarray:
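        # Build Bellman targets with a frozen target network:
        #     y_i = r_i                                        if s'_i terminal
        #     y_i = r_i + gamma * max_a' Q_target(s'_i, a')    otherwise
        # self.current_model supplies the baseline Q-values, which are then
        # overwritten at the action actually taken; self.model acts as the
        # periodically synced target network.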
        state = np.array([sample.state for sample in batch])
        q_values = self.current_model.predict(state)

        for idx, sample in enumerate(batch):
            q_values[idx][sample.action] = sample.reward
            if not sample.is_done():
                # One predict call suffices: the best next action's value is
                # the row maximum of the target network's output.
                q_next = self.model.predict(sample.next_state)[0]
                q_values[idx][sample.action] += self.gamma * np.max(q_next)

        return q_values

    def learn(self, epochs: int, batch_size_in_step: int,
              min_n_game_in_exploration: int, batch_size_in_exploration: int,
              change_model_delay: int):

        self.model.compile()
        self.current_model = self.model.clone()
        self.current_model.compile()

        # Seed the replay memory with one full buffer of exploration data.
        eval_score, starting_experience = self._explore_env(self.memory.maxlen)
        self.memory.add(starting_experience)

        for epoch in tqdm(range(epochs), desc='Learning in progress: '):

            if epoch % change_model_delay == 0:
                # Periodically sync the frozen target network from the trained
                # network, point PER at the new target, and gather fresh data.
                self.model = self.current_model.clone()
                self.model.compile()
                if isinstance(self.memory, PrioritizedExperienceReplay):
                    self.memory.update_model(self.model)
                eval_score, batch = self._explore_env(
                    batch_size_in_exploration, min_n_game_in_exploration)
                self.memory.add(batch)
            batch = self.memory.sample(batch_size_in_step)

            q_values = self._bellman_equation(batch)
            state = np.array([sample.state for sample in batch])
            loss = self.current_model.fit(state, q_values)
            self.policy.update()
            self.logger.add_event({
                'loss_value': loss,
                'mean_gain': eval_score,
                'epoch': epoch
            })

    def __str__(self):
        return (f"Agent: {self.__class__.__name__}\n\n"
                f"Discount value: {self.gamma}\n"
                f"N-step: {self.n_step}\n\n"
                f"Environment:\n{self.environment}\n\n"
                f"Memory:\n{self.memory}\n"
                f"Policy:\n{self.policy}")