import random
from collections import deque

import numpy as np
from tensorflow.keras.utils import to_categorical  # or keras.utils, depending on the project

# DQNBrain, Agent and GameState are project-specific classes and are assumed
# to be importable from the surrounding package.
class DeepQLearningAgent(Agent):
    def __init__(
        self,
        hidden_layers: int,
        neurons_per_hidden_layer: int,
        action_space_size: int,
        alpha: float = 0.01,
        gamma: float = 0.999,
        epsilon: float = 0.1,
    ):
        self.Q = DQNBrain(output_dim=action_space_size,
                          learning_rate=alpha,
                          hidden_layers_count=hidden_layers,
                          neurons_per_hidden_layer=neurons_per_hidden_layer)
        self.action_space_size = action_space_size
        self.s = None
        self.a = None
        self.r = None
        self.gamma = gamma
        self.epsilon = epsilon

    def act(self, gs: GameState) -> int:
        available_actions = gs.get_available_actions(gs.get_active_player())

        state_vec = gs.get_vectorized_state()
        predicted_Q_values = self.Q.predict(state_vec)

        # epsilon-greedy choice restricted to the currently legal actions
        if np.random.random() <= self.epsilon:
            chosen_action = np.random.choice(available_actions)
        else:
            chosen_action = available_actions[int(
                np.argmax(predicted_Q_values[available_actions]))]

        # online Q-learning update of the previous transition:
        # target = r + gamma * max over the legal actions of Q(s', a')
        if self.s is not None:
            target = self.r + self.gamma * max(
                predicted_Q_values[available_actions])
            self.Q.train(self.s, self.a, target)

        self.s = state_vec
        self.a = to_categorical(chosen_action, self.action_space_size)
        self.r = 0.0

        return chosen_action

    def observe(self, r: float, t: bool, player_index: int):
        if self.r is None:
            return

        self.r += r

        if t:
            target = self.r
            self.Q.train(self.s, self.a, target)
            self.s = None
            self.a = None
            self.r = None
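
# A minimal usage sketch of the act/observe loop these agents expect. The
# environment methods is_game_over(), apply_action() and get_reward() are
# hypothetical stand-ins for whatever the surrounding project provides; only
# act() and observe() reflect the classes in this file.
def run_episode(agent: Agent, gs: GameState) -> None:
    while not gs.is_game_over():              # hypothetical terminal check
        player = gs.get_active_player()
        action = agent.act(gs)                # epsilon-greedy choice + online update
        gs.apply_action(action)               # hypothetical state transition
        agent.observe(gs.get_reward(player),  # hypothetical reward accessor
                      gs.is_game_over(), player)
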
class DeepQLearningExperienceReplayAgent(Agent):
    def __init__(
        self,
        action_space_size: int,
        alpha: float = 0.01,
        gamma: float = 0.999,
        epsilon: float = 0.1,
    ):
        self.Q = DQNBrain(output_dim=action_space_size,
                          learning_rate=alpha,
                          hidden_layers_count=2,
                          neurons_per_hidden_layer=128)
        self.action_space_size = action_space_size
        self.s = None
        self.a = None
        self.r = None
        self.experience = deque(maxlen=20)
        self.gamma = gamma
        self.epsilon = epsilon

    def act(self, gs: GameState) -> int:
        available_actions = gs.get_available_actions(gs.get_active_player())

        state_vec = gs.get_vectorized_state()
        predicted_Q_values = self.Q.predict(state_vec)

        if np.random.random() <= self.epsilon:
            chosen_action = np.random.choice(available_actions)
        else:
            chosen_action = available_actions[int(
                np.argmax(predicted_Q_values[available_actions]))]

        if self.s is not None:
            # online update of the previous transition, then store it as
            # (state, action, reward, next_state) for replay
            target = self.r + self.gamma * max(
                predicted_Q_values[available_actions])
            self.Q.train(self.s, self.a, target)
            self.experience.append(
                (self.s.copy(), self.a.copy(), self.r, state_vec.copy()))

        # periodically replay the stored transitions; the target bootstraps
        # from the stored next state, not from the stored action
        if len(self.experience) % 10 == 0:
            for el in self.experience:
                target = el[2] + self.gamma * np.max(self.Q.predict(el[3]))
                self.Q.train(el[0], el[1], target)
        self.s = state_vec
        self.a = to_categorical(chosen_action, self.action_space_size)
        self.r = 0.0

        return chosen_action

    def observe(self, r: float, t: bool, player_index: int):
        if self.r is None:
            return

        self.r += r

        if t:
            target = self.r
            self.Q.train(self.s, self.a, target)
            self.s = None
            self.a = None
            self.r = None
class DoubleDeepQLearningAgent(Agent):
    def __init__(
        self,
        action_space_size: int,
        alpha: float = 0.05,
        gamma: float = 0.999,
        epsilon: float = 0.1,
    ):
        self.Q_action = DQNBrain(output_dim=action_space_size,
                                 learning_rate=alpha,
                                 hidden_layers_count=5,
                                 neurons_per_hidden_layer=128)
        self.Q_evaluation = DQNBrain(output_dim=action_space_size,
                                     learning_rate=alpha,
                                     hidden_layers_count=5,
                                     neurons_per_hidden_layer=128)
        self.action_space_size = action_space_size
        self.s = None
        self.a = None
        self.r = None
        self.count_state = 1
        self.gamma = gamma
        self.epsilon = epsilon
        self.tau = 0.01

    def act(self, gs: GameState) -> int:
        available_actions = gs.get_available_actions(gs.get_active_player())

        state_vec = gs.get_vectorized_state()
        predicted_Q_values = self.Q_action.predict(state_vec)

        if np.random.random() <= self.epsilon:
            chosen_action = np.random.choice(available_actions)
        else:
            chosen_action = available_actions[int(
                np.argmax(predicted_Q_values[available_actions]))]

        if self.s is not None:
            # double Q-learning target: the evaluation network selects the
            # greedy action in the new state, the action network supplies its
            # value
            target = self.r + self.gamma * predicted_Q_values[int(
                np.argmax(self.Q_evaluation.predict(state_vec)))]
            self.Q_action.train(self.s, self.a, target)

            # soft (Polyak) update of the evaluation network towards the
            # action network, applied layer by layer with mixing factor tau
            new_weights = [
                self.tau * w_act + (1.0 - self.tau) * w_eval
                for w_act, w_eval in zip(self.Q_action.model.get_weights(),
                                         self.Q_evaluation.model.get_weights())
            ]
            self.Q_evaluation.model.set_weights(new_weights)

        self.s = state_vec
        self.a = to_categorical(chosen_action, self.action_space_size)
        self.r = 0.0
        self.count_state += 1

        return chosen_action

    def observe(self, r: float, t: bool, player_index: int):
        if self.r is None:
            return

        self.r += r

        if t:
            target = self.r
            self.Q_action.train(self.s, self.a, target)
            self.s = None
            self.a = None
            self.r = None
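
# The weight mixing in DoubleDeepQLearningAgent.act() is a Polyak (soft)
# update, theta_eval <- tau * theta_action + (1 - tau) * theta_eval, applied
# layer by layer. A standalone sketch of the same idea; `source` and `target`
# stand for any two Keras models with identical architectures:
def soft_update(source, target, tau: float = 0.01) -> None:
    mixed = [tau * w_s + (1.0 - tau) * w_t
             for w_s, w_t in zip(source.get_weights(), target.get_weights())]
    target.set_weights(mixed)
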
class DDQNAgentWithER(Agent):
    def __init__(
        self,
        hidden_layers: int,
        neurons_per_hidden_layer: int,
        action_space_size: int,
        alpha: float = 0.01,
        gamma: float = 0.999,
        epsilon: float = 1,
        epsilon_decay: float = 0.995,
        epsilon_min: float = 0.01,
        batch_size: int = 32,
    ):
        self.Q = DQNBrain(output_dim=action_space_size,
                          learning_rate=alpha,
                          hidden_layers_count=hidden_layers,
                          neurons_per_hidden_layer=neurons_per_hidden_layer)
        self.alternate_Q = DQNBrain(
            output_dim=action_space_size,
            learning_rate=alpha,
            hidden_layers_count=hidden_layers,
            neurons_per_hidden_layer=neurons_per_hidden_layer)
        self.action_space_size = action_space_size
        self.s = None
        self.a = None
        self.r = None
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.memory = deque(maxlen=20000)
        # seed the memory with a dummy terminal transition so that the first
        # mini-batch can be sampled
        self.remember(self.s, self.s, self.a, self.r, True)

    def remember(self, state, next_state, action, reward, done):
        """
        remember the experience
        :param state:
        :param next_state:
        :param action:
        :param reward:
        :param done:
        :return:
        """
        self.memory.append((state, next_state, action, reward, done))

    def act(self, gs: GameState) -> int:
        available_actions = gs.get_available_actions(gs.get_active_player())
        state_vec = gs.get_vectorized_state()
        predicted_Q_values = self.Q.predict(state_vec)
        # epsilon-greedy choice restricted to the legal actions
        if np.random.random() <= self.epsilon:
            chosen_action = np.random.choice(available_actions)
        else:
            chosen_action = available_actions[int(
                np.argmax(predicted_Q_values[available_actions]))]

        # store the transition that just finished before replaying
        if self.s is not None:
            self.remember(self.s, state_vec, self.a, self.r, False)

        # replay a random mini-batch with the double-DQN target: the online
        # network selects the greedy action (over all actions, since the legal
        # actions of stored states are not recorded) and the alternate network
        # evaluates it
        batch = random.choices(self.memory, k=self.batch_size)
        for state, next_state, action, reward, done in batch:
            if state is None:
                continue  # dummy transition used to seed the memory
            target = reward
            if not done:
                best = int(np.argmax(self.Q.predict(next_state)))
                target = target + self.gamma * self.alternate_Q.predict(
                    next_state)[best]
            self.Q.train(state, action, target)

        self.s = state_vec
        self.a = to_categorical(chosen_action, self.action_space_size)
        self.r = 0.0
        return chosen_action

    def observe(self, r: float, t: bool, player_index: int):
        if self.r is None:
            return

        self.r += r

        if t:
            target = self.r
            self.Q.train(self.s, self.a, target)
            self.s = None
            self.a = None
            self.r = None
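
# Note: DDQNAgentWithER stores epsilon, epsilon_decay and epsilon_min but this
# snippet never updates epsilon. A common schedule (an assumption, not part of
# the original code) decays it at the end of each episode, e.g. when t is True
# in observe():
#     self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
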
class DDQNAgentWithPER(Agent):
    def __init__(
        self,
        hidden_layers: int,
        neurons_per_hidden_layer: int,
        action_space_size: int,
        alpha: float = 0.01,
        gamma: float = 0.999,
        epsilon: float = 1,
        epsilon_decay: float = 0.995,
        epsilon_min: float = 0.01,
        batch_size: int = 32,
    ):
        self.Q = DQNBrain(output_dim=action_space_size,
                          learning_rate=alpha,
                          hidden_layers_count=hidden_layers,
                          neurons_per_hidden_layer=neurons_per_hidden_layer)
        self.alternate_Q = DQNBrain(
            output_dim=action_space_size,
            learning_rate=alpha,
            hidden_layers_count=hidden_layers,
            neurons_per_hidden_layer=neurons_per_hidden_layer)
        self.action_space_size = action_space_size
        self.s = None
        self.a = None
        self.r = None
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.memory = deque(maxlen=20000)
        self.priority = deque(maxlen=20000)

    #     self.remember(self.s, self.s, self.a, self.r, True)
    #
    # def remember(self, state, next_state, action, reward, done):
    #     """
    #     remember the experience
    #     :param state:
    #     :param next_state:
    #     :param action:
    #     :param reward:
    #     :param done:
    #     :return:
    #     """
    #     self.prioritize(state, next_state, action, reward, done)
    #
    # def prioritize(self, state, next_state, action, reward, done, alpha=0.6):
    #     actions = to_categorical(5, self.action_space_size)
    #     q_next = reward + self.gamma * np.max(self.Q.predict(next_state)[actions])
    #     q = self.Q.predict(next_state)[actions]
    #     p = (np.abs(q_next - q) + (np.e ** -10)) ** alpha
    #     self.priority.append(p)
    #     self.memory.append((state, next_state, action, reward, done))

    def get_priority_experience_batch(self):
        # sample transitions with probability proportional to their priority
        priorities = np.array(self.priority)
        prob = priorities / np.sum(priorities)
        sample_indices = random.choices(range(len(prob)),
                                        k=self.batch_size,
                                        weights=prob)
        # importance-sampling weights 1 / (N * P(i)) for the drawn indices
        importance = (1.0 / prob) * (1.0 / len(self.priority))
        importance = importance[sample_indices]
        samples = [self.memory[i] for i in sample_indices]
        return samples, importance
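
    # For reference: get_priority_experience_batch() follows the standard
    # prioritized-replay scheme. Each stored transition i is drawn with
    # probability P(i) = p_i / sum_k p_k, and the bias this introduces is
    # corrected by importance-sampling weights w_i = (1 / (N * P(i)))**beta.
    # The code above computes (1 / prob) * (1 / len(self.priority)), i.e.
    # beta = 1, and act() later raises the weight to (1 - epsilon) instead of
    # annealing a separate beta.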

    def act(self, gs: GameState) -> int:
        available_actions = gs.get_available_actions(gs.get_active_player())
        state_vec = gs.get_vectorized_state()
        predicted_Q_values = self.Q.predict(state_vec)

        if np.random.random() <= self.epsilon:
            chosen_action = np.random.choice(available_actions)
        else:
            chosen_action = available_actions[int(
                np.argmax(predicted_Q_values[available_actions]))]

        if self.s is not None:
            # store the transition that just finished, with a priority
            # proportional to its TD error (the exponent reuses self.alpha)
            best = int(np.argmax(self.Q.predict(state_vec)))
            q_next = self.r + self.gamma * self.alternate_Q.predict(
                state_vec)[best]
            q = self.Q.predict(self.s)[int(np.argmax(self.a))]
            p = (np.abs(q_next - q) + np.e ** -10) ** self.alpha
            self.priority.append(p)
            self.memory.append((self.s, state_vec, self.a, self.r, False))
        else:
            # seed the buffers so the first prioritized batch can be sampled
            self.priority.append(0.001)
            self.memory.append((self.s, state_vec, self.a, self.r, True))

        # replay a prioritized batch with the double-DQN target: the online
        # network selects the greedy action (over all actions, since the
        # legal actions of stored states are not recorded) and the alternate
        # network evaluates it
        batch, importance = self.get_priority_experience_batch()
        for b, i in zip(batch, importance):
            state, next_state, action, reward, done = b
            if state is None:
                continue  # dummy seed transition
            target = reward
            if not done:
                best = int(np.argmax(self.Q.predict(next_state)))
                q_next = reward + self.gamma * self.alternate_Q.predict(
                    next_state)[best]
                target = q_next
                # re-append the replayed transition with an updated priority
                # (a simple stand-in for updating it in place)
                q = self.Q.predict(state)[int(np.argmax(action))]
                p = (np.abs(q_next - q) + np.e ** -10) ** self.alpha
                self.priority.append(p)
                self.memory.append((state, next_state, action, reward, done))
            self.Q.train(state, action, target)
            # importance-sampling correction; computed but not applied because
            # DQNBrain.train() exposes no sample-weight argument in this code base
            imp = i ** (1 - self.epsilon)

        self.s = state_vec
        self.a = to_categorical(chosen_action, self.action_space_size)
        self.r = 0.0
        return chosen_action

    def observe(self, r: float, t: bool, player_index: int):
        if self.r is None:
            return

        self.r += r

        if t:
            target = self.r
            self.Q.train(self.s, self.a, target)
            self.s = None
            self.a = None
            self.r = None