Example #1
    def get_action(self, task, state, non_normalized_state, epsilon):
        """
        method returns action to take
        """
        if not epsilon:
            # Greedy evaluation: always act from the online network's Q-values.
            q_value = self.model_net.predict(np.array([state]))
        else:
            if np.random.rand() <= self.current_epsilon:
                # Explore: pick a random action (restricted to legal 2048 moves).
                if task.name == "2048-v0":
                    possible_actions = possible_moves(non_normalized_state)
                    while True:
                        rand_action = np.random.randint(0,
                                                        self.action_size,
                                                        size=1)[0]
                        if possible_actions[rand_action] == 1:
                            return rand_action
                else:
                    return np.random.randint(0, self.action_size, size=1)[0]
            else:
                # Exploit: act from the online network's Q-values.
                q_value = self.model_net.predict(np.array([state]))

        if task.name == "2048-v0":
            # Mask illegal 2048 moves: keep pushing the Q-value of an illegal
            # argmax choice down until a legal move wins.
            possible_actions = possible_moves(non_normalized_state)
            while True:
                chosen_action = np.argmax(q_value)
                if possible_actions[chosen_action] == 1:
                    return chosen_action
                else:
                    q_value[0][chosen_action] = -100

        return np.argmax(q_value)
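
Both examples lean on a possible_moves helper imported from elsewhere in the project. The sketch below is one plausible implementation, assuming a 4x4 numpy board of raw tile values and the move order up/right/down/left; the real helper and its move ordering may differ.

import numpy as np


def _row_can_move_left(row):
    """True if sliding this row to the left would change it."""
    for i in range(len(row) - 1):
        if row[i] == 0 and row[i + 1] != 0:       # a gap a tile can slide into
            return True
        if row[i] != 0 and row[i] == row[i + 1]:  # adjacent equal tiles merge
            return True
    return False


def possible_moves(board):
    """Return a 0/1 vector of legal moves, assumed order: [up, right, down, left]."""
    board = np.asarray(board)
    legal = np.zeros(4, dtype=int)
    legal[0] = int(any(_row_can_move_left(col) for col in board.T))        # up
    legal[1] = int(any(_row_can_move_left(row[::-1]) for row in board))    # right
    legal[2] = int(any(_row_can_move_left(col[::-1]) for col in board.T))  # down
    legal[3] = int(any(_row_can_move_left(row) for row in board))          # left
    return legal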
Example #2
    def train_ddqn(self):
        """
        method trains agent using DDQN
        """
        # Skip training until the replay memory holds a full minibatch.
        if self.memory_type == "basic":
            if len(self.memory) >= self.minibatch_size:
                state, action, reward, next_state, done = self.get_minibatch()
            else:
                return
        else:
            if self.memory.length >= self.minibatch_size:
                # Keep the sampled batch around: it is handed back to the
                # memory (with the TD errors) in update_minibatch below.
                minibatch = self.get_minibatch()
                state, action, reward, next_state, done = minibatch
            else:
                return

        # Absolute TD errors, handed to the replay memory in update_minibatch below.
        errors = np.zeros(self.minibatch_size)
        # Experimental 2048 model variant, kept here but disabled:
        # if self.model_type == "experimental":
        #     state = split_2048(state)
        #     next_state = split_2048(next_state)
        #     q_value = self.model_net.predict(state)
        #     ns_model_pred = self.model_net.predict(next_state)
        #     ns_target_pred = self.target_net.predict(next_state)
        # else:
        possible_actions_curr = []
        if self.args.environment == "2048-v0":
            # Record which moves are legal in each raw board before normalizing.
            for i, item in enumerate(state):
                possible_actions_curr.append(possible_moves(item))

            # Scale raw tile values into roughly [-0.5, 0.5] for the network.
            state = state / 16384.0 - 0.5
            next_state = next_state / 16384.0 - 0.5

        # Online-net Q-values for the current states, plus online- and target-net
        # predictions for the next states (both are needed for the DDQN target).
        q_value = self.model_net.predict(state)
        ns_model_pred = self.model_net.predict(next_state)
        ns_target_pred = self.target_net.predict(next_state)

        for i in range(self.minibatch_size):
            old_q = q_value[i][action[i]]

            if done[i] == 1:
                q_value[i][action[i]] = reward[i]
            else:
                # Double DQN: the online net selects the next action,
                # the target net provides its value.
                best_next = np.argmax(ns_model_pred[i])
                q_value[i][action[i]] = (
                    reward[i] + self.gamma * ns_target_pred[i][best_next])

            errors[i] = abs(old_q - q_value[i][action[i]])

        # Suppress targets for moves that were illegal in the sampled 2048 states.
        for i, item in enumerate(possible_actions_curr):
            for e, elem in enumerate(item):
                if elem == 0:
                    q_value[i][e] = -1

        self.model_net.fit(state, q_value, epochs=1, verbose=0)
        if self.memory_type == "dueling":
            # Feed the TD errors back so the priorities of the sampled
            # transitions are refreshed in the replay memory.
            self.memory.update_minibatch(minibatch, errors)
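
The core of train_ddqn is the Double-DQN target: the online network selects the next action, the target network evaluates it. The standalone sketch below isolates that step with the same array shapes as the predict() outputs above; the function name and the tiny numbers are illustrative only, not part of the project.

import numpy as np


def ddqn_targets(q_value, ns_model_pred, ns_target_pred, action, reward, done, gamma):
    """Overwrite the taken-action entries of q_value with Double-DQN targets."""
    targets = q_value.copy()
    for i in range(len(action)):
        if done[i]:
            targets[i][action[i]] = reward[i]
        else:
            best_next = np.argmax(ns_model_pred[i])   # online net picks the action
            targets[i][action[i]] = reward[i] + gamma * ns_target_pred[i][best_next]
    return targets


q = np.array([[1.0, 2.0], [0.5, 0.0]])
ns_online = np.array([[0.3, 0.9], [0.1, 0.2]])
ns_target = np.array([[0.4, 1.5], [0.6, 0.7]])
targets = ddqn_targets(q, ns_online, ns_target,
                       action=[0, 1], reward=[1.0, -1.0], done=[0, 1], gamma=0.99)
# targets[0][0] == 1.0 + 0.99 * 1.5  (online net picks action 1, target net scores it)
# targets[1][1] == -1.0              (terminal transition: reward only)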