Example #1
    def choose_action(self, state):
        # the network is trained from player 1's perspective,
        # so flip the board when playing as player 2
        if self.player_num == 2:
            state_p1 = self.convert_to_p1_perspective(state)
        else:
            state_p1 = state

        probs, _ = self.brain.predict(state_p1)

        # only keep moves within distance 2 of an existing stone
        valid_actions = alpha_gomoku_common.get_valid_actions(state,
                                                              distance=2)
        prob_mask = np.array([
            1 if alpha_gomoku_common.index_to_pos(i) in valid_actions else 0
            for i in range(BOARD_SIZE)
        ])
        probs *= prob_mask

        # play the highest-probability remaining move
        return alpha_gomoku_common.index_to_pos(np.argmax(probs))
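The masking step above assumes index_to_pos maps a flat index onto a 15x15 board; the probs[112] = 1 entry for the opening move (7, 7) in Example #2 suggests a row-major layout with BOARD_SIZE = 225. A minimal sketch of that assumed mapping and of the masking itself, with a dummy uniform policy standing in for self.brain.predict:

import numpy as np

BOARD_LEN = 15                 # assumed edge length
BOARD_SIZE = BOARD_LEN ** 2    # 225 flat cells

def index_to_pos(i):
    # assumed row-major flattening: 112 -> (7, 7)
    return i // BOARD_LEN, i % BOARD_LEN

# dummy uniform policy in place of the network output
probs = np.full(BOARD_SIZE, 1 / BOARD_SIZE)

# pretend only two cells are near existing stones
valid_actions = {(7, 8), (8, 8)}
prob_mask = np.array([1 if index_to_pos(i) in valid_actions else 0
                      for i in range(BOARD_SIZE)])
probs *= prob_mask

print(index_to_pos(np.argmax(probs)))   # -> (7, 8), the first valid cell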
Example #2
    def choose_action_training(self, state_p1_np):
        # if the board is empty, open in the middle
        if np.count_nonzero(state_p1_np) == 0:
            best_action = (7, 7)
            probs = np.zeros(225)
            probs[112] = 1    # flat index of (7, 7)
            return best_action, probs

        # otherwise pick the move with the highest probability
        probs = self.player.get_probs(state_p1_np)
        best_action = alpha_gomoku_common.index_to_pos(probs.index(max(probs)))
        return best_action, probs
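A quick sanity check of the hard-coded opening move above, under the same row-major flat-index assumption as before (15x15 board, so (7, 7) flattens to 112):

import numpy as np

BOARD_LEN = 15
probs = np.zeros(BOARD_LEN * BOARD_LEN)
probs[7 * BOARD_LEN + 7] = 1
assert divmod(int(np.argmax(probs)), BOARD_LEN) == (7, 7)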
Example #3
    def choose_action(self, state):
        # always evaluate from player 1's perspective
        if self.player_num == 2:
            state = self.convert_to_p1_perspective(state)

        state_np = np.array(state)
        if np.count_nonzero(state_np) == 0:
            # empty board: open in the middle
            best_action = (7, 7)
        else:
            probs = self.player.get_probs(state_np)
            best_action = alpha_gomoku_common.index_to_pos(
                probs.index(max(probs)))

        return best_action
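convert_to_p1_perspective is not included in these snippets; given how it is used, it presumably swaps the two stone values so the agent always sees its own stones as 1. A minimal sketch under that assumption (the behavior is guessed from the call sites, not taken from the project):

import numpy as np

def convert_to_p1_perspective(state):
    # assumed encoding: 0 = empty, 1 = player 1, 2 = player 2;
    # swap ownership of the stones and leave empty cells alone
    board = np.array(state)
    swapped = np.where(board == 1, 2, np.where(board == 2, 1, 0))
    return swapped.tolist()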
Example #4
    def predict(self, board):
        # shortcut: if some move wins on the spot, return a one-hot policy
        # for that move and a value close to 1
        for action in alpha_gomoku_common.get_valid_actions(board):
            board[action[0]][action[1]] = 1
            win = alpha_gomoku_common.detect_win(board, 1)
            board[action[0]][action[1]] = 0
            if win:
                return [
                    0 if alpha_gomoku_common.index_to_pos(i) != action else 1
                    for i in range(BOARD_SIZE)
                ], 0.99

        # otherwise fall back to the network
        # nn_input: 4*15*15, numpy array
        nn_input = self.convert_to_nn_readable(board)
        p, v = self.model.predict(nn_input)
        return p[0], v[0][0]
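detect_win is assumed here to report whether the given player has five stones in a row anywhere on the board. A straightforward sketch of such a check (the real helper in alpha_gomoku_common may be implemented differently):

def detect_win(board, player, board_len=15, win_len=5):
    # scan from every cell in four directions: right, down, down-right, down-left
    directions = [(0, 1), (1, 0), (1, 1), (1, -1)]
    for r in range(board_len):
        for c in range(board_len):
            if board[r][c] != player:
                continue
            for dr, dc in directions:
                count = 0
                rr, cc = r, c
                while (0 <= rr < board_len and 0 <= cc < board_len
                       and board[rr][cc] == player):
                    count += 1
                    if count >= win_len:
                        return True
                    rr += dr
                    cc += dc
    return False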
Example #5
    def choose_action(self, state):
        # the network is trained from player 1's perspective,
        # so flip the board when playing as player 2
        if self.player_num == 2:
            state_p1 = self.convert_to_p1_perspective(state)
        else:
            state_p1 = state

        probs, _ = self.brain.predict(state_p1)

        # mask out cells that are not within distance 2 of an existing stone
        valid_actions = alpha_gomoku_common.get_valid_actions(state, distance=2)
        prob_mask = np.array([
            1 if alpha_gomoku_common.index_to_pos(i) in valid_actions else 0
            for i in range(BOARD_SIZE)
        ])
        probs *= prob_mask

        # the move the network would play on its own
        nn_action = alpha_gomoku_common.index_to_pos(np.argmax(probs))

        if self.forced_actions:
            # position after the network's preferred move, used below to check
            # which opponent threats would still be live
            next_state = copy.deepcopy(state_p1)
            next_state[nn_action[0]][nn_action[1]] = 1
            next_state_valid_actions = alpha_gomoku_common.get_valid_actions(next_state, distance=1)

            # force win: if any move completes five in a row, play it
            for action in valid_actions:
                state_p1[action[0]][action[1]] = 1
                if gomoku_pattern_detection.detect_pattern(state_p1, 'ooooo', 1) >= 1:
                    state_p1[action[0]][action[1]] = 0
                    return action
                state_p1[action[0]][action[1]] = 0

            # force block lose: if the opponent could complete five in a row,
            # take that square
            for action in valid_actions:
                state_p1[action[0]][action[1]] = 2
                win = alpha_gomoku_common.detect_win(state_p1, 2)
                state_p1[action[0]][action[1]] = 0
                if win:
                    return action

            # force create -oooo-
            for action in valid_actions:
                state_p1[action[0]][action[1]] = 1
                if gomoku_pattern_detection.detect_pattern(state_p1, '-oooo-', 1) >= 1:
                    state_p1[action[0]][action[1]] = 0
                    return action
                state_p1[action[0]][action[1]] = 0

            # force block -oooo-
            if gomoku_pattern_detection.detect_pattern(next_state, '-oooo-', 2) >= 1:
                for action in next_state_valid_actions:
                    next_state[action[0]][action[1]] = 1
                    if gomoku_pattern_detection.detect_pattern(next_state, '-oooo-', 2) == 0:
                        next_state[action[0]][action[1]] = 0
                        return action
                    next_state[action[0]][action[1]] = 0

            # force block -ooo--
            if gomoku_pattern_detection.detect_pattern(next_state, '-ooo--', 2) >= 1:
                for action in next_state_valid_actions:
                    next_state[action[0]][action[1]] = 1
                    if gomoku_pattern_detection.detect_pattern(next_state, '-ooo--', 2) == 0:
                        next_state[action[0]][action[1]] = 0
                        return action
                    next_state[action[0]][action[1]] = 0

            # force block double threats: if the opponent playing here would
            # create two or more simultaneous threats, take the square first
            for action in next_state_valid_actions:
                next_state[action[0]][action[1]] = 2
                if (gomoku_pattern_detection.detect_pattern(next_state, '-ooo--', 2) -
                    gomoku_pattern_detection.detect_pattern(next_state, '--ooo--', 2)) + \
                        (gomoku_pattern_detection.detect_pattern(next_state, 'oooo-', 2) -
                         gomoku_pattern_detection.detect_pattern(next_state, '-oooo-', 2)) + \
                        gomoku_pattern_detection.detect_pattern(next_state, 'oo-oo', 2) \
                        >= 2:
                    next_state[action[0]][action[1]] = 0
                    return action
                next_state[action[0]][action[1]] = 0

            # force create double threats: if playing here gives us two or
            # more simultaneous threats, play it
            for action in valid_actions:
                state_p1[action[0]][action[1]] = 1
                if (gomoku_pattern_detection.detect_pattern(state_p1, '-ooo--', 1) -
                    gomoku_pattern_detection.detect_pattern(state_p1, '--ooo--', 1)) + \
                        (gomoku_pattern_detection.detect_pattern(state_p1, 'oooo-', 1) -
                         gomoku_pattern_detection.detect_pattern(state_p1, '-oooo-', 1)) + \
                        gomoku_pattern_detection.detect_pattern(state_p1, 'oo-oo', 1) \
                        >= 2:
                    state_p1[action[0]][action[1]] = 0
                    return action
                state_p1[action[0]][action[1]] = 0

        return nn_action
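The forced-action rules lean on gomoku_pattern_detection.detect_pattern(state, pattern, player), which appears to count occurrences of a pattern string such as '-oooo-' along rows, columns, and both diagonals, with 'o' standing for the given player's stones and '-' for empty cells. A rough sketch of a detector with that interface (the real module may count overlapping matches or handle edges differently):

def detect_pattern(state, pattern, player, board_len=15):
    # map each cell to a character: player's stone -> 'o', empty -> '-', other -> 'x'
    def to_char(v):
        if v == player:
            return 'o'
        if v == 0:
            return '-'
        return 'x'

    lines = []
    # rows and columns
    for i in range(board_len):
        lines.append(''.join(to_char(state[i][j]) for j in range(board_len)))
        lines.append(''.join(to_char(state[j][i]) for j in range(board_len)))
    # diagonals in both directions, long enough to hold the pattern
    for offset in range(-board_len + 1, board_len):
        diag = [state[i][i - offset] for i in range(board_len)
                if 0 <= i - offset < board_len]
        anti = [state[i][board_len - 1 - i + offset] for i in range(board_len)
                if 0 <= board_len - 1 - i + offset < board_len]
        for cells in (diag, anti):
            if len(cells) >= len(pattern):
                lines.append(''.join(to_char(v) for v in cells))

    # count occurrences; scan asymmetric patterns in both directions,
    # palindromic patterns only once to avoid double-counting
    count = 0
    for line in lines:
        count += line.count(pattern)
        if pattern != pattern[::-1]:
            count += line[::-1].count(pattern)
    return count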
Example #6
    def search(self, state_p1_np):
        s_bytes = state_p1_np.tobytes()
        s_list = state_p1_np.tolist()

        # check if reward is assigned to this state yet
        if s_bytes not in self.s_r:
            self.s_r[s_bytes] = alpha_gomoku_common.get_reward(s_list, 1)

        # if is terminal state, return
        if self.s_r[s_bytes] != 0 or np.count_nonzero(
                state_p1_np) == BOARD_SIZE:
            return -self.s_r[s_bytes]

        # check if is leaf node
        # if is leaf node, will have no move probabilities yet
        if s_bytes not in self.s_p:
            probs, value = self.policy_value_obj.predict(s_list)
            valid_actions_1d = alpha_gomoku_common.get_valid_actions_1d(
                s_list, self.valid_actions_distance)
            self.s_valid_moves[s_bytes] = valid_actions_1d

            # zero out the probabilities of invalid moves
            probs = np.array([
                probs[a] if a in valid_actions_1d else 0
                for a in range(BOARD_SIZE)
            ])
            probs_sum = np.sum(probs)

            # if everything was masked out, fall back to a uniform
            # distribution and print a warning
            if probs_sum == 0:
                probs = np.full(probs.size, 1 / probs.size)
                print('warning: all moves masked')

            # re-normalize
            elif probs_sum != 1:
                probs /= probs_sum

            self.s_p[s_bytes] = probs
            self.s_n[s_bytes] = 0

            return -value

        # not a leaf node: select the child with the highest PUCT score (q + u)
        best_node_score = float('-inf')
        best_action = -1

        for a in self.s_valid_moves[s_bytes]:
            if (s_bytes, a) in self.sa_q:
                u = self.c_puct * self.s_p[s_bytes][a] * math.sqrt(
                    self.s_n[s_bytes]) / (1 + self.sa_n[(s_bytes, a)])
                score = self.sa_q[(s_bytes, a)] + u  # q + u
            else:
                u = self.c_puct * self.s_p[s_bytes][a] * math.sqrt(
                    self.s_n[s_bytes] + EPS)
                score = u

            if score > best_node_score:
                best_node_score = score
                best_action = a

        # flip the board to the opponent's perspective and apply the chosen
        # move, which becomes a player-2 stone in the flipped board
        next_s = self.convert_perspective(state_p1_np)
        best_action_2d = alpha_gomoku_common.index_to_pos(best_action)
        next_s[best_action_2d[0]][best_action_2d[1]] = 2

        value = self.search(next_s)

        # back up the value: running average of Q over the visit count
        if (s_bytes, best_action) in self.sa_q:
            sa_n = self.sa_n[(s_bytes, best_action)]
            sa_q = self.sa_q[(s_bytes, best_action)]
            self.sa_q[(s_bytes, best_action)] = (sa_n * sa_q + value) / (sa_n + 1)
            self.sa_n[(s_bytes, best_action)] = sa_n + 1
        else:
            self.sa_q[(s_bytes, best_action)] = value
            self.sa_n[(s_bytes, best_action)] = 1

        self.s_n[s_bytes] += 1
        return -value
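search only fills in the statistics tables (s_n, s_p, sa_n, sa_q); the get_probs used by Examples #2 and #3 presumably runs search repeatedly from the root and normalizes the per-action visit counts into a move distribution, in the usual AlphaZero style. A rough sketch of what such a method could look like on the same class (the method name matches the call sites above, but the simulation count and the details are assumptions):

    def get_probs(self, state_p1_np, num_simulations=400):
        # run repeated simulations from the root position
        for _ in range(num_simulations):
            self.search(np.copy(state_p1_np))

        # normalize the root visit counts into a distribution over all cells
        s_bytes = state_p1_np.tobytes()
        visits = [self.sa_n.get((s_bytes, a), 0) for a in range(BOARD_SIZE)]
        total = sum(visits)
        if total == 0:
            return [1 / BOARD_SIZE] * BOARD_SIZE
        return [n / total for n in visits]

Returning a plain list rather than a numpy array matches the probs.index(max(probs)) lookups in Examples #2 and #3.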