def test_available_moves(self):
    size = 3
    board = clean_board(size)
    self.assertEqual(len(available_moves(board)), 9)
    apply_move_inplace(board, (0, 0), 1)
    self.assertEqual(len(available_moves(board)), 8)
    # np.int was removed from recent NumPy releases; the plain int dtype is equivalent here.
    board = np.ones((size, size), dtype=int)
    self.assertEqual(len(available_moves(board)), 0)
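
# The test above leans on a handful of board helpers defined elsewhere in the
# project. The sketch below is a plausible reconstruction inferred only from the
# call sites in this file; treat the bodies as assumptions, not the originals.
import numpy as np

def clean_board(size):
    # Empty size x size board; 0 marks a free cell.
    return np.zeros((size, size), dtype=int)

def available_moves(board):
    # All (row, col) positions that are still free.
    return [tuple(cell) for cell in np.argwhere(board == 0)]

def apply_move_inplace(board, move, side):
    # Mark the cell for `side` (1 or -1) directly on the given board.
    board[move] = side

def apply_move(board, move, side):
    # Same as above, but on a copy so the caller's board is left untouched.
    new_board = board.copy()
    new_board[move] = side
    return new_board

def hash_board(board):
    # Hashable key for the Q-table; the raw bytes of the array are enough.
    return board.tobytes()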

def add_board(self, board):
    board_hash = hash_board(board)
    if self.q_table.get(board_hash) is None:
        legal_moves = available_moves(board)
        self.q_table[board_hash] = {move: 1.0 for move in legal_moves}
    return board_hash
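
# Minimal usage sketch of the lazy initialisation above, assuming the helper
# functions sketched earlier and a plain dict as the Q-table. Every legal move of
# an unseen position starts at an optimistic 1.0, a common optimistic-initialisation
# trick that biases the greedy policy toward moves it has not tried yet.
q_table = {}
board = clean_board(3)
key = hash_board(board)
if q_table.get(key) is None:
    q_table[key] = {move: 1.0 for move in available_moves(board)}
assert len(q_table[key]) == 9
assert all(value == 1.0 for value in q_table[key].values())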

def get_move(self, board):
    legal_moves = available_moves(board)
    new_boards = np.array([apply_move(board, move, self.side_to_play) for move in legal_moves])
    # Evaluate every candidate position in one batched prediction (each board flattened to 9 values).
    evaluations = self.model.predict(new_boards.reshape(len(legal_moves), 9))
    print(legal_moves)
    print(evaluations)
    return legal_moves[self.min_max_best_move(evaluations)]
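
# Neither self.model nor min_max_best_move appears in this file. From the call
# above, self.model looks like a Keras-style regressor that maps a flattened 3x3
# board to a single evaluation. The body below is only an assumed reading of
# min_max_best_move: pick the index of the evaluation that is best for the side
# this agent plays.
import numpy as np

def min_max_best_move(self, evaluations):
    scores = np.asarray(evaluations).reshape(-1)
    # Assumed convention: side 1 prefers high evaluations, side -1 prefers low ones.
    if self.side_to_play == 1:
        return int(np.argmax(scores))
    return int(np.argmin(scores))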

def learn_q(self, board, move):
    board_hash = self.add_board(board)
    new_board = apply_move(board, move, self.side)
    new_board_hash = self.add_board(new_board)
    reward = self.calculate_reward(new_board)
    # Terminal if the move produced a reward or left no empty cells on the new board.
    if reward != 0 or len(available_moves(new_board)) == 0:
        expected = reward
    else:
        expected_rewards = self.q_table[new_board_hash]
        expected = reward + (0.9 * max(expected_rewards.values()))
    change = 0.3 * (expected - self.q_table[board_hash][move])
    self.q_table[board_hash][move] += change
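
# Worked instance of the update above (numbers chosen purely for illustration).
# learn_q applies the standard tabular rule
#   Q(s, a) += alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))
# with alpha = 0.3 and gamma = 0.9, so for a fresh pair at the optimistic 1.0,
# no immediate reward, and a best successor value of 1.0:
q_old = 1.0
expected = 0 + 0.9 * 1.0            # reward 0, discounted best follow-up value
q_new = q_old + 0.3 * (expected - q_old)
assert abs(q_new - 0.97) < 1e-9     # values drift down until real rewards propagate back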

def get_move(self, board, side):
    return random.choice(available_moves(board))
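
# A minimal self-play loop, sketched only to show how the agents in this file fit
# together. The names RandomAgent, winner and play_game are assumptions for
# illustration, and the loop assumes the board helpers used above (clean_board,
# available_moves, apply_move_inplace) plus numpy imported as np.
import random
import numpy as np

class RandomAgent:
    def get_move(self, board, side):
        return random.choice(available_moves(board))

def winner(board):
    # 1 or -1 if that side owns a full row, column, or diagonal, else 0.
    lines = list(board) + list(board.T) + [board.diagonal(), np.fliplr(board).diagonal()]
    for line in lines:
        if abs(line.sum()) == len(line):
            return int(np.sign(line.sum()))
    return 0

def play_game(agent_one, agent_two, size=3):
    board = clean_board(size)
    agents = {1: agent_one, -1: agent_two}
    side = 1
    while available_moves(board) and winner(board) == 0:
        move = agents[side].get_move(board, side)
        apply_move_inplace(board, move, side)
        side = -side
    return winner(board)  # 1, -1, or 0 for a draw

print(play_game(RandomAgent(), RandomAgent()))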