Example #1
import numpy as np
from copy import copy


def QLearning(forever, width, height):
    # Reward matrix indexed by the shot result: unchecked, miss, hit, sink
    # (an entry for "unchecked" exists because an unchecked cell on the board is stored as 0)
    rewardMatrix = [0, -1, 0, 1]

    # Step size
    alpha = 0.1
    # Epsilon for epsilon-greedy action selection
    epsilon = 0.05
    # Discount factor
    gamma = 0.9

    # Q-table initialization
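    # 8 neighboring cells, each observed as 0 (unchecked), 1 (miss), or 2 (hit), and 8 possible actions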
    q = np.zeros(shape=(3, 3, 3, 3, 3, 3, 3, 3, 8))

    # Generating Agent object to train
    w = width
    h = height
    agent = Agent(w, h)

    # Array to save number of steps to sink a ship every 10 episodes
    time_steps = []
    # Counter to check when 10 episodes have passed to save time_steps out
    episode_count = 0
    # Stores number of steps to sink a ship for every episode
    temp_time_steps = []

    # Loop for each episode in forever
    for i in range(forever):
        episode_count += 1
        # Resetting board each episode
        board = np.zeros((h, w))
        # Pad the board with 1s so a shot at the edge still has a full 3x3 neighborhood
        board = np.pad(board, 1, 'constant', constant_values=(1, 1))
        # Setting ships randomly on board
        agent.ships = setShips(h, w)
        ships = agent.ships
        # Add 1 to ship indexes to account for board padding
        ships = [[[z + 1 for z in y] for y in x] for x in ships]
        # Boolean for terminal state
        win = False
        # Ship counter to figure out when game has been won
        shipCount = 0
        # Randomly choosing first space to fire at
        location = chooseAction(board, h, w)
        # Boolean to decide which policy to follow, random or "hit policy"
        hit = False
        # Counter for steps taken to finish a game; divided by 3 at the end to approximate the steps needed to sink one ship
        time_steps_episode = 0
        # Looping while not at terminal state
        while not win:
            # If hit occurred, move into Q-learning
            if hit:
                # Only counting steps to finish a game when in hit policy
                time_steps_episode += 1
                # Build the current state from the 8 board cells surrounding the location that was just hit
                tempState = []
                for y in range(location[1] - 1, location[1] + 2):
                    for x in range(location[0] - 1, location[0] + 2):
                        # Skip the square that was just hit; only its neighbors form the state
                        if [x, y] != location:
                            tempState.append(int(board[x][y]))
                # Extract the action-value row S from the q-table using the state indices
                S = copy(q[tuple(tempState)])
                # Call bestChoice to select action A from S using epsilon-greedy
                # qA is the index of the chosen action in the q-table row S
                # bA is the corresponding location on the board
                (qA, bA) = bestChoice(S, location, epsilon, board)
                # Take chosen action and update board
                result = takeAction(board, bA, ships)
                # Determine reward to be given based on result
                reward = rewardMatrix[result]

                # Get new state indices for q-table based on action taken
                temp2State = copy(tempState)
                # A sunk result is stored in the q-table state as 2; the value 3 is only used for reward indexing
                if result == 3:
                    temp2State[qA] = 2
                else:
                    temp2State[qA] = result

                # Get S' from q-table using temp2State
                newS = q[tuple(temp2State)]
                # From S', observe the optimal value max_a Q(S', a)
                maxnewS = max(newS)

                # Q update
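                # Q(S,A) <- Q(S,A) + alpha * (reward + gamma * max_a Q(S',a) - Q(S,A))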
                q[tuple(tempState)][qA] = S[qA] + alpha * (reward + gamma * maxnewS - S[qA])

                # If action was miss, do not shift 3x3
                if result == 1:
                    S = newS
                # If action was hit but not sink, recenter S on the new hit
                elif result == 2:
                    location = bA
                # If action was a sink
                elif result == 3:
                    # Sunk ship count increases
                    shipCount = shipCount + 1
                    # Checking if game has been won (always use 3 ships right now)
                    if shipCount == 3:
                        # Terminal state has been reached
                        win = True
                        # Average total steps taken to figure out how long it takes to sink 1 ship
                        temp_time_steps.append(time_steps_episode / 3)
                        # Average step count over 10 episodes for graphs
                        if episode_count == 10:
                            time_steps.append(np.mean(temp_time_steps))
                            # Reset values for next 10 episodes
                            temp_time_steps = []
                            episode_count = 0
                    # Game hasn't ended yet, still ships to sink
                    else:
                        # Return to random policy when ship has been sunk
                        hit = False
                        # Choosing an action to take next time step
                        location = chooseAction(board, h, w)

            # If miss occurred or first iteration, use random policy
            else:
                # Take the randomly chosen action and get the result: miss, hit, or sink
                result = takeAction(board, location, ships)
                # If action was hit but not sink, set hit to true so we move into Q-learning policy
                if result == 2:
                    hit = True
                # If action was a sink
                elif result == 3:
                    # Sunk ship count increases
                    shipCount = shipCount + 1
                    # It took 1 attempt to sink the ship
                    time_steps_episode += 1
                    # Checking if game has been won (always use 3 ships right now)
                    if shipCount == 3:
                        # Terminal state has been reached
                        win = True
                        # Average total steps taken to figure out how long it takes to sink 1 ship
                        temp_time_steps.append(time_steps_episode / 3)
                        # Average step count over 10 episodes for graphs
                        if episode_count == 10:
                            time_steps.append(np.mean(temp_time_steps))
                            # Reset values for next 10 episodes
                            temp_time_steps = []
                            episode_count = 0
                # If action was miss, randomly choose next action
                else:
                    location = chooseAction(board, h, w)

    return board, time_steps
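
The QLearning routine above calls several helpers (Agent, setShips, chooseAction, bestChoice, takeAction) that are not shown in this example. As a rough illustration only, here is a minimal sketch of what the epsilon-greedy bestChoice helper could look like, assuming S holds the 8 Q-values for the neighbors of location enumerated in the same y-outer, x-inner order used to build tempState above; the names and behavior here are assumptions for illustration, not the original implementation.

import numpy as np  # already imported at the top; repeated so this sketch stands alone


def bestChoice(S, location, epsilon, board):
    # Hypothetical sketch of epsilon-greedy action selection; NOT the original helper.
    # `board` is accepted to match the call in QLearning but is unused here
    # (the real helper might, for example, avoid cells that were already fired at).
    # Enumerate the 8 neighbors of `location` in the same y-outer, x-inner order
    # used to build tempState in QLearning above, so indices line up with S.
    neighbors = []
    for y in range(location[1] - 1, location[1] + 2):
        for x in range(location[0] - 1, location[0] + 2):
            if [x, y] != location:
                neighbors.append([x, y])

    # With probability epsilon explore a random neighbor, otherwise exploit the
    # neighbor with the highest estimated action value
    if np.random.rand() < epsilon:
        qA = np.random.randint(len(neighbors))
    else:
        qA = int(np.argmax(S))
    # qA indexes the q-table row S; neighbors[qA] is the board coordinate to fire at
    return qA, neighbors[qA]

A call such as QLearning(1000, 5, 5) (the episode count and board size are placeholder values) would then return the final board and the list of 10-episode average step counts used for plotting.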