import numpy as np
from copy import copy

# Agent, setShips, chooseAction, bestChoice, and takeAction are assumed to be
# defined elsewhere in this repo.


def QLearning(forever, width, height):
    # Reward matrix, with values for unchecked, miss, hit, and sink
    # (the unchecked entry exists because an unchecked square on the board is 0)
    rewardMatrix = [0, -1, 0, 1]
    # Step size
    alpha = 0.1
    # Epsilon for epsilon-greedy action selection
    epsilon = 0.05
    # Discount factor
    gamma = 0.9
    # Q-table initialization: one axis per surrounding square (3 possible
    # values each) plus one axis for the 8 possible actions
    q = np.zeros(shape=(3, 3, 3, 3, 3, 3, 3, 3, 8))
    # Generating Agent object to train
    w = width
    h = height
    agent = Agent(w, h)
    # Array to save the average number of steps to sink a ship every 10 episodes
    time_steps = []
    # Counter to check when 10 episodes have passed so time_steps can be saved out
    episode_count = 0
    # Stores the number of steps to sink a ship for every episode
    temp_time_steps = []
    # Loop over episodes
    for i in range(forever):
        episode_count += 1
        # Resetting board each episode
        board = np.zeros((h, w))
        # Padding board with 1s so shots fired at the edge still have a full 3x3 surrounding
        board = np.pad(board, 1, 'constant', constant_values=(1, 1))
        # Setting ships randomly on board
        agent.ships = setShips(h, w)
        ships = agent.ships
        # Add 1 to ship indexes to account for board padding
        ships = [[[z + 1 for z in y] for y in x] for x in ships]
        # Boolean for terminal state
        win = False
        # Ship counter to figure out when the game has been won
        shipCount = 0
        # Randomly choosing the first space to fire at
        location = chooseAction(board, h, w)
        # Boolean to decide which policy to follow, random or "hit policy"
        hit = False
        # Counter for the number of steps to finish a game; divided by 3 later to
        # approximate the number of steps to sink one ship
        time_steps_episode = 0
        # Looping while not at terminal state
        while not win:
            # If a hit occurred, move into Q-learning
            if hit:
                # Only counting steps to finish a game while in the hit policy
                time_steps_episode += 1
                # Current state: the 8 board values surrounding the last hit
                tempState = []
                for y in range(location[1] - 1, location[1] + 2):
                    for x in range(location[0] - 1, location[0] + 2):
                        # Ignore the square we just hit; only its neighbours matter
                        if [x, y] != location:
                            tempState.append(int(board[x][y]))
                # Extracting the action-value array S for this state from the q-table
                S = q[tuple(tempState)].copy()
                # Call to bestChoice to choose A from S using epsilon-greedy;
                # qA is the action index used to update the q-table,
                # bA is the same action expressed as a board coordinate
                (qA, bA) = bestChoice(S, location, epsilon, board)
                # Take the chosen action and update the board
                result = takeAction(board, bA, ships)
                # Determine the reward based on the result
                reward = rewardMatrix[result]
                # Build the new state indices for the q-table based on the action taken
                temp2State = copy(tempState)
                # If the result is a sink, the value put in the q-table is 2;
                # 3 was only for reward indexing
                if result == 3:
                    temp2State[qA] = 2
                else:
                    temp2State[qA] = result
                # Get S' from the q-table using temp2State
                newS = q[tuple(temp2State)]
                # From S', observe the optimal value max_a Q(S', a)
                maxnewS = max(newS)
                # Q-learning update: Q(S,A) <- Q(S,A) + alpha * (R + gamma * max_a Q(S',a) - Q(S,A))
                q[tuple(tempState)][qA] = S[qA] + alpha * (reward + gamma * maxnewS - S[qA])
                # If the action was a miss, do not shift the 3x3 window
                if result == 1:
                    S = newS
                # If the action was a hit but not a sink, recenter on the new hit
                elif result == 2:
                    location = bA
                # If the action was a sink
                elif result == 3:
                    # Sunk-ship count increases
                    shipCount = shipCount + 1
                    # Checking if the game has been won (always 3 ships right now)
                    if shipCount == 3:
                        # Terminal state has been reached
                        win = True
                        # Average total steps taken to estimate how long it takes to sink one ship
                        temp_time_steps.append(time_steps_episode / 3)
                        # Average the step count over 10 episodes for graphs
                        if episode_count == 10:
                            time_steps.append(np.mean(temp_time_steps))
                            # Reset values for the next 10 episodes
                            temp_time_steps = []
                            episode_count = 0
                    # Game hasn't ended yet, still ships to sink
                    else:
                        # Return to the random policy once a ship has been sunk
                        hit = False
                        # Choosing an action to take next time step
                        location = chooseAction(board, h, w)
            # If a miss occurred (or on the first iteration), use the random policy
            else:
                # Take the randomly chosen action and get the result as miss, hit, or sink
                result = takeAction(board, location, ships)
                # If the action was a hit but not a sink, move into the Q-learning policy
                if result == 2:
                    hit = True
                # If the action was a sink
                elif result == 3:
                    # Sunk-ship count increases
                    shipCount = shipCount + 1
                    # It took 1 attempt to sink the ship
                    time_steps_episode += 1
                    # Checking if the game has been won (always 3 ships right now)
                    if shipCount == 3:
                        # Terminal state has been reached
                        win = True
                        # Average total steps taken to estimate how long it takes to sink one ship
                        temp_time_steps.append(time_steps_episode / 3)
                        # Average the step count over 10 episodes for graphs
                        if episode_count == 10:
                            time_steps.append(np.mean(temp_time_steps))
                            # Reset values for the next 10 episodes
                            temp_time_steps = []
                            episode_count = 0
                    else:
                        # Game continues: choose a fresh random square so we do not
                        # fire again at the ship we just sank
                        location = chooseAction(board, h, w)
                # If the action was a miss, randomly choose the next action
                else:
                    location = chooseAction(board, h, w)
    return board, time_steps
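

# For reference, a minimal sketch of the epsilon-greedy selection that bestChoice
# is expected to perform, given how it is called above. The repo's real bestChoice
# is defined elsewhere and may differ (e.g. it may also skip neighbours that were
# already fired at); the name bestChoiceSketch is hypothetical. qA is the index
# into the 8-element action-value array S, and bA is the matching [x, y] board
# coordinate.
import random


def bestChoiceSketch(S, location, epsilon, board):
    # board is unused in this simplified sketch
    # Enumerate the 8 neighbours in the same order used to build tempState
    neighbours = [[x, y]
                  for y in range(location[1] - 1, location[1] + 2)
                  for x in range(location[0] - 1, location[0] + 2)
                  if [x, y] != location]
    if random.random() < epsilon:
        # Explore: pick a random neighbour
        qA = random.randrange(len(neighbours))
    else:
        # Exploit: pick the neighbour with the highest action value
        qA = int(np.argmax(S))
    return qA, neighbours[qA]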
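

# A minimal usage sketch, not part of the original function: the board size (5x5)
# and episode count (10000) are illustrative assumptions, and plotting assumes
# matplotlib is available.
if __name__ == '__main__':
    import matplotlib.pyplot as plt

    final_board, steps_per_ship = QLearning(10000, 5, 5)
    # Each entry of steps_per_ship is a 10-episode average of steps needed to sink one ship
    plt.plot(steps_per_ship)
    plt.xlabel('Training (blocks of 10 episodes)')
    plt.ylabel('Average steps to sink a ship')
    plt.show()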