import math
import random

import tictactoe


def SARSA(q, s, a, newState, epsilon, alpha, gamma):
    reward = tictactoe.observeReward(q.player, newState)
    if newState.terminal():
        # A terminal state has no successor action, so its value contributes
        # nothing to the target; the target reduces to the immediate reward.
        newQ = 0.0
    else:
        # On-policy: bootstrap from the action the epsilon-greedy policy
        # would actually take in the new state.
        newAction = tictactoe.chooseAction(q, newState, epsilon)
        newQ = q.Q(newState, newAction)
    newScore = q.Q(s, a) + alpha*(reward + gamma*newQ - q.Q(s, a))
    q.update(s, a, newScore)
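
# A minimal sketch of how the SARSA update above might be driven, assuming the
# same tictactoe API used throughout this section (State, chooseAction,
# takeAction, ActionValueFunc). The learner waits for the opponent's reply
# before updating, mirroring rewardPerEpisode below. The name
# trainSARSAEpisode and the fixed-opponent setup are assumptions, not part of
# the original code.
def trainSARSAEpisode(q, opponent, epsilon, alpha, gamma):
    s = tictactoe.State()
    while not s.terminal():
        a = tictactoe.chooseAction(q, s, epsilon)
        s1 = tictactoe.takeAction(q.player, s, a)
        if not s1.terminal():
            # Opponent replies; the SARSA update sees the state the player
            # will actually face on her next turn.
            b = tictactoe.chooseAction(opponent, s1, epsilon)
            s1 = tictactoe.takeAction(opponent.player, s1, b)
        SARSA(q, s, a, s1, epsilon, alpha, gamma)
        s = s1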

def rewardPerEpisode(q, gamma):
    if q.player == tictactoe.PlayerCircle:
        opponent = tictactoe.ActionValueFunc(tictactoe.PlayerCross)
    else:
        opponent = tictactoe.ActionValueFunc(tictactoe.PlayerCircle)
    rpe = 0.0  # discounted reward accumulated over the episode
    t = 0      # time step
    s = tictactoe.State()
    # Randomly determine whether the player or her opponent moves first.
    if random.random() < 0.5:
        a = tictactoe.chooseAction(opponent, s, 0)
        s = tictactoe.takeAction(opponent.player, s, a)
        t += 1
    while True:
        # The player makes a move and defers observing the reward until her
        # opponent has made his move. Only when her move ends the game does
        # she observe the reward immediately before exiting.
        a = tictactoe.chooseAction(q, s, 0)
        s1 = tictactoe.takeAction(q.player, s, a)
        t += 1
        if s1.terminal():
            reward = tictactoe.observeReward(q.player, s1)
            rpe += math.pow(gamma, t) * reward
            break
        # The opponent makes a move, and the player observes the resulting
        # state to calculate her reward.
        opponentAction = tictactoe.chooseAction(opponent, s1, 0)
        s2 = tictactoe.takeAction(opponent.player, s1, opponentAction)
        t += 1
        reward = tictactoe.observeReward(q.player, s2)
        rpe += math.pow(gamma, t) * reward
        s = s2
        if s.terminal():
            break
    return rpe
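
# A small usage sketch: rewardPerEpisode is a greedy (epsilon = 0) evaluation
# rollout, so averaging it over many episodes gives a noisy estimate of how
# well q currently plays. The helper name averageRewardPerEpisode and the
# default episode count are assumptions for illustration.
def averageRewardPerEpisode(q, gamma, episodes=100):
    total = 0.0
    for _ in range(episodes):
        total += rewardPerEpisode(q, gamma)
    return total / episodes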

def QLearning(q, s, a, newState, alpha, gamma):
    reward = tictactoe.observeReward(q.player, newState)
    # Off-policy: bootstrap from the greedy value of the new state, regardless
    # of which action the behavior policy will actually take next.
    newScore = q.Q(s, a) + alpha*(reward + gamma*q.best(newState) - q.Q(s, a))
    q.update(s, a, newScore)
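
# The same kind of episode driver sketched after SARSA can drive Q-learning by
# swapping in the update above; only the bootstrapped target differs (the
# greedy q.best value instead of the action actually chosen next). The name
# trainQLearningEpisode is hypothetical, not part of the original code.
def trainQLearningEpisode(q, opponent, epsilon, alpha, gamma):
    s = tictactoe.State()
    while not s.terminal():
        a = tictactoe.chooseAction(q, s, epsilon)
        s1 = tictactoe.takeAction(q.player, s, a)
        if not s1.terminal():
            b = tictactoe.chooseAction(opponent, s1, epsilon)
            s1 = tictactoe.takeAction(opponent.player, s1, b)
        QLearning(q, s, a, s1, alpha, gamma)
        s = s1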