Example #1
import tictactoe

def SARSA(q, s, a, newState, epsilon, alpha, gamma):
  # Reward for the transition into newState, seen from the learning player.
  reward = tictactoe.observeReward(q.player, newState)

  if newState.terminal():
    # Terminal state: no bootstrap, the target is just the reward.
    newQ = reward
  else:
    # On-policy bootstrap: sample the next action epsilon-greedily and use its value.
    newAction = tictactoe.chooseAction(q, newState, epsilon)
    newQ = q.Q(newState, newAction)

  # SARSA update: Q(s, a) <- Q(s, a) + alpha * (reward + gamma * Q(s', a') - Q(s, a)).
  newScore = q.Q(s, a) + alpha*(reward + gamma*newQ - q.Q(s, a))
  q.update(s, a, newScore)
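
The update above handles a single transition; a surrounding episode loop has to supply (s, a, newState) on each step. The sketch below is only an assumption about how the helpers used in these examples (State, chooseAction, takeAction, terminal) fit together for self-play training; the runSarsaEpisode name and the opponentQ parameter are hypothetical and not part of the tictactoe module.

def runSarsaEpisode(q, opponentQ, epsilon, alpha, gamma):
  s = tictactoe.State()
  while not s.terminal():
    # Player chooses an epsilon-greedy action and applies it.
    a = tictactoe.chooseAction(q, s, epsilon)
    s1 = tictactoe.takeAction(q.player, s, a)

    # The opponent replies unless the player's move already ended the game.
    if not s1.terminal():
      oppAction = tictactoe.chooseAction(opponentQ, s1, epsilon)
      s1 = tictactoe.takeAction(opponentQ.player, s1, oppAction)

    # Update Q(s, a) from the state the player will act from next (or the terminal state).
    SARSA(q, s, a, s1, epsilon, alpha, gamma)
    s = s1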
Example #2
import math
import random
import tictactoe

# Plays one episode with epsilon = 0 (greedy action selection) against an
# opponent controlling the other mark, and returns the player's discounted
# return for that episode.
def rewardPerEpisode(q, gamma):
  if q.player == tictactoe.PlayerCircle:
    opponent = tictactoe.ActionValueFunc(tictactoe.PlayerCross)
  else:
    opponent = tictactoe.ActionValueFunc(tictactoe.PlayerCircle)

  rpe = 0.0 # reward per episode
  t = 0 # time step
  s = tictactoe.State()

  # Randomly determine whether the player or her opponent should move first.
  if random.random() < 0.5:
    a = tictactoe.chooseAction(opponent, s, 0)
    s = tictactoe.takeAction(opponent.player, s, a)
    t += 1

  while True:
    # The player makes a move and defers observing the reward until the opponent has replied.
    # Only when the player's move ends the game is the reward observed immediately, before exiting.
    a = tictactoe.chooseAction(q, s, 0)
    s1 = tictactoe.takeAction(q.player, s, a)
    t += 1
    if s1.terminal():
      reward = tictactoe.observeReward(q.player, s1)
      rpe += math.pow(gamma, t) * reward
      break

    # The opponent makes a move, and the player observes the resulting state to compute her reward.
    opponentAction = tictactoe.chooseAction(opponent, s1, 0)
    s2 = tictactoe.takeAction(opponent.player, s1, opponentAction)
    t += 1
    reward = tictactoe.observeReward(q.player, s2)
    rpe += math.pow(gamma, t) * reward

    s = s2
    if s.terminal():
      break

  return rpe
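
In practice this function would typically be called many times and averaged to monitor learning progress. A minimal sketch, assuming nothing beyond rewardPerEpisode itself (the helper name and the episodes parameter are made up for illustration):

def averageRewardPerEpisode(q, gamma, episodes=100):
  # Average the discounted per-episode return over a number of evaluation episodes.
  return sum(rewardPerEpisode(q, gamma) for _ in range(episodes)) / episodes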
Example #3
import tictactoe

def QLearning(q, s, a, newState, alpha, gamma):
  reward = tictactoe.observeReward(q.player, newState)

  # Off-policy Q-learning update: bootstrap from the greedy value q.best(newState)
  # (assumed to be 0 when newState is terminal, mirroring the terminal branch in SARSA above).
  newScore = q.Q(s, a) + alpha*(reward + gamma*q.best(newState) - q.Q(s, a))
  q.update(s, a, newScore)
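
QLearning differs from the SARSA update in Example #1 only in its bootstrap target: it takes the greedy value q.best(newState) (off-policy) instead of the value of the epsilon-greedy action sampled by chooseAction (on-policy). The same episode-loop sketch shown after Example #1 could drive it, with the SARSA call replaced by QLearning(q, s, a, s1, alpha, gamma).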