def __init__(self, iterations):
    """Build the blackjack MDP and run value iteration over its states.

    Each sweep is a batch update: all Q-values of a sweep are computed
    from the previous sweep's ``self.values``, which is replaced only
    once the sweep is complete.

    Args:
        iterations: Number of full sweeps over ``self.mdp.getStates()``.
    """
    self.mdp = BlackjackMarkovDecisionProcess()
    self.values = {}
    self.discount = 0.9
    states = self.mdp.getStates()
    for _ in range(iterations):
        nextValues = {}
        for state in states:
            if self.mdp.isTerminal(state):
                nextValues[state] = self.values.get(state, 0)
                continue
            qValues = [self.getQValue(state, action)
                       for action in self.mdp.getPossibleActions(state)]
            if qValues:
                nextValues[state] = max(qValues)
            else:
                # max() raises ValueError on an empty sequence; a state
                # with no available action keeps its value, exactly like
                # a terminal state.
                nextValues[state] = self.values.get(state, 0)
        self.values = nextValues
class ValueIterationAgent(Agent):
    """Agent that acts greedily on state values found by value iteration.

    The values are computed once, in the constructor, over the states of a
    ``BlackjackMarkovDecisionProcess`` with a discount of 0.9.
    """

    def __init__(self, iterations):
        """Build the blackjack MDP and run value iteration over its states.

        Each sweep is a batch update: all Q-values of a sweep are computed
        from the previous sweep's ``self.values``, which is replaced only
        once the sweep is complete.

        Args:
            iterations: Number of full sweeps over ``self.mdp.getStates()``.
        """
        self.mdp = BlackjackMarkovDecisionProcess()
        self.values = {}
        self.discount = 0.9
        states = self.mdp.getStates()
        for _ in range(iterations):
            nextValues = {}
            for state in states:
                if self.mdp.isTerminal(state):
                    nextValues[state] = self.values.get(state, 0)
                    continue
                qValues = [self.getQValue(state, action)
                           for action in self.mdp.getPossibleActions(state)]
                if qValues:
                    nextValues[state] = max(qValues)
                else:
                    # max() raises ValueError on an empty sequence; a state
                    # with no available action keeps its value, exactly
                    # like a terminal state.
                    nextValues[state] = self.values.get(state, 0)
            self.values = nextValues

    def getNextAction(self, gameState, hand):
        """Return the action chosen for ``hand`` in ``gameState``.

        The MDP state tuple is built from: a constant ``False`` (presumably
        a "hand finished" flag -- confirm against the MDP definition),
        whether the hand holds exactly two cards, the hand's double-down
        flag, its has-ace flag, its hard count, and the soft count of the
        dealer's up card.
        """
        state = (False,
                 len(hand.getCards()) == 2,
                 hand.isDoubleDown(),
                 hand.getHasAce(),
                 hand.getHardCount(),
                 gameState.getDealerUpCard().getSoftCount())
        return self.getActionForState(state)

    def getActionForState(self, state):
        """Return an action with the highest Q-value in ``state``.

        Ties are broken with ``random.choice``.

        Returns:
            One of the best actions, or ``None`` when the MDP offers no
            action for ``state``.
        """
        actions = self.mdp.getPossibleActions(state)
        if not actions:
            return None
        maxValue, maxActions = None, []
        for action in actions:
            value = self.getQValue(state, action)
            # Test for None explicitly: ordering a number against None only
            # works by accident on Python 2 and is a TypeError on Python 3.
            if maxValue is None or value > maxValue:
                maxValue, maxActions = value, [action]
            elif value == maxValue:
                maxActions.append(action)
        return random.choice(maxActions)

    def getQValue(self, state, action):
        """Return the Q-value of the state-action pair.

        This is the probability-weighted sum, over every transition the MDP
        reports for ``(state, action)``, of the transition reward plus the
        discounted current value of the resulting state (0 when that state
        has no recorded value).
        """
        statesAndProbs = self.mdp.getTransitionStatesAndProbs(state, action)
        qValue = 0
        for transitionState, prob in statesAndProbs:
            reward = self.mdp.getReward(state, action, transitionState)
            futureValue = self.values.get(transitionState, 0)
            qValue += prob * (reward + self.discount * futureValue)
        return qValue

    def printPolicies(self):
        """Print the policy for two-card, not-doubled hands as two tables.

        The first table (header ``sc``) is for hands without an ace, the
        second (header ``aces``) for hands with one. Columns are dealer soft
        counts 2..11 and rows are hard counts 2..21, tab separated. Each
        cell is the first element of the chosen action, or ``-`` when no
        action is available for that state.
        """
        for hasAce in [False, True]:
            labelString = "aces" if hasAce else "sc"
            for dealerSoftCount in range(2, 12):
                labelString = "{0}\t{1}".format(labelString, dealerSoftCount)
            # print(x) with one argument behaves the same on Python 2 and 3.
            print(labelString)
            for hardCount in range(2, 22):
                formatString = "{0}".format(hardCount)
                for dealerSoftCount in range(2, 12):
                    state = (False, True, False, hasAce, hardCount,
                             dealerSoftCount)
                    action = self.getActionForState(state)
                    # getActionForState returns None when there is nothing
                    # to choose from; indexing None would raise TypeError.
                    cell = "-" if action is None else action[0]
                    formatString += "\t{0}".format(cell)
                print(formatString)

    def __str__(self):
        """Return the display name of this agent."""
        return "Value iteration agent"