import copy
from collections import Counter

import cardUtils
import util
from simulate import simulate
from featureExtractor import tdDominionFeatureExtractor, qDominionFeatureExtractor
from tdlearning import TDLearningAlgorithm
from qLearning import QLearningAlgorithm


class ValueIteration(util.MDPAlgorithm):
    # Implement value iteration. First, compute V_opt using the methods
    # discussed in class. Once you have computed V_opt, compute the optimal
    # policy pi. Note that ValueIteration is an instance of util.MDPAlgorithm,
    # which means you will need to set pi and V (see util.py).
    def solve(self, mdp, epsilon=0.001):
        mdp.computeStates()
        # BEGIN_YOUR_CODE (around 15 lines of code expected)
        def computeActionValue(state, action, vPrevious):
            total = 0.0
            for newState, prob, reward in mdp.succAndProbReward(state, action):
                total += prob * (reward + mdp.discount() * vPrevious[newState])
            return total
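        # computeActionValue is the Bellman backup:
        # Q(s, a) = sum over s' of P(s' | s, a) * (R(s, a, s') + gamma * V(s')).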
        def maxDiff(v, vPrevious):
            return max(abs(v[state] - vPrevious[state]) for state in mdp.states)

        # TODO: make this not use dicts (non-hashable states)
        pi = {}
        v = {state: 0.0 for state in mdp.states}
        while True:
            vPrevious = copy.copy(v)
            for state in mdp.states:
                if not mdp.actions(state):
                    pi[state] = None
                else:
                    actionValues = {action: computeActionValue(state, action, vPrevious)
                                    for action in mdp.actions(state)}
                    optAction, optValue = max(actionValues.items(), key=lambda pair: pair[1])
                    v[state] = optValue
                    pi[state] = optAction
                    # A state with no successors is terminal, so no action applies.
                    if not mdp.succAndProbReward(state, optAction):
                        pi[state] = None
            if maxDiff(v, vPrevious) <= epsilon:
                break
        self.V = v
        self.pi = pi
        # END_YOUR_CODE
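        # Note: discount() is 1 for DominionMDP below, so the epsilon test
        # converging relies on the MDP being finite-horizon (the turn counter
        # ends every episode after maxTurns turns).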


class DominionMDP:
    def __init__(self):
        # BEGIN_YOUR_CODE (around 5 lines of code expected)
        # kingdom: card IDs available in the supply to buy.
        self.kingdom = [0, 1, 2, 3, 4, 5]  # TODO: save/read this better
        self.handSize = 5
        self.maxTurns = 4
        # END_YOUR_CODE
    def startState(self):
        # BEGIN_YOUR_CODE (around 5 lines of code expected)
        # Starting deck: card ID -> number of copies.
        deck = util.Hashabledict(Counter({0: 7, 3: 3}))  # TODO: read from human-readable input / sparse vector?
        hand = (0, 0, 0, 0, 0)  # TODO: randomize?
        drawPile = copy.copy(deck)
        for cardID in hand:
            drawPile[cardID] -= 1
        discardPile = util.Hashabledict(Counter())
        phase = "buy"
        turn = 0
        return (deck, hand, drawPile, discardPile, phase, turn)
        # END_YOUR_CODE
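
    # State layout: (deck, hand, drawPile, discardPile, phase, turn). deck,
    # drawPile, and discardPile map card ID -> count; hand is a tuple of card
    # IDs; phase is always "buy" in this simplified game; turn counts
    # completed turns.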
    # Return the list of actions possible from |state|. Each action is a
    # ("buy", cardID) tuple, where cardID == -1 means "buy nothing".
    def actions(self, state):
        def computeHandValue(hand):
            # Total coin value of the treasure cards in the hand.
            handValue = 0
            for cardID in hand:
                card = cardUtils.getCardFromID(cardID)
                if card.cardType == "treasure":
                    handValue += card.treasureValue
            return handValue
        # BEGIN_YOUR_CODE (around 5 lines of code expected)
        deck, hand, drawPile, discardPile, phase, turn = state
        if turn == self.maxTurns:
            return []
        results = [("buy", -1)]  # Buying nothing is always allowed.
        if phase == "buy":
            handValue = computeHandValue(hand)
            for cardID in self.kingdom:
                buyCard = cardUtils.getCardFromID(cardID)
                if buyCard.cardCost <= handValue:
                    results.append(("buy", buyCard.cardID))
        return results
        # END_YOUR_CODE
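
    # For example, from the start state the hand is five copies of card 0;
    # assuming card 0 is a treasure worth 1 (Copper in standard Dominion; the
    # real mapping lives in cards.txt), the hand value is 5 and actions()
    # returns ("buy", -1) plus ("buy", cardID) for every kingdom card whose
    # cost is at most 5.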
    # Return a list of (newState, prob, reward) tuples corresponding to edges
    # coming out of |state|.
    def succAndProbReward(self, state, action):
        # BEGIN_YOUR_CODE (around 5 lines of code expected)
        # Do we need to have handValue in the state...?
        deck, hand, drawPile, discardPile, phase, turn = state
        actionType, buyCardID = action
        nextDeck = util.Hashabledict(Counter(deck))
        nextDiscardPile = util.Hashabledict(Counter(discardPile))
        if actionType == "buy":
            if buyCardID != -1:
                # Buy the card; it enters the deck via the discard pile.
                nextDeck[buyCardID] += 1
                nextDiscardPile[buyCardID] += 1
            # Discard your hand.
            for cardID in hand:
                nextDiscardPile[cardID] += 1
        # Compute the possible new hands and their probabilities. Work on a
        # copy of the draw pile so the pile inside |state| is never mutated.
        newHand = []
        handSize = self.handSize
        drawPile = util.Hashabledict(Counter(drawPile))
        if sum(drawPile.values()) < self.handSize:
            # Reshuffle: draw everything that is left, then the discard pile
            # becomes the new draw pile.
            for cardID in list(drawPile.keys()):
                for i in range(drawPile[cardID]):
                    newHand.append(cardID)
                    drawPile[cardID] -= 1
                    handSize -= 1
            drawPile = nextDiscardPile
            nextDiscardPile = util.Hashabledict(Counter())
        def computeHandProbability(hand, drawPile, numHands):
            # Multivariate hypergeometric probability of drawing this exact
            # multiset of cards. When this is called, |drawPile| has already
            # had the hand removed, so drawPile[cardID] + handCounter[cardID]
            # is the original count of that card.
            prob = 1.0
            handCounter = util.Hashabledict(Counter(hand))
            for cardID in handCounter.keys():
                prob *= util.nchoosek(drawPile[cardID] + handCounter[cardID], handCounter[cardID])
            prob /= numHands
            return prob
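        # Worked example: drawing 2 cards from a pile with 3 copies of card A
        # and 1 copy of card B gives
        # P(hand = {A, A}) = C(3,2) * C(1,0) / C(4,2) = 3 / 6 = 0.5.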
        def computeHandsAndProbs(hand, numHands, results, drawPile, drawIndex, handSize):
            # Enumerate each distinct hand (as a multiset) by choosing, card
            # type by card type, whether to take another copy or move on.
            if drawIndex >= len(self.kingdom):
                return
            if len(hand) == handSize:
                prob = computeHandProbability(hand, drawPile, numHands)
                results.append((list(hand), prob))
                return
            if drawPile[drawIndex] > 0:
                # Take one copy of this card type, recurse, then undo.
                hand.append(drawIndex)
                drawPile[drawIndex] -= 1
                computeHandsAndProbs(hand, numHands, results, drawPile, drawIndex, handSize)
                drawPile[drawIndex] += 1
                hand.pop()
            # Skip ahead to the next card type.
            computeHandsAndProbs(hand, numHands, results, drawPile, drawIndex + 1, handSize)

        numHands = util.nchoosek(sum(drawPile.values()), handSize)
        handsAndProbs = []
        computeHandsAndProbs([], numHands, handsAndProbs, drawPile, 0, handSize)
        reward = 0
        if turn == self.maxTurns - 1:
            # Last turn: the reward is the deck's victory-point total.
            reward = nextDeck[3] + 3 * nextDeck[4] + 6 * nextDeck[5]  # TODO: extract into a sumVictoryPoints method.
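        # (The weights 1, 3, and 6 match standard Dominion's Estate, Duchy,
        # and Province; the actual ID -> card mapping comes from cards.txt, so
        # the card names here are an assumption.)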
        # Build the (newState, prob, reward) edge list.
        results = []
        for handAddition, prob in handsAndProbs:
            nextHand = newHand + handAddition
            nextDrawPile = util.Hashabledict(Counter(drawPile))
            for cardID in handAddition:
                nextDrawPile[cardID] -= 1
            newState = (nextDeck, tuple(nextHand), nextDrawPile, nextDiscardPile, phase, turn + 1)
            results.append((newState, prob, reward))
        return results
        # END_YOUR_CODE
    def discount(self):
        # BEGIN_YOUR_CODE (around 5 lines of code expected)
        return 1
        # END_YOUR_CODE
    def computeStates(self):
        # Exhaustively enumerate all reachable states. Track visited states so
        # each state is expanded (and listed) only once.
        start = self.startState()
        self.states = [start]
        seen = {start}
        queue = [start]
        while len(queue) > 0:
            state = queue.pop()
            for action in self.actions(state):
                for newState, prob, reward in self.succAndProbReward(state, action):
                    if newState not in seen:
                        seen.add(newState)
                        self.states.append(newState)
                        queue.append(newState)


cardUtils.initializeCardList("cards.txt")


def printActions(actions):
    for action in actions:
        actionType, cardID = action
        print(actionType, cardUtils.getCardFromID(cardID).cardName, ",", end=" ")
    print()


dominion = DominionMDP()

# Value-iteration demo (commented out):
# vi = ValueIteration()
# vi.solve(dominion)
# for state, action in vi.pi.items():
#     if action is not None and action[0] == 'buy':
#         buy, buyCardID = action
#         cardName = "None" if buyCardID == -1 else cardUtils.getCardFromID(buyCardID).cardName
#         print("State:", state, "Action:", buy, cardName)
# print("Policy:", vi.pi)
# actions = dominion.actions(dominion.startState())
# printActions(dominion.actions(dominion.startState()))
# dominion.succAndProbReward(dominion.startState(), actions[0])
# dominion.succAndProbReward(dominion.startState(), actions[1])
tdFeatureExtractor = tdDominionFeatureExtractor
qFeatureExtractor = qDominionFeatureExtractor
discount = 1
td = TDLearningAlgorithm(dominion, discount, tdFeatureExtractor)
qLearn = QLearningAlgorithm(dominion.actions, discount, qFeatureExtractor)
rewards = simulate(dominion, td, 100)
print("Average reward:", sum(rewards) / len(rewards))